xref: /netbsd-src/external/gpl3/gcc/dist/gcc/config/aarch64/aarch64.cc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2022 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 
85 /* This file should be included last.  */
86 #include "target-def.h"
87 
88 /* Defined for convenience.  */
89 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
90 
91 /* Information about a legitimate vector immediate operand.  */
92 struct simd_immediate_info
93 {
94   enum insn_type { MOV, MVN, INDEX, PTRUE };
95   enum modifier_type { LSL, MSL };
96 
97   simd_immediate_info () {}
98   simd_immediate_info (scalar_float_mode, rtx);
99   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
100 		       insn_type = MOV, modifier_type = LSL,
101 		       unsigned int = 0);
102   simd_immediate_info (scalar_mode, rtx, rtx);
103   simd_immediate_info (scalar_int_mode, aarch64_svpattern);
104 
105   /* The mode of the elements.  */
106   scalar_mode elt_mode;
107 
108   /* The instruction to use to move the immediate into a vector.  */
109   insn_type insn;
110 
111   union
112   {
113     /* For MOV and MVN.  */
114     struct
115     {
116       /* The value of each element.  */
117       rtx value;
118 
119       /* The kind of shift modifier to use, and the number of bits to shift.
120 	 This is (LSL, 0) if no shift is needed.  */
121       modifier_type modifier;
122       unsigned int shift;
123     } mov;
124 
125     /* For INDEX.  */
126     struct
127     {
128       /* The value of the first element and the step to be added for each
129 	 subsequent element.  */
130       rtx base, step;
131     } index;
132 
133     /* For PTRUE.  */
134     aarch64_svpattern pattern;
135   } u;
136 };
137 
138 /* Construct a floating-point immediate in which each element has mode
139    ELT_MODE_IN and value VALUE_IN.  */
140 inline simd_immediate_info
141 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
142   : elt_mode (elt_mode_in), insn (MOV)
143 {
144   u.mov.value = value_in;
145   u.mov.modifier = LSL;
146   u.mov.shift = 0;
147 }
148 
149 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
150    and value VALUE_IN.  The other parameters are as for the structure
151    fields.  */
152 inline simd_immediate_info
153 ::simd_immediate_info (scalar_int_mode elt_mode_in,
154 		       unsigned HOST_WIDE_INT value_in,
155 		       insn_type insn_in, modifier_type modifier_in,
156 		       unsigned int shift_in)
157   : elt_mode (elt_mode_in), insn (insn_in)
158 {
159   u.mov.value = gen_int_mode (value_in, elt_mode_in);
160   u.mov.modifier = modifier_in;
161   u.mov.shift = shift_in;
162 }
163 
164 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
165    and where element I is equal to BASE_IN + I * STEP_IN.  */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
168   : elt_mode (elt_mode_in), insn (INDEX)
169 {
170   u.index.base = base_in;
171   u.index.step = step_in;
172 }
173 
174 /* Construct a predicate that controls elements of mode ELT_MODE_IN
175    and has PTRUE pattern PATTERN_IN.  */
176 inline simd_immediate_info
177 ::simd_immediate_info (scalar_int_mode elt_mode_in,
178 		       aarch64_svpattern pattern_in)
179   : elt_mode (elt_mode_in), insn (PTRUE)
180 {
181   u.pattern = pattern_in;
182 }
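/* Illustrative usage sketch (an assumption for exposition, not a quote of
   the callers below): the immediate-validation code builds one of these
   descriptions once it has recognised a constant.  For example, an SVE
   INDEX-style constant whose elements are 1, 3, 5, ... could be described
   with

     simd_immediate_info info (DImode, GEN_INT (1), GEN_INT (2));

   while a repeated integer element that can be moved with a (possibly
   shifted) MOV/MVN-style instruction would use the
   (mode, value, insn, modifier, shift) constructor instead.  */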
183 
184 namespace {
185 
186 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64.  */
187 class pure_scalable_type_info
188 {
189 public:
190   /* Represents the result of analyzing a type.  All values are nonzero,
191      in the possibly forlorn hope that accidental conversions to bool
192      trigger a warning.  */
193   enum analysis_result
194   {
195     /* The type does not have an ABI identity; i.e. it doesn't contain
196        at least one object whose type is a Fundamental Data Type.  */
197     NO_ABI_IDENTITY = 1,
198 
199     /* The type is definitely a Pure Scalable Type.  */
200     IS_PST,
201 
202     /* The type is definitely not a Pure Scalable Type.  */
203     ISNT_PST,
204 
205     /* It doesn't matter for PCS purposes whether the type is a Pure
206        Scalable Type or not, since the type will be handled the same
207        way regardless.
208 
209        Specifically, this means that if the type is a Pure Scalable Type,
210        there aren't enough argument registers to hold it, and so it will
211        need to be passed or returned in memory.  If the type isn't a
212        Pure Scalable Type, it's too big to be passed or returned in core
213        or SIMD&FP registers, and so again will need to go in memory.  */
214     DOESNT_MATTER
215   };
216 
217   /* Aggregates of 17 bytes or more are normally passed and returned
218      in memory, so aggregates of that size can safely be analyzed as
219      DOESNT_MATTER.  We need to be able to collect enough pieces to
220      represent a PST that is smaller than that.  Since predicates are
221      2 bytes in size for -msve-vector-bits=128, that means we need to be
222      able to store at least 8 pieces.
223 
224      We also need to be able to store enough pieces to represent
225      a single vector in each vector argument register and a single
226      predicate in each predicate argument register.  This means that
227      we need at least 12 pieces.  */
228   static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
229   static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
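  /* Under the usual AAPCS64 register assignment (eight SIMD&FP argument
     registers V0-V7 and four SVE predicate argument registers P0-P3), the
     sum above works out to 8 + 4 = 12 pieces, which also satisfies the
     8-piece minimum checked by the assertion.  */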
230 
231   /* Describes one piece of a PST.  Each piece is one of:
232 
233      - a single Scalable Vector Type (SVT)
234      - a single Scalable Predicate Type (SPT)
235      - a PST containing 2, 3 or 4 SVTs, with no padding
236 
237      It either represents a single built-in type or a PST formed from
238      multiple homogeneous built-in types.  */
239   struct piece
240   {
241     rtx get_rtx (unsigned int, unsigned int) const;
242 
243     /* The number of vector and predicate registers that the piece
244        occupies.  One of the two is always zero.  */
245     unsigned int num_zr;
246     unsigned int num_pr;
247 
248     /* The mode of the registers described above.  */
249     machine_mode mode;
250 
251     /* If this piece is formed from multiple homogeneous built-in types,
252        this is the mode of the built-in types, otherwise it is MODE.  */
253     machine_mode orig_mode;
254 
255     /* The offset in bytes of the piece from the start of the type.  */
256     poly_uint64_pod offset;
257   };
258 
259   /* Divides types analyzed as IS_PST into individual pieces.  The pieces
260      are in memory order.  */
261   auto_vec<piece, MAX_PIECES> pieces;
262 
263   unsigned int num_zr () const;
264   unsigned int num_pr () const;
265 
266   rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
267 
268   analysis_result analyze (const_tree);
269   bool analyze_registers (const_tree);
270 
271 private:
272   analysis_result analyze_array (const_tree);
273   analysis_result analyze_record (const_tree);
274   void add_piece (const piece &);
275 };
276 }
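/* Illustrative usage sketch (hedged; the real callers appear later in this
   file): code that needs to classify an argument or return type might do
   something along these lines:

     pure_scalable_type_info pst_info;
     if (pst_info.analyze (type) == pure_scalable_type_info::IS_PST)
       {
         unsigned int nzr = pst_info.num_zr ();
         unsigned int npr = pst_info.num_pr ();
         ... decide whether nzr Z registers and npr P registers
             are still available for this argument ...
       }
  */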
277 
278 /* The current code model.  */
279 enum aarch64_code_model aarch64_cmodel;
280 
281 /* The number of 64-bit elements in an SVE vector.  */
282 poly_uint16 aarch64_sve_vg;
283 
284 #ifdef HAVE_AS_TLS
285 #undef TARGET_HAVE_TLS
286 #define TARGET_HAVE_TLS 1
287 #endif
288 
289 static bool aarch64_composite_type_p (const_tree, machine_mode);
290 static bool aarch64_return_in_memory_1 (const_tree);
291 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
292 						     const_tree,
293 						     machine_mode *, int *,
294 						     bool *, bool);
295 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
296 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
297 static void aarch64_override_options_after_change (void);
298 static bool aarch64_vector_mode_supported_p (machine_mode);
299 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
300 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
301 							 const_tree type,
302 							 int misalignment,
303 							 bool is_packed);
304 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
305 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
306 					    aarch64_addr_query_type);
307 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
308 
309 /* Major revision number of the ARM Architecture implemented by the target.  */
310 unsigned aarch64_architecture_version;
311 
312 /* The processor for which instructions should be scheduled.  */
313 enum aarch64_processor aarch64_tune = cortexa53;
314 
315 /* Mask to specify which instruction scheduling options should be used.  */
316 uint64_t aarch64_tune_flags = 0;
317 
318 /* Global flag for PC relative loads.  */
319 bool aarch64_pcrelative_literal_loads;
320 
321 /* Global flag for whether frame pointer is enabled.  */
322 bool aarch64_use_frame_pointer;
323 
324 #define BRANCH_PROTECT_STR_MAX 255
325 char *accepted_branch_protection_string = NULL;
326 
327 static enum aarch64_parse_opt_result
328 aarch64_parse_branch_protection (const char*, char**);
329 
330 /* Support for command line parsing of boolean flags in the tuning
331    structures.  */
332 struct aarch64_flag_desc
333 {
334   const char* name;
335   unsigned int flag;
336 };
337 
338 #define AARCH64_FUSION_PAIR(name, internal_name) \
339   { name, AARCH64_FUSE_##internal_name },
340 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
341 {
342   { "none", AARCH64_FUSE_NOTHING },
343 #include "aarch64-fusion-pairs.def"
344   { "all", AARCH64_FUSE_ALL },
345   { NULL, AARCH64_FUSE_NOTHING }
346 };
347 
348 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
349   { name, AARCH64_EXTRA_TUNE_##internal_name },
350 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
351 {
352   { "none", AARCH64_EXTRA_TUNE_NONE },
353 #include "aarch64-tuning-flags.def"
354   { "all", AARCH64_EXTRA_TUNE_ALL },
355   { NULL, AARCH64_EXTRA_TUNE_NONE }
356 };
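/* Both tables above use the "X macro" idiom: each entry in the included
   .def file is written as an invocation of the macro #defined immediately
   before the #include, so the tables stay in sync with the .def files
   automatically.  As an illustration (the authoritative entries live in
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def), an entry of the
   form

     AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH)

   expands under the #define above to the initializer

     { "cmp+branch", AARCH64_FUSE_CMP_BRANCH },

   pairing the string accepted on the command line with its internal
   flag.  */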
357 
358 /* Tuning parameters.  */
359 
360 static const struct cpu_addrcost_table generic_addrcost_table =
361 {
362     {
363       1, /* hi  */
364       0, /* si  */
365       0, /* di  */
366       1, /* ti  */
367     },
368   0, /* pre_modify  */
369   0, /* post_modify  */
370   0, /* post_modify_ld3_st3  */
371   0, /* post_modify_ld4_st4  */
372   0, /* register_offset  */
373   0, /* register_sextend  */
374   0, /* register_zextend  */
375   0 /* imm_offset  */
376 };
377 
378 static const struct cpu_addrcost_table exynosm1_addrcost_table =
379 {
380     {
381       0, /* hi  */
382       0, /* si  */
383       0, /* di  */
384       2, /* ti  */
385     },
386   0, /* pre_modify  */
387   0, /* post_modify  */
388   0, /* post_modify_ld3_st3  */
389   0, /* post_modify_ld4_st4  */
390   1, /* register_offset  */
391   1, /* register_sextend  */
392   2, /* register_zextend  */
393   0, /* imm_offset  */
394 };
395 
396 static const struct cpu_addrcost_table xgene1_addrcost_table =
397 {
398     {
399       1, /* hi  */
400       0, /* si  */
401       0, /* di  */
402       1, /* ti  */
403     },
404   1, /* pre_modify  */
405   1, /* post_modify  */
406   1, /* post_modify_ld3_st3  */
407   1, /* post_modify_ld4_st4  */
408   0, /* register_offset  */
409   1, /* register_sextend  */
410   1, /* register_zextend  */
411   0, /* imm_offset  */
412 };
413 
414 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
415 {
416     {
417       1, /* hi  */
418       1, /* si  */
419       1, /* di  */
420       2, /* ti  */
421     },
422   0, /* pre_modify  */
423   0, /* post_modify  */
424   0, /* post_modify_ld3_st3  */
425   0, /* post_modify_ld4_st4  */
426   2, /* register_offset  */
427   3, /* register_sextend  */
428   3, /* register_zextend  */
429   0, /* imm_offset  */
430 };
431 
432 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
433 {
434     {
435       1, /* hi  */
436       1, /* si  */
437       1, /* di  */
438       2, /* ti  */
439     },
440   0, /* pre_modify  */
441   0, /* post_modify  */
442   0, /* post_modify_ld3_st3  */
443   0, /* post_modify_ld4_st4  */
444   2, /* register_offset  */
445   3, /* register_sextend  */
446   3, /* register_zextend  */
447   0, /* imm_offset  */
448 };
449 
450 static const struct cpu_addrcost_table tsv110_addrcost_table =
451 {
452     {
453       1, /* hi  */
454       0, /* si  */
455       0, /* di  */
456       1, /* ti  */
457     },
458   0, /* pre_modify  */
459   0, /* post_modify  */
460   0, /* post_modify_ld3_st3  */
461   0, /* post_modify_ld4_st4  */
462   0, /* register_offset  */
463   1, /* register_sextend  */
464   1, /* register_zextend  */
465   0, /* imm_offset  */
466 };
467 
468 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
469 {
470     {
471       1, /* hi  */
472       1, /* si  */
473       1, /* di  */
474       2, /* ti  */
475     },
476   1, /* pre_modify  */
477   1, /* post_modify  */
478   1, /* post_modify_ld3_st3  */
479   1, /* post_modify_ld4_st4  */
480   3, /* register_offset  */
481   3, /* register_sextend  */
482   3, /* register_zextend  */
483   2, /* imm_offset  */
484 };
485 
486 static const struct cpu_addrcost_table a64fx_addrcost_table =
487 {
488     {
489       1, /* hi  */
490       1, /* si  */
491       1, /* di  */
492       2, /* ti  */
493     },
494   0, /* pre_modify  */
495   0, /* post_modify  */
496   0, /* post_modify_ld3_st3  */
497   0, /* post_modify_ld4_st4  */
498   2, /* register_offset  */
499   3, /* register_sextend  */
500   3, /* register_zextend  */
501   0, /* imm_offset  */
502 };
503 
504 static const struct cpu_addrcost_table neoversev1_addrcost_table =
505 {
506     {
507       1, /* hi  */
508       0, /* si  */
509       0, /* di  */
510       1, /* ti  */
511     },
512   0, /* pre_modify  */
513   0, /* post_modify  */
514   3, /* post_modify_ld3_st3  */
515   3, /* post_modify_ld4_st4  */
516   0, /* register_offset  */
517   0, /* register_sextend  */
518   0, /* register_zextend  */
519   0 /* imm_offset  */
520 };
521 
522 static const struct cpu_addrcost_table neoversen2_addrcost_table =
523 {
524     {
525       1, /* hi  */
526       0, /* si  */
527       0, /* di  */
528       1, /* ti  */
529     },
530   0, /* pre_modify  */
531   0, /* post_modify  */
532   2, /* post_modify_ld3_st3  */
533   2, /* post_modify_ld4_st4  */
534   0, /* register_offset  */
535   0, /* register_sextend  */
536   0, /* register_zextend  */
537   0 /* imm_offset  */
538 };
539 
540 static const struct cpu_addrcost_table neoversev2_addrcost_table =
541 {
542     {
543       1, /* hi  */
544       0, /* si  */
545       0, /* di  */
546       1, /* ti  */
547     },
548   0, /* pre_modify  */
549   0, /* post_modify  */
550   2, /* post_modify_ld3_st3  */
551   2, /* post_modify_ld4_st4  */
552   0, /* register_offset  */
553   0, /* register_sextend  */
554   0, /* register_zextend  */
555   0 /* imm_offset  */
556 };
557 
558 static const struct cpu_regmove_cost generic_regmove_cost =
559 {
560   1, /* GP2GP  */
561   /* Avoid the use of slow int<->fp moves for spilling by setting
562      their cost higher than memmov_cost.  */
563   5, /* GP2FP  */
564   5, /* FP2GP  */
565   2 /* FP2FP  */
566 };
567 
568 static const struct cpu_regmove_cost cortexa57_regmove_cost =
569 {
570   1, /* GP2GP  */
571   /* Avoid the use of slow int<->fp moves for spilling by setting
572      their cost higher than memmov_cost.  */
573   5, /* GP2FP  */
574   5, /* FP2GP  */
575   2 /* FP2FP  */
576 };
577 
578 static const struct cpu_regmove_cost cortexa53_regmove_cost =
579 {
580   1, /* GP2GP  */
581   /* Avoid the use of slow int<->fp moves for spilling by setting
582      their cost higher than memmov_cost.  */
583   5, /* GP2FP  */
584   5, /* FP2GP  */
585   2 /* FP2FP  */
586 };
587 
588 static const struct cpu_regmove_cost exynosm1_regmove_cost =
589 {
590   1, /* GP2GP  */
591   /* Avoid the use of slow int<->fp moves for spilling by setting
592      their cost higher than memmov_cost (the actual costs are 4 and 9).  */
593   9, /* GP2FP  */
594   9, /* FP2GP  */
595   1 /* FP2FP  */
596 };
597 
598 static const struct cpu_regmove_cost thunderx_regmove_cost =
599 {
600   2, /* GP2GP  */
601   2, /* GP2FP  */
602   6, /* FP2GP  */
603   4 /* FP2FP  */
604 };
605 
606 static const struct cpu_regmove_cost xgene1_regmove_cost =
607 {
608   1, /* GP2GP  */
609   /* Avoid the use of slow int<->fp moves for spilling by setting
610      their cost higher than memmov_cost.  */
611   8, /* GP2FP  */
612   8, /* FP2GP  */
613   2 /* FP2FP  */
614 };
615 
616 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
617 {
618   2, /* GP2GP  */
619   /* Avoid the use of int<->fp moves for spilling.  */
620   6, /* GP2FP  */
621   6, /* FP2GP  */
622   4 /* FP2FP  */
623 };
624 
625 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
626 {
627   1, /* GP2GP  */
628   /* Avoid the use of int<->fp moves for spilling.  */
629   5, /* GP2FP  */
630   6, /* FP2GP  */
631   3, /* FP2FP  */
632 };
633 
634 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
635 {
636   1, /* GP2GP  */
637   /* Avoid the use of int<->fp moves for spilling.  */
638   4, /* GP2FP  */
639   5, /* FP2GP  */
640   4  /* FP2FP  */
641 };
642 
643 static const struct cpu_regmove_cost tsv110_regmove_cost =
644 {
645   1, /* GP2GP  */
646   /* Avoid the use of slow int<->fp moves for spilling by setting
647      their cost higher than memmov_cost.  */
648   2, /* GP2FP  */
649   3, /* FP2GP  */
650   2  /* FP2FP  */
651 };
652 
653 static const struct cpu_regmove_cost a64fx_regmove_cost =
654 {
655   1, /* GP2GP  */
656   /* Avoid the use of slow int<->fp moves for spilling by setting
657      their cost higher than memmov_cost.  */
658   5, /* GP2FP  */
659   7, /* FP2GP  */
660   2 /* FP2FP  */
661 };
662 
663 static const struct cpu_regmove_cost neoversen2_regmove_cost =
664 {
665   1, /* GP2GP  */
666   /* Spilling via int<->fp moves instead of to memory is recommended,
667      so set realistic costs compared to memmov_cost.  */
668   3, /* GP2FP  */
669   2, /* FP2GP  */
670   2 /* FP2FP  */
671 };
672 
673 static const struct cpu_regmove_cost neoversev1_regmove_cost =
674 {
675   1, /* GP2GP  */
676   /* Spilling via int<->fp moves instead of to memory is recommended,
677      so set realistic costs compared to memmov_cost.  */
678   3, /* GP2FP  */
679   2, /* FP2GP  */
680   2 /* FP2FP  */
681 };
682 
683 static const struct cpu_regmove_cost neoversev2_regmove_cost =
684 {
685   1, /* GP2GP  */
686   /* Spilling via int<->fp moves instead of to memory is recommended,
687      so set realistic costs compared to memmov_cost.  */
688   3, /* GP2FP  */
689   2, /* FP2GP  */
690   2 /* FP2FP  */
691 };
692 
693 /* Generic costs for Advanced SIMD vector operations.   */
694 static const advsimd_vec_cost generic_advsimd_vector_cost =
695 {
696   1, /* int_stmt_cost  */
697   1, /* fp_stmt_cost  */
698   0, /* ld2_st2_permute_cost  */
699   0, /* ld3_st3_permute_cost  */
700   0, /* ld4_st4_permute_cost  */
701   2, /* permute_cost  */
702   2, /* reduc_i8_cost  */
703   2, /* reduc_i16_cost  */
704   2, /* reduc_i32_cost  */
705   2, /* reduc_i64_cost  */
706   2, /* reduc_f16_cost  */
707   2, /* reduc_f32_cost  */
708   2, /* reduc_f64_cost  */
709   2, /* store_elt_extra_cost  */
710   2, /* vec_to_scalar_cost  */
711   1, /* scalar_to_vec_cost  */
712   1, /* align_load_cost  */
713   1, /* unalign_load_cost  */
714   1, /* unalign_store_cost  */
715   1  /* store_cost  */
716 };
717 
718 /* Generic costs for SVE vector operations.  */
719 static const sve_vec_cost generic_sve_vector_cost =
720 {
721   {
722     1, /* int_stmt_cost  */
723     1, /* fp_stmt_cost  */
724     0, /* ld2_st2_permute_cost  */
725     0, /* ld3_st3_permute_cost  */
726     0, /* ld4_st4_permute_cost  */
727     2, /* permute_cost  */
728     2, /* reduc_i8_cost  */
729     2, /* reduc_i16_cost  */
730     2, /* reduc_i32_cost  */
731     2, /* reduc_i64_cost  */
732     2, /* reduc_f16_cost  */
733     2, /* reduc_f32_cost  */
734     2, /* reduc_f64_cost  */
735     2, /* store_elt_extra_cost  */
736     2, /* vec_to_scalar_cost  */
737     1, /* scalar_to_vec_cost  */
738     1, /* align_load_cost  */
739     1, /* unalign_load_cost  */
740     1, /* unalign_store_cost  */
741     1  /* store_cost  */
742   },
743   2, /* clast_cost  */
744   2, /* fadda_f16_cost  */
745   2, /* fadda_f32_cost  */
746   2, /* fadda_f64_cost  */
747   4, /* gather_load_x32_cost  */
748   2, /* gather_load_x64_cost  */
749   1 /* scatter_store_elt_cost  */
750 };
751 
752 /* Generic costs for vector insn classes.  */
753 static const struct cpu_vector_cost generic_vector_cost =
754 {
755   1, /* scalar_int_stmt_cost  */
756   1, /* scalar_fp_stmt_cost  */
757   1, /* scalar_load_cost  */
758   1, /* scalar_store_cost  */
759   3, /* cond_taken_branch_cost  */
760   1, /* cond_not_taken_branch_cost  */
761   &generic_advsimd_vector_cost, /* advsimd  */
762   &generic_sve_vector_cost, /* sve */
763   nullptr /* issue_info  */
764 };
765 
766 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
767 {
768   2, /* int_stmt_cost  */
769   5, /* fp_stmt_cost  */
770   0, /* ld2_st2_permute_cost  */
771   0, /* ld3_st3_permute_cost  */
772   0, /* ld4_st4_permute_cost  */
773   3, /* permute_cost  */
774   13, /* reduc_i8_cost  */
775   13, /* reduc_i16_cost  */
776   13, /* reduc_i32_cost  */
777   13, /* reduc_i64_cost  */
778   13, /* reduc_f16_cost  */
779   13, /* reduc_f32_cost  */
780   13, /* reduc_f64_cost  */
781   13, /* store_elt_extra_cost  */
782   13, /* vec_to_scalar_cost  */
783   4, /* scalar_to_vec_cost  */
784   6, /* align_load_cost  */
785   6, /* unalign_load_cost  */
786   1, /* unalign_store_cost  */
787   1  /* store_cost  */
788 };
789 
790 static const sve_vec_cost a64fx_sve_vector_cost =
791 {
792   {
793     2, /* int_stmt_cost  */
794     5, /* fp_stmt_cost  */
795     0, /* ld2_st2_permute_cost  */
796     0, /* ld3_st3_permute_cost  */
797     0, /* ld4_st4_permute_cost  */
798     3, /* permute_cost  */
799     13, /* reduc_i8_cost  */
800     13, /* reduc_i16_cost  */
801     13, /* reduc_i32_cost  */
802     13, /* reduc_i64_cost  */
803     13, /* reduc_f16_cost  */
804     13, /* reduc_f32_cost  */
805     13, /* reduc_f64_cost  */
806     13, /* store_elt_extra_cost  */
807     13, /* vec_to_scalar_cost  */
808     4, /* scalar_to_vec_cost  */
809     6, /* align_load_cost  */
810     6, /* unalign_load_cost  */
811     1, /* unalign_store_cost  */
812     1  /* store_cost  */
813   },
814   13, /* clast_cost  */
815   13, /* fadda_f16_cost  */
816   13, /* fadda_f32_cost  */
817   13, /* fadda_f64_cost  */
818   64, /* gather_load_x32_cost  */
819   32, /* gather_load_x64_cost  */
820   1 /* scatter_store_elt_cost  */
821 };
822 
823 static const struct cpu_vector_cost a64fx_vector_cost =
824 {
825   1, /* scalar_int_stmt_cost  */
826   5, /* scalar_fp_stmt_cost  */
827   4, /* scalar_load_cost  */
828   1, /* scalar_store_cost  */
829   3, /* cond_taken_branch_cost  */
830   1, /* cond_not_taken_branch_cost  */
831   &a64fx_advsimd_vector_cost, /* advsimd  */
832   &a64fx_sve_vector_cost, /* sve  */
833   nullptr /* issue_info  */
834 };
835 
836 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
837 {
838   1, /* int_stmt_cost  */
839   3, /* fp_stmt_cost  */
840   0, /* ld2_st2_permute_cost  */
841   0, /* ld3_st3_permute_cost  */
842   0, /* ld4_st4_permute_cost  */
843   2, /* permute_cost  */
844   1, /* reduc_i8_cost  */
845   1, /* reduc_i16_cost  */
846   1, /* reduc_i32_cost  */
847   1, /* reduc_i64_cost  */
848   1, /* reduc_f16_cost  */
849   1, /* reduc_f32_cost  */
850   1, /* reduc_f64_cost  */
851   1, /* store_elt_extra_cost  */
852   1, /* vec_to_scalar_cost  */
853   1, /* scalar_to_vec_cost  */
854   1, /* align_load_cost  */
855   1, /* unalign_load_cost  */
856   1, /* unalign_store_cost  */
857   1  /* store_cost  */
858 };
859 
860 /* QDF24XX costs for vector insn classes.  */
861 static const struct cpu_vector_cost qdf24xx_vector_cost =
862 {
863   1, /* scalar_int_stmt_cost  */
864   1, /* scalar_fp_stmt_cost  */
865   1, /* scalar_load_cost  */
866   1, /* scalar_store_cost  */
867   3, /* cond_taken_branch_cost  */
868   1, /* cond_not_taken_branch_cost  */
869   &qdf24xx_advsimd_vector_cost, /* advsimd  */
870   nullptr, /* sve  */
871   nullptr /* issue_info  */
872 };
873 
874 
875 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
876 {
877   4, /* int_stmt_cost  */
878   1, /* fp_stmt_cost  */
879   0, /* ld2_st2_permute_cost  */
880   0, /* ld3_st3_permute_cost  */
881   0, /* ld4_st4_permute_cost  */
882   4, /* permute_cost  */
883   2, /* reduc_i8_cost  */
884   2, /* reduc_i16_cost  */
885   2, /* reduc_i32_cost  */
886   2, /* reduc_i64_cost  */
887   2, /* reduc_f16_cost  */
888   2, /* reduc_f32_cost  */
889   2, /* reduc_f64_cost  */
890   2, /* store_elt_extra_cost  */
891   2, /* vec_to_scalar_cost  */
892   2, /* scalar_to_vec_cost  */
893   3, /* align_load_cost  */
894   5, /* unalign_load_cost  */
895   5, /* unalign_store_cost  */
896   1  /* store_cost  */
897 };
898 
899 /* ThunderX costs for vector insn classes.  */
900 static const struct cpu_vector_cost thunderx_vector_cost =
901 {
902   1, /* scalar_int_stmt_cost  */
903   1, /* scalar_fp_stmt_cost  */
904   3, /* scalar_load_cost  */
905   1, /* scalar_store_cost  */
906   3, /* cond_taken_branch_cost  */
907   3, /* cond_not_taken_branch_cost  */
908   &thunderx_advsimd_vector_cost, /* advsimd  */
909   nullptr, /* sve  */
910   nullptr /* issue_info  */
911 };
912 
913 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
914 {
915   2, /* int_stmt_cost  */
916   2, /* fp_stmt_cost  */
917   0, /* ld2_st2_permute_cost  */
918   0, /* ld3_st3_permute_cost  */
919   0, /* ld4_st4_permute_cost  */
920   2, /* permute_cost  */
921   3, /* reduc_i8_cost  */
922   3, /* reduc_i16_cost  */
923   3, /* reduc_i32_cost  */
924   3, /* reduc_i64_cost  */
925   3, /* reduc_f16_cost  */
926   3, /* reduc_f32_cost  */
927   3, /* reduc_f64_cost  */
928   3, /* store_elt_extra_cost  */
929   3, /* vec_to_scalar_cost  */
930   2, /* scalar_to_vec_cost  */
931   5, /* align_load_cost  */
932   5, /* unalign_load_cost  */
933   1, /* unalign_store_cost  */
934   1  /* store_cost  */
935 };
936 
937 static const struct cpu_vector_cost tsv110_vector_cost =
938 {
939   1, /* scalar_int_stmt_cost  */
940   1, /* scalar_fp_stmt_cost  */
941   5, /* scalar_load_cost  */
942   1, /* scalar_store_cost  */
943   1, /* cond_taken_branch_cost  */
944   1, /* cond_not_taken_branch_cost  */
945   &tsv110_advsimd_vector_cost, /* advsimd  */
946   nullptr, /* sve  */
947   nullptr /* issue_info  */
948 };
949 
950 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
951 {
952   2, /* int_stmt_cost  */
953   2, /* fp_stmt_cost  */
954   0, /* ld2_st2_permute_cost  */
955   0, /* ld3_st3_permute_cost  */
956   0, /* ld4_st4_permute_cost  */
957   3, /* permute_cost  */
958   8, /* reduc_i8_cost  */
959   8, /* reduc_i16_cost  */
960   8, /* reduc_i32_cost  */
961   8, /* reduc_i64_cost  */
962   8, /* reduc_f16_cost  */
963   8, /* reduc_f32_cost  */
964   8, /* reduc_f64_cost  */
965   8, /* store_elt_extra_cost  */
966   8, /* vec_to_scalar_cost  */
967   8, /* scalar_to_vec_cost  */
968   4, /* align_load_cost  */
969   4, /* unalign_load_cost  */
970   1, /* unalign_store_cost  */
971   1  /* store_cost  */
972 };
973 
974 /* Cortex-A57 costs for vector insn classes.  */
975 static const struct cpu_vector_cost cortexa57_vector_cost =
976 {
977   1, /* scalar_int_stmt_cost  */
978   1, /* scalar_fp_stmt_cost  */
979   4, /* scalar_load_cost  */
980   1, /* scalar_store_cost  */
981   1, /* cond_taken_branch_cost  */
982   1, /* cond_not_taken_branch_cost  */
983   &cortexa57_advsimd_vector_cost, /* advsimd  */
984   nullptr, /* sve  */
985   nullptr /* issue_info  */
986 };
987 
988 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
989 {
990   3, /* int_stmt_cost  */
991   3, /* fp_stmt_cost  */
992   0, /* ld2_st2_permute_cost  */
993   0, /* ld3_st3_permute_cost  */
994   0, /* ld4_st4_permute_cost  */
995   3, /* permute_cost  */
996   3, /* reduc_i8_cost  */
997   3, /* reduc_i16_cost  */
998   3, /* reduc_i32_cost  */
999   3, /* reduc_i64_cost  */
1000   3, /* reduc_f16_cost  */
1001   3, /* reduc_f32_cost  */
1002   3, /* reduc_f64_cost  */
1003   3, /* store_elt_extra_cost  */
1004   3, /* vec_to_scalar_cost  */
1005   3, /* scalar_to_vec_cost  */
1006   5, /* align_load_cost  */
1007   5, /* unalign_load_cost  */
1008   1, /* unalign_store_cost  */
1009   1  /* store_cost  */
1010 };
1011 
1012 static const struct cpu_vector_cost exynosm1_vector_cost =
1013 {
1014   1, /* scalar_int_stmt_cost  */
1015   1, /* scalar_fp_stmt_cost  */
1016   5, /* scalar_load_cost  */
1017   1, /* scalar_store_cost  */
1018   1, /* cond_taken_branch_cost  */
1019   1, /* cond_not_taken_branch_cost  */
1020   &exynosm1_advsimd_vector_cost, /* advsimd  */
1021   nullptr, /* sve  */
1022   nullptr /* issue_info  */
1023 };
1024 
1025 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1026 {
1027   2, /* int_stmt_cost  */
1028   2, /* fp_stmt_cost  */
1029   0, /* ld2_st2_permute_cost  */
1030   0, /* ld3_st3_permute_cost  */
1031   0, /* ld4_st4_permute_cost  */
1032   2, /* permute_cost  */
1033   4, /* reduc_i8_cost  */
1034   4, /* reduc_i16_cost  */
1035   4, /* reduc_i32_cost  */
1036   4, /* reduc_i64_cost  */
1037   4, /* reduc_f16_cost  */
1038   4, /* reduc_f32_cost  */
1039   4, /* reduc_f64_cost  */
1040   4, /* store_elt_extra_cost  */
1041   4, /* vec_to_scalar_cost  */
1042   4, /* scalar_to_vec_cost  */
1043   10, /* align_load_cost  */
1044   10, /* unalign_load_cost  */
1045   2, /* unalign_store_cost  */
1046   2  /* store_cost  */
1047 };
1048 
1049 /* X-Gene 1 costs for vector insn classes.  */
1050 static const struct cpu_vector_cost xgene1_vector_cost =
1051 {
1052   1, /* scalar_int_stmt_cost  */
1053   1, /* scalar_fp_stmt_cost  */
1054   5, /* scalar_load_cost  */
1055   1, /* scalar_store_cost  */
1056   2, /* cond_taken_branch_cost  */
1057   1, /* cond_not_taken_branch_cost  */
1058   &xgene1_advsimd_vector_cost, /* advsimd  */
1059   nullptr, /* sve  */
1060   nullptr /* issue_info  */
1061 };
1062 
1063 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1064 {
1065   4, /* int_stmt_cost  */
1066   5, /* fp_stmt_cost  */
1067   0, /* ld2_st2_permute_cost  */
1068   0, /* ld3_st3_permute_cost  */
1069   0, /* ld4_st4_permute_cost  */
1070   10, /* permute_cost  */
1071   6, /* reduc_i8_cost  */
1072   6, /* reduc_i16_cost  */
1073   6, /* reduc_i32_cost  */
1074   6, /* reduc_i64_cost  */
1075   6, /* reduc_f16_cost  */
1076   6, /* reduc_f32_cost  */
1077   6, /* reduc_f64_cost  */
1078   6, /* store_elt_extra_cost  */
1079   6, /* vec_to_scalar_cost  */
1080   5, /* scalar_to_vec_cost  */
1081   4, /* align_load_cost  */
1082   4, /* unalign_load_cost  */
1083   1, /* unalign_store_cost  */
1084   1  /* store_cost  */
1085 };
1086 
1087 /* Costs for vector insn classes for Vulcan.  */
1088 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1089 {
1090   1, /* scalar_int_stmt_cost  */
1091   6, /* scalar_fp_stmt_cost  */
1092   4, /* scalar_load_cost  */
1093   1, /* scalar_store_cost  */
1094   2, /* cond_taken_branch_cost  */
1095   1,  /* cond_not_taken_branch_cost  */
1096   &thunderx2t99_advsimd_vector_cost, /* advsimd  */
1097   nullptr, /* sve  */
1098   nullptr /* issue_info  */
1099 };
1100 
1101 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1102 {
1103   5, /* int_stmt_cost  */
1104   5, /* fp_stmt_cost  */
1105   0, /* ld2_st2_permute_cost  */
1106   0, /* ld3_st3_permute_cost  */
1107   0, /* ld4_st4_permute_cost  */
1108   10, /* permute_cost  */
1109   5, /* reduc_i8_cost  */
1110   5, /* reduc_i16_cost  */
1111   5, /* reduc_i32_cost  */
1112   5, /* reduc_i64_cost  */
1113   5, /* reduc_f16_cost  */
1114   5, /* reduc_f32_cost  */
1115   5, /* reduc_f64_cost  */
1116   5, /* store_elt_extra_cost  */
1117   5, /* vec_to_scalar_cost  */
1118   5, /* scalar_to_vec_cost  */
1119   4, /* align_load_cost  */
1120   4, /* unalign_load_cost  */
1121   4, /* unalign_store_cost  */
1122   4  /* store_cost  */
1123 };
1124 
1125 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1126 {
1127   1, /* scalar_int_stmt_cost  */
1128   5, /* scalar_fp_stmt_cost  */
1129   4, /* scalar_load_cost  */
1130   1, /* scalar_store_cost  */
1131   2, /* cond_taken_branch_cost  */
1132   1,  /* cond_not_taken_branch_cost  */
1133   &thunderx3t110_advsimd_vector_cost, /* advsimd  */
1134   nullptr, /* sve  */
1135   nullptr /* issue_info  */
1136 };
1137 
1138 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1139 {
1140   1, /* int_stmt_cost  */
1141   3, /* fp_stmt_cost  */
1142   0, /* ld2_st2_permute_cost  */
1143   0, /* ld3_st3_permute_cost  */
1144   0, /* ld4_st4_permute_cost  */
1145   2, /* permute_cost  */
1146   12, /* reduc_i8_cost  */
1147   9, /* reduc_i16_cost  */
1148   6, /* reduc_i32_cost  */
1149   5, /* reduc_i64_cost  */
1150   9, /* reduc_f16_cost  */
1151   6, /* reduc_f32_cost  */
1152   5, /* reduc_f64_cost  */
1153   8, /* store_elt_extra_cost  */
1154   6, /* vec_to_scalar_cost  */
1155   7, /* scalar_to_vec_cost  */
1156   4, /* align_load_cost  */
1157   4, /* unalign_load_cost  */
1158   1, /* unalign_store_cost  */
1159   1  /* store_cost  */
1160 };
1161 
1162 /* Ampere-1 costs for vector insn classes.  */
1163 static const struct cpu_vector_cost ampere1_vector_cost =
1164 {
1165   1, /* scalar_int_stmt_cost  */
1166   3, /* scalar_fp_stmt_cost  */
1167   4, /* scalar_load_cost  */
1168   1, /* scalar_store_cost  */
1169   1, /* cond_taken_branch_cost  */
1170   1, /* cond_not_taken_branch_cost  */
1171   &ampere1_advsimd_vector_cost, /* advsimd  */
1172   nullptr, /* sve  */
1173   nullptr  /* issue_info  */
1174 };
1175 
1176 /* Generic costs for branch instructions.  */
1177 static const struct cpu_branch_cost generic_branch_cost =
1178 {
1179   1,  /* Predictable.  */
1180   3   /* Unpredictable.  */
1181 };
1182 
1183 /* Generic approximation modes.  */
1184 static const cpu_approx_modes generic_approx_modes =
1185 {
1186   AARCH64_APPROX_NONE,	/* division  */
1187   AARCH64_APPROX_NONE,	/* sqrt  */
1188   AARCH64_APPROX_NONE	/* recip_sqrt  */
1189 };
1190 
1191 /* Approximation modes for Exynos M1.  */
1192 static const cpu_approx_modes exynosm1_approx_modes =
1193 {
1194   AARCH64_APPROX_NONE,	/* division  */
1195   AARCH64_APPROX_ALL,	/* sqrt  */
1196   AARCH64_APPROX_ALL	/* recip_sqrt  */
1197 };
1198 
1199 /* Approximation modes for X-Gene 1.  */
1200 static const cpu_approx_modes xgene1_approx_modes =
1201 {
1202   AARCH64_APPROX_NONE,	/* division  */
1203   AARCH64_APPROX_NONE,	/* sqrt  */
1204   AARCH64_APPROX_ALL	/* recip_sqrt  */
1205 };
1206 
1207 /* Generic prefetch settings (which disable prefetch).  */
1208 static const cpu_prefetch_tune generic_prefetch_tune =
1209 {
1210   0,			/* num_slots  */
1211   -1,			/* l1_cache_size  */
1212   -1,			/* l1_cache_line_size  */
1213   -1,			/* l2_cache_size  */
1214   true,			/* prefetch_dynamic_strides */
1215   -1,			/* minimum_stride */
1216   -1			/* default_opt_level  */
1217 };
1218 
1219 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1220 {
1221   0,			/* num_slots  */
1222   -1,			/* l1_cache_size  */
1223   64,			/* l1_cache_line_size  */
1224   -1,			/* l2_cache_size  */
1225   true,			/* prefetch_dynamic_strides */
1226   -1,			/* minimum_stride */
1227   -1			/* default_opt_level  */
1228 };
1229 
1230 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1231 {
1232   4,			/* num_slots  */
1233   32,			/* l1_cache_size  */
1234   64,			/* l1_cache_line_size  */
1235   512,			/* l2_cache_size  */
1236   false,		/* prefetch_dynamic_strides */
1237   2048,			/* minimum_stride */
1238   3			/* default_opt_level  */
1239 };
1240 
1241 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1242 {
1243   8,			/* num_slots  */
1244   32,			/* l1_cache_size  */
1245   128,			/* l1_cache_line_size  */
1246   16*1024,		/* l2_cache_size  */
1247   true,			/* prefetch_dynamic_strides */
1248   -1,			/* minimum_stride */
1249   3			/* default_opt_level  */
1250 };
1251 
1252 static const cpu_prefetch_tune thunderx_prefetch_tune =
1253 {
1254   8,			/* num_slots  */
1255   32,			/* l1_cache_size  */
1256   128,			/* l1_cache_line_size  */
1257   -1,			/* l2_cache_size  */
1258   true,			/* prefetch_dynamic_strides */
1259   -1,			/* minimum_stride */
1260   -1			/* default_opt_level  */
1261 };
1262 
1263 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1264 {
1265   8,			/* num_slots  */
1266   32,			/* l1_cache_size  */
1267   64,			/* l1_cache_line_size  */
1268   256,			/* l2_cache_size  */
1269   true,			/* prefetch_dynamic_strides */
1270   -1,			/* minimum_stride */
1271   -1			/* default_opt_level  */
1272 };
1273 
1274 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1275 {
1276   8,			/* num_slots  */
1277   32,			/* l1_cache_size  */
1278   64,			/* l1_cache_line_size  */
1279   256,			/* l2_cache_size  */
1280   true,			/* prefetch_dynamic_strides */
1281   -1,			/* minimum_stride */
1282   -1			/* default_opt_level  */
1283 };
1284 
1285 static const cpu_prefetch_tune tsv110_prefetch_tune =
1286 {
1287   0,                    /* num_slots  */
1288   64,                   /* l1_cache_size  */
1289   64,                   /* l1_cache_line_size  */
1290   512,                  /* l2_cache_size  */
1291   true,                 /* prefetch_dynamic_strides */
1292   -1,                   /* minimum_stride */
1293   -1                    /* default_opt_level  */
1294 };
1295 
1296 static const cpu_prefetch_tune xgene1_prefetch_tune =
1297 {
1298   8,			/* num_slots  */
1299   32,			/* l1_cache_size  */
1300   64,			/* l1_cache_line_size  */
1301   256,			/* l2_cache_size  */
1302   true,                 /* prefetch_dynamic_strides */
1303   -1,                   /* minimum_stride */
1304   -1			/* default_opt_level  */
1305 };
1306 
1307 static const cpu_prefetch_tune a64fx_prefetch_tune =
1308 {
1309   8,			/* num_slots  */
1310   64,			/* l1_cache_size  */
1311   256,			/* l1_cache_line_size  */
1312   32768,		/* l2_cache_size  */
1313   true,			/* prefetch_dynamic_strides */
1314   -1,			/* minimum_stride */
1315   -1			/* default_opt_level  */
1316 };
1317 
1318 static const cpu_prefetch_tune ampere1_prefetch_tune =
1319 {
1320   0,			/* num_slots  */
1321   64,			/* l1_cache_size  */
1322   64,			/* l1_cache_line_size  */
1323   2048,			/* l2_cache_size  */
1324   true,			/* prefetch_dynamic_strides */
1325   -1,			/* minimum_stride */
1326   -1			/* default_opt_level  */
1327 };
1328 
1329 static const struct tune_params generic_tunings =
1330 {
1331   &cortexa57_extra_costs,
1332   &generic_addrcost_table,
1333   &generic_regmove_cost,
1334   &generic_vector_cost,
1335   &generic_branch_cost,
1336   &generic_approx_modes,
1337   SVE_NOT_IMPLEMENTED, /* sve_width  */
1338   { 4, /* load_int.  */
1339     4, /* store_int.  */
1340     4, /* load_fp.  */
1341     4, /* store_fp.  */
1342     4, /* load_pred.  */
1343     4 /* store_pred.  */
1344   }, /* memmov_cost.  */
1345   2, /* issue_rate  */
1346   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1347   "16:12",	/* function_align.  */
1348   "4",	/* jump_align.  */
1349   "8",	/* loop_align.  */
1350   2,	/* int_reassoc_width.  */
1351   4,	/* fp_reassoc_width.  */
1352   1,	/* vec_reassoc_width.  */
1353   2,	/* min_div_recip_mul_sf.  */
1354   2,	/* min_div_recip_mul_df.  */
1355   0,	/* max_case_values.  */
1356   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1357   /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1358      Neoverse V1.  It does not have a noticeable effect on A64FX and should
1359      have at most a very minor effect on SVE2 cores.  */
1360   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS),	/* tune_flags.  */
1361   &generic_prefetch_tune
1362 };
1363 
1364 static const struct tune_params cortexa35_tunings =
1365 {
1366   &cortexa53_extra_costs,
1367   &generic_addrcost_table,
1368   &cortexa53_regmove_cost,
1369   &generic_vector_cost,
1370   &generic_branch_cost,
1371   &generic_approx_modes,
1372   SVE_NOT_IMPLEMENTED, /* sve_width  */
1373   { 4, /* load_int.  */
1374     4, /* store_int.  */
1375     4, /* load_fp.  */
1376     4, /* store_fp.  */
1377     4, /* load_pred.  */
1378     4 /* store_pred.  */
1379   }, /* memmov_cost.  */
1380   1, /* issue_rate  */
1381   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1382    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1383   "16",	/* function_align.  */
1384   "4",	/* jump_align.  */
1385   "8",	/* loop_align.  */
1386   2,	/* int_reassoc_width.  */
1387   4,	/* fp_reassoc_width.  */
1388   1,	/* vec_reassoc_width.  */
1389   2,	/* min_div_recip_mul_sf.  */
1390   2,	/* min_div_recip_mul_df.  */
1391   0,	/* max_case_values.  */
1392   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1393   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1394   &generic_prefetch_tune
1395 };
1396 
1397 static const struct tune_params cortexa53_tunings =
1398 {
1399   &cortexa53_extra_costs,
1400   &generic_addrcost_table,
1401   &cortexa53_regmove_cost,
1402   &generic_vector_cost,
1403   &generic_branch_cost,
1404   &generic_approx_modes,
1405   SVE_NOT_IMPLEMENTED, /* sve_width  */
1406   { 4, /* load_int.  */
1407     4, /* store_int.  */
1408     4, /* load_fp.  */
1409     4, /* store_fp.  */
1410     4, /* load_pred.  */
1411     4 /* store_pred.  */
1412   }, /* memmov_cost.  */
1413   2, /* issue_rate  */
1414   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1415    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1416   "16",	/* function_align.  */
1417   "4",	/* jump_align.  */
1418   "8",	/* loop_align.  */
1419   2,	/* int_reassoc_width.  */
1420   4,	/* fp_reassoc_width.  */
1421   1,	/* vec_reassoc_width.  */
1422   2,	/* min_div_recip_mul_sf.  */
1423   2,	/* min_div_recip_mul_df.  */
1424   0,	/* max_case_values.  */
1425   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1426   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1427   &generic_prefetch_tune
1428 };
1429 
1430 static const struct tune_params cortexa57_tunings =
1431 {
1432   &cortexa57_extra_costs,
1433   &generic_addrcost_table,
1434   &cortexa57_regmove_cost,
1435   &cortexa57_vector_cost,
1436   &generic_branch_cost,
1437   &generic_approx_modes,
1438   SVE_NOT_IMPLEMENTED, /* sve_width  */
1439   { 4, /* load_int.  */
1440     4, /* store_int.  */
1441     4, /* load_fp.  */
1442     4, /* store_fp.  */
1443     4, /* load_pred.  */
1444     4 /* store_pred.  */
1445   }, /* memmov_cost.  */
1446   3, /* issue_rate  */
1447   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1448    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1449   "16",	/* function_align.  */
1450   "4",	/* jump_align.  */
1451   "8",	/* loop_align.  */
1452   2,	/* int_reassoc_width.  */
1453   4,	/* fp_reassoc_width.  */
1454   1,	/* vec_reassoc_width.  */
1455   2,	/* min_div_recip_mul_sf.  */
1456   2,	/* min_div_recip_mul_df.  */
1457   0,	/* max_case_values.  */
1458   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1459   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
1460   &generic_prefetch_tune
1461 };
1462 
1463 static const struct tune_params cortexa72_tunings =
1464 {
1465   &cortexa57_extra_costs,
1466   &generic_addrcost_table,
1467   &cortexa57_regmove_cost,
1468   &cortexa57_vector_cost,
1469   &generic_branch_cost,
1470   &generic_approx_modes,
1471   SVE_NOT_IMPLEMENTED, /* sve_width  */
1472   { 4, /* load_int.  */
1473     4, /* store_int.  */
1474     4, /* load_fp.  */
1475     4, /* store_fp.  */
1476     4, /* load_pred.  */
1477     4 /* store_pred.  */
1478   }, /* memmov_cost.  */
1479   3, /* issue_rate  */
1480   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1481    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1482   "16",	/* function_align.  */
1483   "4",	/* jump_align.  */
1484   "8",	/* loop_align.  */
1485   2,	/* int_reassoc_width.  */
1486   4,	/* fp_reassoc_width.  */
1487   1,	/* vec_reassoc_width.  */
1488   2,	/* min_div_recip_mul_sf.  */
1489   2,	/* min_div_recip_mul_df.  */
1490   0,	/* max_case_values.  */
1491   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1492   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1493   &generic_prefetch_tune
1494 };
1495 
1496 static const struct tune_params cortexa73_tunings =
1497 {
1498   &cortexa57_extra_costs,
1499   &generic_addrcost_table,
1500   &cortexa57_regmove_cost,
1501   &cortexa57_vector_cost,
1502   &generic_branch_cost,
1503   &generic_approx_modes,
1504   SVE_NOT_IMPLEMENTED, /* sve_width  */
1505   { 4, /* load_int.  */
1506     4, /* store_int.  */
1507     4, /* load_fp.  */
1508     4, /* store_fp.  */
1509     4, /* load_pred.  */
1510     4 /* store_pred.  */
1511   }, /* memmov_cost.  */
1512   2, /* issue_rate.  */
1513   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1514    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
1515   "16",	/* function_align.  */
1516   "4",	/* jump_align.  */
1517   "8",	/* loop_align.  */
1518   2,	/* int_reassoc_width.  */
1519   4,	/* fp_reassoc_width.  */
1520   1,	/* vec_reassoc_width.  */
1521   2,	/* min_div_recip_mul_sf.  */
1522   2,	/* min_div_recip_mul_df.  */
1523   0,	/* max_case_values.  */
1524   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1525   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1526   &generic_prefetch_tune
1527 };
1528 
1529 
1530 
1531 static const struct tune_params exynosm1_tunings =
1532 {
1533   &exynosm1_extra_costs,
1534   &exynosm1_addrcost_table,
1535   &exynosm1_regmove_cost,
1536   &exynosm1_vector_cost,
1537   &generic_branch_cost,
1538   &exynosm1_approx_modes,
1539   SVE_NOT_IMPLEMENTED, /* sve_width  */
1540   { 4, /* load_int.  */
1541     4, /* store_int.  */
1542     4, /* load_fp.  */
1543     4, /* store_fp.  */
1544     4, /* load_pred.  */
1545     4 /* store_pred.  */
1546   }, /* memmov_cost.  */
1547   3,	/* issue_rate  */
1548   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
1549   "4",	/* function_align.  */
1550   "4",	/* jump_align.  */
1551   "4",	/* loop_align.  */
1552   2,	/* int_reassoc_width.  */
1553   4,	/* fp_reassoc_width.  */
1554   1,	/* vec_reassoc_width.  */
1555   2,	/* min_div_recip_mul_sf.  */
1556   2,	/* min_div_recip_mul_df.  */
1557   48,	/* max_case_values.  */
1558   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
1559   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
1560   &exynosm1_prefetch_tune
1561 };
1562 
1563 static const struct tune_params thunderxt88_tunings =
1564 {
1565   &thunderx_extra_costs,
1566   &generic_addrcost_table,
1567   &thunderx_regmove_cost,
1568   &thunderx_vector_cost,
1569   &generic_branch_cost,
1570   &generic_approx_modes,
1571   SVE_NOT_IMPLEMENTED, /* sve_width  */
1572   { 6, /* load_int.  */
1573     6, /* store_int.  */
1574     6, /* load_fp.  */
1575     6, /* store_fp.  */
1576     6, /* load_pred.  */
1577     6 /* store_pred.  */
1578   }, /* memmov_cost.  */
1579   2, /* issue_rate  */
1580   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1581   "8",	/* function_align.  */
1582   "8",	/* jump_align.  */
1583   "8",	/* loop_align.  */
1584   2,	/* int_reassoc_width.  */
1585   4,	/* fp_reassoc_width.  */
1586   1,	/* vec_reassoc_width.  */
1587   2,	/* min_div_recip_mul_sf.  */
1588   2,	/* min_div_recip_mul_df.  */
1589   0,	/* max_case_values.  */
1590   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1591   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
1592   &thunderxt88_prefetch_tune
1593 };
1594 
1595 static const struct tune_params thunderx_tunings =
1596 {
1597   &thunderx_extra_costs,
1598   &generic_addrcost_table,
1599   &thunderx_regmove_cost,
1600   &thunderx_vector_cost,
1601   &generic_branch_cost,
1602   &generic_approx_modes,
1603   SVE_NOT_IMPLEMENTED, /* sve_width  */
1604   { 6, /* load_int.  */
1605     6, /* store_int.  */
1606     6, /* load_fp.  */
1607     6, /* store_fp.  */
1608     6, /* load_pred.  */
1609     6 /* store_pred.  */
1610   }, /* memmov_cost.  */
1611   2, /* issue_rate  */
1612   AARCH64_FUSE_ALU_BRANCH, /* fusible_ops  */
1613   "8",	/* function_align.  */
1614   "8",	/* jump_align.  */
1615   "8",	/* loop_align.  */
1616   2,	/* int_reassoc_width.  */
1617   4,	/* fp_reassoc_width.  */
1618   1,	/* vec_reassoc_width.  */
1619   2,	/* min_div_recip_mul_sf.  */
1620   2,	/* min_div_recip_mul_df.  */
1621   0,	/* max_case_values.  */
1622   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1623   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1624    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
1625   &thunderx_prefetch_tune
1626 };
1627 
1628 static const struct tune_params tsv110_tunings =
1629 {
1630   &tsv110_extra_costs,
1631   &tsv110_addrcost_table,
1632   &tsv110_regmove_cost,
1633   &tsv110_vector_cost,
1634   &generic_branch_cost,
1635   &generic_approx_modes,
1636   SVE_NOT_IMPLEMENTED, /* sve_width  */
1637   { 4, /* load_int.  */
1638     4, /* store_int.  */
1639     4, /* load_fp.  */
1640     4, /* store_fp.  */
1641     4, /* load_pred.  */
1642     4 /* store_pred.  */
1643   }, /* memmov_cost.  */
1644   4,    /* issue_rate  */
1645   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1646    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1647   "16", /* function_align.  */
1648   "4",  /* jump_align.  */
1649   "8",  /* loop_align.  */
1650   2,    /* int_reassoc_width.  */
1651   4,    /* fp_reassoc_width.  */
1652   1,    /* vec_reassoc_width.  */
1653   2,    /* min_div_recip_mul_sf.  */
1654   2,    /* min_div_recip_mul_df.  */
1655   0,    /* max_case_values.  */
1656   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
1657   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
1658   &tsv110_prefetch_tune
1659 };
1660 
1661 static const struct tune_params xgene1_tunings =
1662 {
1663   &xgene1_extra_costs,
1664   &xgene1_addrcost_table,
1665   &xgene1_regmove_cost,
1666   &xgene1_vector_cost,
1667   &generic_branch_cost,
1668   &xgene1_approx_modes,
1669   SVE_NOT_IMPLEMENTED, /* sve_width  */
1670   { 6, /* load_int.  */
1671     6, /* store_int.  */
1672     6, /* load_fp.  */
1673     6, /* store_fp.  */
1674     6, /* load_pred.  */
1675     6 /* store_pred.  */
1676   }, /* memmov_cost.  */
1677   4, /* issue_rate  */
1678   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1679   "16",	/* function_align.  */
1680   "16",	/* jump_align.  */
1681   "16",	/* loop_align.  */
1682   2,	/* int_reassoc_width.  */
1683   4,	/* fp_reassoc_width.  */
1684   1,	/* vec_reassoc_width.  */
1685   2,	/* min_div_recip_mul_sf.  */
1686   2,	/* min_div_recip_mul_df.  */
1687   17,	/* max_case_values.  */
1688   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1689   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1690   &xgene1_prefetch_tune
1691 };
1692 
1693 static const struct tune_params emag_tunings =
1694 {
1695   &xgene1_extra_costs,
1696   &xgene1_addrcost_table,
1697   &xgene1_regmove_cost,
1698   &xgene1_vector_cost,
1699   &generic_branch_cost,
1700   &xgene1_approx_modes,
1701   SVE_NOT_IMPLEMENTED,
1702   { 6, /* load_int.  */
1703     6, /* store_int.  */
1704     6, /* load_fp.  */
1705     6, /* store_fp.  */
1706     6, /* load_pred.  */
1707     6 /* store_pred.  */
1708   }, /* memmov_cost.  */
1709   4, /* issue_rate  */
1710   AARCH64_FUSE_NOTHING, /* fusible_ops  */
1711   "16",	/* function_align.  */
1712   "16",	/* jump_align.  */
1713   "16",	/* loop_align.  */
1714   2,	/* int_reassoc_width.  */
1715   4,	/* fp_reassoc_width.  */
1716   1,	/* vec_reassoc_width.  */
1717   2,	/* min_div_recip_mul_sf.  */
1718   2,	/* min_div_recip_mul_df.  */
1719   17,	/* max_case_values.  */
1720   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1721   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1722   &xgene1_prefetch_tune
1723 };
1724 
1725 static const struct tune_params qdf24xx_tunings =
1726 {
1727   &qdf24xx_extra_costs,
1728   &qdf24xx_addrcost_table,
1729   &qdf24xx_regmove_cost,
1730   &qdf24xx_vector_cost,
1731   &generic_branch_cost,
1732   &generic_approx_modes,
1733   SVE_NOT_IMPLEMENTED, /* sve_width  */
1734   { 4, /* load_int.  */
1735     4, /* store_int.  */
1736     4, /* load_fp.  */
1737     4, /* store_fp.  */
1738     4, /* load_pred.  */
1739     4 /* store_pred.  */
1740   }, /* memmov_cost.  */
1741   4, /* issue_rate  */
1742   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1743    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1744   "16",	/* function_align.  */
1745   "8",	/* jump_align.  */
1746   "16",	/* loop_align.  */
1747   2,	/* int_reassoc_width.  */
1748   4,	/* fp_reassoc_width.  */
1749   1,	/* vec_reassoc_width.  */
1750   2,	/* min_div_recip_mul_sf.  */
1751   2,	/* min_div_recip_mul_df.  */
1752   0,	/* max_case_values.  */
1753   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1754   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1755   &qdf24xx_prefetch_tune
1756 };
1757 
1758 /* Tuning structure for the Qualcomm Saphira core.  Default to generic
1759    values for now.  */
1760 static const struct tune_params saphira_tunings =
1761 {
1762   &generic_extra_costs,
1763   &generic_addrcost_table,
1764   &generic_regmove_cost,
1765   &generic_vector_cost,
1766   &generic_branch_cost,
1767   &generic_approx_modes,
1768   SVE_NOT_IMPLEMENTED, /* sve_width  */
1769   { 4, /* load_int.  */
1770     4, /* store_int.  */
1771     4, /* load_fp.  */
1772     4, /* store_fp.  */
1773     4, /* load_pred.  */
1774     4 /* store_pred.  */
1775   }, /* memmov_cost.  */
1776   4, /* issue_rate  */
1777   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1778    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
1779   "16",	/* function_align.  */
1780   "8",	/* jump_align.  */
1781   "16",	/* loop_align.  */
1782   2,	/* int_reassoc_width.  */
1783   4,	/* fp_reassoc_width.  */
1784   1,	/* vec_reassoc_width.  */
1785   2,	/* min_div_recip_mul_sf.  */
1786   2,	/* min_div_recip_mul_df.  */
1787   0,	/* max_case_values.  */
1788   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1789   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1790   &generic_prefetch_tune
1791 };
1792 
1793 static const struct tune_params thunderx2t99_tunings =
1794 {
1795   &thunderx2t99_extra_costs,
1796   &thunderx2t99_addrcost_table,
1797   &thunderx2t99_regmove_cost,
1798   &thunderx2t99_vector_cost,
1799   &generic_branch_cost,
1800   &generic_approx_modes,
1801   SVE_NOT_IMPLEMENTED, /* sve_width  */
1802   { 4, /* load_int.  */
1803     4, /* store_int.  */
1804     4, /* load_fp.  */
1805     4, /* store_fp.  */
1806     4, /* load_pred.  */
1807     4 /* store_pred.  */
1808   }, /* memmov_cost.  */
1809   4, /* issue_rate.  */
1810   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1811    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1812   "16",	/* function_align.  */
1813   "8",	/* jump_align.  */
1814   "16",	/* loop_align.  */
1815   3,	/* int_reassoc_width.  */
1816   2,	/* fp_reassoc_width.  */
1817   2,	/* vec_reassoc_width.  */
1818   2,	/* min_div_recip_mul_sf.  */
1819   2,	/* min_div_recip_mul_df.  */
1820   0,	/* max_case_values.  */
1821   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1822   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1823   &thunderx2t99_prefetch_tune
1824 };
1825 
1826 static const struct tune_params thunderx3t110_tunings =
1827 {
1828   &thunderx3t110_extra_costs,
1829   &thunderx3t110_addrcost_table,
1830   &thunderx3t110_regmove_cost,
1831   &thunderx3t110_vector_cost,
1832   &generic_branch_cost,
1833   &generic_approx_modes,
1834   SVE_NOT_IMPLEMENTED, /* sve_width  */
1835   { 4, /* load_int.  */
1836     4, /* store_int.  */
1837     4, /* load_fp.  */
1838     4, /* store_fp.  */
1839     4, /* load_pred.  */
1840     4 /* store_pred.  */
1841   }, /* memmov_cost.  */
1842   6, /* issue_rate.  */
1843   (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1844    | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
1845   "16",	/* function_align.  */
1846   "8",	/* jump_align.  */
1847   "16",	/* loop_align.  */
1848   3,	/* int_reassoc_width.  */
1849   2,	/* fp_reassoc_width.  */
1850   2,	/* vec_reassoc_width.  */
1851   2,	/* min_div_recip_mul_sf.  */
1852   2,	/* min_div_recip_mul_df.  */
1853   0,	/* max_case_values.  */
1854   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1855   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1856   &thunderx3t110_prefetch_tune
1857 };
1858 
1859 static const struct tune_params neoversen1_tunings =
1860 {
1861   &cortexa76_extra_costs,
1862   &generic_addrcost_table,
1863   &generic_regmove_cost,
1864   &cortexa57_vector_cost,
1865   &generic_branch_cost,
1866   &generic_approx_modes,
1867   SVE_NOT_IMPLEMENTED, /* sve_width  */
1868   { 4, /* load_int.  */
1869     2, /* store_int.  */
1870     5, /* load_fp.  */
1871     2, /* store_fp.  */
1872     4, /* load_pred.  */
1873     4 /* store_pred.  */
1874   }, /* memmov_cost.  */
1875   3, /* issue_rate  */
1876   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
1877   "32:16",	/* function_align.  */
1878   "4",		/* jump_align.  */
1879   "32:16",	/* loop_align.  */
1880   2,	/* int_reassoc_width.  */
1881   4,	/* fp_reassoc_width.  */
1882   2,	/* vec_reassoc_width.  */
1883   2,	/* min_div_recip_mul_sf.  */
1884   2,	/* min_div_recip_mul_df.  */
1885   0,	/* max_case_values.  */
1886   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1887   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
1888   &generic_prefetch_tune
1889 };
1890 
1891 static const struct tune_params ampere1_tunings =
1892 {
1893   &ampere1_extra_costs,
1894   &generic_addrcost_table,
1895   &generic_regmove_cost,
1896   &ampere1_vector_cost,
1897   &generic_branch_cost,
1898   &generic_approx_modes,
1899   SVE_NOT_IMPLEMENTED, /* sve_width  */
1900   { 4, /* load_int.  */
1901     4, /* store_int.  */
1902     4, /* load_fp.  */
1903     4, /* store_fp.  */
1904     4, /* load_pred.  */
1905     4 /* store_pred.  */
1906   }, /* memmov_cost.  */
1907   4, /* issue_rate  */
1908   (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1909    AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1910    AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1911    AARCH64_FUSE_CMP_BRANCH),
1912   /* fusible_ops  */
1913   "32",		/* function_align.  */
1914   "4",		/* jump_align.  */
1915   "32:16",	/* loop_align.  */
1916   2,	/* int_reassoc_width.  */
1917   4,	/* fp_reassoc_width.  */
1918   2,	/* vec_reassoc_width.  */
1919   2,	/* min_div_recip_mul_sf.  */
1920   2,	/* min_div_recip_mul_df.  */
1921   0,	/* max_case_values.  */
1922   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1923   (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE),	/* tune_flags.  */
1924   &ampere1_prefetch_tune
1925 };
1926 
1927 static const struct tune_params ampere1a_tunings =
1928 {
1929   &ampere1a_extra_costs,
1930   &generic_addrcost_table,
1931   &generic_regmove_cost,
1932   &ampere1_vector_cost,
1933   &generic_branch_cost,
1934   &generic_approx_modes,
1935   SVE_NOT_IMPLEMENTED, /* sve_width  */
1936   { 4, /* load_int.  */
1937     4, /* store_int.  */
1938     4, /* load_fp.  */
1939     4, /* store_fp.  */
1940     4, /* load_pred.  */
1941     4 /* store_pred.  */
1942   }, /* memmov_cost.  */
1943   4, /* issue_rate  */
1944   (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1945    AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1946    AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1947    AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1948    AARCH64_FUSE_ADDSUB_2REG_CONST1),
1949   /* fusible_ops  */
1950   "32",		/* function_align.  */
1951   "4",		/* jump_align.  */
1952   "32:16",	/* loop_align.  */
1953   2,	/* int_reassoc_width.  */
1954   4,	/* fp_reassoc_width.  */
1955   2,	/* vec_reassoc_width.  */
1956   2,	/* min_div_recip_mul_sf.  */
1957   2,	/* min_div_recip_mul_df.  */
1958   0,	/* max_case_values.  */
1959   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1960   (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE),	/* tune_flags.  */
1961   &ampere1_prefetch_tune
1962 };
1963 
1964 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1965 {
1966   2, /* int_stmt_cost  */
1967   2, /* fp_stmt_cost  */
1968   4, /* ld2_st2_permute_cost */
1969   4, /* ld3_st3_permute_cost  */
1970   5, /* ld4_st4_permute_cost  */
1971   3, /* permute_cost  */
1972   4, /* reduc_i8_cost  */
1973   4, /* reduc_i16_cost  */
1974   2, /* reduc_i32_cost  */
1975   2, /* reduc_i64_cost  */
1976   6, /* reduc_f16_cost  */
1977   3, /* reduc_f32_cost  */
1978   2, /* reduc_f64_cost  */
1979   2, /* store_elt_extra_cost  */
1980   /* This value is just inherited from the Cortex-A57 table.  */
1981   8, /* vec_to_scalar_cost  */
1982   /* This depends very much on what the scalar value is and
1983      where it comes from.  E.g. some constants take two dependent
1984      instructions or a load, while others might be moved from a GPR.
1985      4 seems to be a reasonable compromise in practice.  */
1986   4, /* scalar_to_vec_cost  */
1987   4, /* align_load_cost  */
1988   4, /* unalign_load_cost  */
1989   /* Although stores have a latency of 2 and compete for the
1990      vector pipes, in practice it's better not to model that.  */
1991   1, /* unalign_store_cost  */
1992   1  /* store_cost  */
1993 };
1994 
1995 static const sve_vec_cost neoversev1_sve_vector_cost =
1996 {
1997   {
1998     2, /* int_stmt_cost  */
1999     2, /* fp_stmt_cost  */
2000     4, /* ld2_st2_permute_cost  */
2001     7, /* ld3_st3_permute_cost  */
2002     8, /* ld4_st4_permute_cost  */
2003     3, /* permute_cost  */
2004     /* Theoretically, a reduction involving 31 scalar ADDs could
2005        complete in ~9 cycles and would have a cost of 31.  [SU]ADDV
2006        completes in 14 cycles, so give it a cost of 31 + 5.  */
2007     36, /* reduc_i8_cost  */
2008     /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7.  */
2009     22, /* reduc_i16_cost  */
2010     /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7.  */
2011     14, /* reduc_i32_cost  */
2012     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8.  */
2013     11, /* reduc_i64_cost  */
2014     /* Theoretically, a reduction involving 15 scalar FADDs could
2015        complete in ~9 cycles and would have a cost of 30.  FADDV
2016        completes in 13 cycles, so give it a cost of 30 + 4.  */
2017     34, /* reduc_f16_cost  */
2018     /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5.  */
2019     19, /* reduc_f32_cost  */
2020     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5.  */
2021     11, /* reduc_f64_cost  */
2022     2, /* store_elt_extra_cost  */
2023     /* This value is just inherited from the Cortex-A57 table.  */
2024     8, /* vec_to_scalar_cost  */
2025     /* See the comment above the Advanced SIMD versions.  */
2026     4, /* scalar_to_vec_cost  */
2027     4, /* align_load_cost  */
2028     4, /* unalign_load_cost  */
2029     /* Although stores have a latency of 2 and compete for the
2030        vector pipes, in practice it's better not to model that.  */
2031     1, /* unalign_store_cost  */
2032     1  /* store_cost  */
2033   },
2034   3, /* clast_cost  */
2035   19, /* fadda_f16_cost  */
2036   11, /* fadda_f32_cost  */
2037   8, /* fadda_f64_cost  */
2038   32, /* gather_load_x32_cost  */
2039   16, /* gather_load_x64_cost  */
2040   3 /* scatter_store_elt_cost  */
2041 };
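
/* The SVE reduction costs above all follow the same worked formula:

     reduc_cost = (number of scalar ADDs/FADDs)
                  + (vector-reduction latency - scalar-chain latency)

   For example, reduc_i8_cost: 31 scalar ADDs in ~9 cycles vs. [SU]ADDV
   in 14 cycles gives 31 + (14 - 9) = 36, the value used above.  */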
2042 
2043 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2044 {
2045   3, /* loads_stores_per_cycle  */
2046   2, /* stores_per_cycle  */
2047   4, /* general_ops_per_cycle  */
2048   0, /* fp_simd_load_general_ops  */
2049   1 /* fp_simd_store_general_ops  */
2050 };
2051 
2052 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2053 {
2054   {
2055     3, /* loads_stores_per_cycle  */
2056     2, /* stores_per_cycle  */
2057     4, /* general_ops_per_cycle  */
2058     0, /* fp_simd_load_general_ops  */
2059     1 /* fp_simd_store_general_ops  */
2060   },
2061   2, /* ld2_st2_general_ops  */
2062   2, /* ld3_st3_general_ops  */
2063   3 /* ld4_st4_general_ops  */
2064 };
2065 
2066 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2067 {
2068   {
2069     {
2070       2, /* loads_per_cycle  */
2071       2, /* stores_per_cycle  */
2072       2, /* general_ops_per_cycle  */
2073       0, /* fp_simd_load_general_ops  */
2074       1 /* fp_simd_store_general_ops  */
2075     },
2076     2, /* ld2_st2_general_ops  */
2077     2, /* ld3_st3_general_ops  */
2078     3 /* ld4_st4_general_ops  */
2079   },
2080   1, /* pred_ops_per_cycle  */
2081   2, /* while_pred_ops  */
2082   2, /* int_cmp_pred_ops  */
2083   1, /* fp_cmp_pred_ops  */
2084   1, /* gather_scatter_pair_general_ops  */
2085   1 /* gather_scatter_pair_pred_ops  */
2086 };
2087 
2088 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2089 {
2090   &neoversev1_scalar_issue_info,
2091   &neoversev1_advsimd_issue_info,
2092   &neoversev1_sve_issue_info
2093 };
2094 
2095 /* Neoverse V1 costs for vector insn classes.  */
2096 static const struct cpu_vector_cost neoversev1_vector_cost =
2097 {
2098   1, /* scalar_int_stmt_cost  */
2099   2, /* scalar_fp_stmt_cost  */
2100   4, /* scalar_load_cost  */
2101   1, /* scalar_store_cost  */
2102   1, /* cond_taken_branch_cost  */
2103   1, /* cond_not_taken_branch_cost  */
2104   &neoversev1_advsimd_vector_cost, /* advsimd  */
2105   &neoversev1_sve_vector_cost, /* sve  */
2106   &neoversev1_vec_issue_info /* issue_info  */
2107 };
2108 
2109 static const struct tune_params neoversev1_tunings =
2110 {
2111   &cortexa76_extra_costs,
2112   &neoversev1_addrcost_table,
2113   &neoversev1_regmove_cost,
2114   &neoversev1_vector_cost,
2115   &generic_branch_cost,
2116   &generic_approx_modes,
2117   SVE_256, /* sve_width  */
2118   { 4, /* load_int.  */
2119     2, /* store_int.  */
2120     6, /* load_fp.  */
2121     2, /* store_fp.  */
2122     6, /* load_pred.  */
2123     1 /* store_pred.  */
2124   }, /* memmov_cost.  */
2125   3, /* issue_rate  */
2126   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2127   "32:16",	/* function_align.  */
2128   "4",		/* jump_align.  */
2129   "32:16",	/* loop_align.  */
2130   2,	/* int_reassoc_width.  */
2131   4,	/* fp_reassoc_width.  */
2132   2,	/* vec_reassoc_width.  */
2133   2,	/* min_div_recip_mul_sf.  */
2134   2,	/* min_div_recip_mul_df.  */
2135   0,	/* max_case_values.  */
2136   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2137   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2138    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2139    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2140    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
2141   &generic_prefetch_tune
2142 };
2143 
2144 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2145 {
2146   {
2147     2, /* int_stmt_cost  */
2148     2, /* fp_stmt_cost  */
2149     4, /* ld2_st2_permute_cost  */
2150     5, /* ld3_st3_permute_cost  */
2151     5, /* ld4_st4_permute_cost  */
2152     3, /* permute_cost  */
2153     /* Theoretically, a reduction involving 15 scalar ADDs could
2154        complete in ~5 cycles and would have a cost of 15.  Assume that
2155        [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6.  */
2156     21, /* reduc_i8_cost  */
2157     /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
2158     13, /* reduc_i16_cost  */
2159     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
2160     9, /* reduc_i32_cost  */
2161     /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7.  */
2162     8, /* reduc_i64_cost  */
2163     /* Theoretically, a reduction involving 7 scalar FADDs could
2164        complete in ~6 cycles and would have a cost of 14.  Assume that
2165        FADDV completes in 8 cycles and so give it a cost of 14 + 2.  */
2166     16, /* reduc_f16_cost  */
2167     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
2168     8, /* reduc_f32_cost  */
2169     /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2.  */
2170     4, /* reduc_f64_cost  */
2171     2, /* store_elt_extra_cost  */
2172     /* This value is just inherited from the Cortex-A57 table.  */
2173     8, /* vec_to_scalar_cost  */
2174     /* This depends very much on what the scalar value is and
2175        where it comes from.  E.g. some constants take two dependent
2176        instructions or a load, while others might be moved from a GPR.
2177        4 seems to be a reasonable compromise in practice.  */
2178     4, /* scalar_to_vec_cost  */
2179     4, /* align_load_cost  */
2180     4, /* unalign_load_cost  */
2181     /* Although stores generally have a latency of 2 and compete for the
2182        vector pipes, in practice it's better not to model that.  */
2183     1, /* unalign_store_cost  */
2184     1  /* store_cost  */
2185   },
2186   3, /* clast_cost  */
2187   10, /* fadda_f16_cost  */
2188   6, /* fadda_f32_cost  */
2189   4, /* fadda_f64_cost  */
2190   /* A strided Advanced SIMD x64 load would take two parallel FP loads
2191      (6 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
2192      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
2193      (cost 8) and a vec_construct (cost 2).  Add a full vector operation
2194      (cost 2) to that, to avoid the difference being lost in rounding.
2195 
2196      There is no easy comparison between a strided Advanced SIMD x32 load
2197      and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2198      operation more than a 64-bit gather.  */
2199   14, /* gather_load_x32_cost  */
2200   12, /* gather_load_x64_cost  */
2201   3 /* scatter_store_elt_cost  */
2202 };
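
/* Spelling out the gather-load reasoning above as arithmetic: the strided
   Advanced SIMD x64 load is costed as 2 scalar loads (2 * 4 = 8) plus a
   vec_construct (2), and one full vector operation (2) is added on top,
   giving gather_load_x64_cost = 8 + 2 + 2 = 12.  The 32-bit gather is then
   one vector operation (2) more expensive: 12 + 2 = 14.  */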
2203 
2204 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2205 {
2206   {
2207     {
2208       3, /* loads_per_cycle  */
2209       2, /* stores_per_cycle  */
2210       4, /* general_ops_per_cycle  */
2211       0, /* fp_simd_load_general_ops  */
2212       1 /* fp_simd_store_general_ops  */
2213     },
2214     2, /* ld2_st2_general_ops  */
2215     2, /* ld3_st3_general_ops  */
2216     3 /* ld4_st4_general_ops  */
2217   },
2218   2, /* pred_ops_per_cycle  */
2219   2, /* while_pred_ops  */
2220   2, /* int_cmp_pred_ops  */
2221   1, /* fp_cmp_pred_ops  */
2222   1, /* gather_scatter_pair_general_ops  */
2223   1 /* gather_scatter_pair_pred_ops  */
2224 };
2225 
2226 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2227 {
2228   &neoversev1_scalar_issue_info,
2229   &neoversev1_advsimd_issue_info,
2230   &neoverse512tvb_sve_issue_info
2231 };
2232 
2233 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2234 {
2235   1, /* scalar_int_stmt_cost  */
2236   2, /* scalar_fp_stmt_cost  */
2237   4, /* scalar_load_cost  */
2238   1, /* scalar_store_cost  */
2239   1, /* cond_taken_branch_cost  */
2240   1, /* cond_not_taken_branch_cost  */
2241   &neoversev1_advsimd_vector_cost, /* advsimd  */
2242   &neoverse512tvb_sve_vector_cost, /* sve  */
2243   &neoverse512tvb_vec_issue_info /* issue_info  */
2244 };
2245 
2246 static const struct tune_params neoverse512tvb_tunings =
2247 {
2248   &cortexa76_extra_costs,
2249   &neoversev1_addrcost_table,
2250   &neoversev1_regmove_cost,
2251   &neoverse512tvb_vector_cost,
2252   &generic_branch_cost,
2253   &generic_approx_modes,
2254   SVE_128 | SVE_256, /* sve_width  */
2255   { 4, /* load_int.  */
2256     2, /* store_int.  */
2257     6, /* load_fp.  */
2258     2, /* store_fp.  */
2259     6, /* load_pred.  */
2260     1 /* store_pred.  */
2261   }, /* memmov_cost.  */
2262   3, /* issue_rate  */
2263   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2264   "32:16",	/* function_align.  */
2265   "4",		/* jump_align.  */
2266   "32:16",	/* loop_align.  */
2267   2,	/* int_reassoc_width.  */
2268   4,	/* fp_reassoc_width.  */
2269   2,	/* vec_reassoc_width.  */
2270   2,	/* min_div_recip_mul_sf.  */
2271   2,	/* min_div_recip_mul_df.  */
2272   0,	/* max_case_values.  */
2273   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2274   (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2275    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2276    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
2277   &generic_prefetch_tune
2278 };
2279 
2280 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2281 {
2282   2, /* int_stmt_cost  */
2283   2, /* fp_stmt_cost  */
2284   2, /* ld2_st2_permute_cost */
2285   2, /* ld3_st3_permute_cost  */
2286   3, /* ld4_st4_permute_cost  */
2287   3, /* permute_cost  */
2288   4, /* reduc_i8_cost  */
2289   4, /* reduc_i16_cost  */
2290   2, /* reduc_i32_cost  */
2291   2, /* reduc_i64_cost  */
2292   6, /* reduc_f16_cost  */
2293   4, /* reduc_f32_cost  */
2294   2, /* reduc_f64_cost  */
2295   2, /* store_elt_extra_cost  */
2296   /* This value is just inherited from the Cortex-A57 table.  */
2297   8, /* vec_to_scalar_cost  */
2298   /* This depends very much on what the scalar value is and
2299      where it comes from.  E.g. some constants take two dependent
2300      instructions or a load, while others might be moved from a GPR.
2301      4 seems to be a reasonable compromise in practice.  */
2302   4, /* scalar_to_vec_cost  */
2303   4, /* align_load_cost  */
2304   4, /* unalign_load_cost  */
2305   /* Although stores have a latency of 2 and compete for the
2306      vector pipes, in practice it's better not to model that.  */
2307   1, /* unalign_store_cost  */
2308   1  /* store_cost  */
2309 };
2310 
2311 static const sve_vec_cost neoversen2_sve_vector_cost =
2312 {
2313   {
2314     2, /* int_stmt_cost  */
2315     2, /* fp_stmt_cost  */
2316     3, /* ld2_st2_permute_cost  */
2317     4, /* ld3_st3_permute_cost  */
2318     4, /* ld4_st4_permute_cost  */
2319     3, /* permute_cost  */
2320     /* Theoretically, a reduction involving 15 scalar ADDs could
2321        complete in ~5 cycles and would have a cost of 15.  [SU]ADDV
2322        completes in 11 cycles, so give it a cost of 15 + 6.  */
2323     21, /* reduc_i8_cost  */
2324     /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6.  */
2325     13, /* reduc_i16_cost  */
2326     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6.  */
2327     9, /* reduc_i32_cost  */
2328     /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
2329     2, /* reduc_i64_cost  */
2330     /* Theoretically, a reduction involving 7 scalar FADDs could
2331        complete in ~8 cycles and would have a cost of 14.  FADDV
2332        completes in 6 cycles, so give it a cost of 14 - 2.  */
2333     12, /* reduc_f16_cost  */
2334     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0.  */
2335     6, /* reduc_f32_cost  */
2336     /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0.  */
2337     2, /* reduc_f64_cost  */
2338     2, /* store_elt_extra_cost  */
2339     /* This value is just inherited from the Cortex-A57 table.  */
2340     8, /* vec_to_scalar_cost  */
2341     /* See the comment above the Advanced SIMD versions.  */
2342     4, /* scalar_to_vec_cost  */
2343     4, /* align_load_cost  */
2344     4, /* unalign_load_cost  */
2345     /* Although stores have a latency of 2 and compete for the
2346        vector pipes, in practice it's better not to model that.  */
2347     1, /* unalign_store_cost  */
2348     1  /* store_cost  */
2349   },
2350   3, /* clast_cost  */
2351   10, /* fadda_f16_cost  */
2352   6, /* fadda_f32_cost  */
2353   4, /* fadda_f64_cost  */
2354   /* A strided Advanced SIMD x64 load would take two parallel FP loads
2355      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
2356      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
2357      (cost 8) and a vec_construct (cost 2).  Add a full vector operation
2358      (cost 2) to that, to avoid the difference being lost in rounding.
2359 
2360      There is no easy comparison between a strided Advanced SIMD x32 load
2361      and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2362      operation more than a 64-bit gather.  */
2363   14, /* gather_load_x32_cost  */
2364   12, /* gather_load_x64_cost  */
2365   3 /* scatter_store_elt_cost  */
2366 };
2367 
2368 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2369 {
2370   3, /* loads_stores_per_cycle  */
2371   2, /* stores_per_cycle  */
2372   4, /* general_ops_per_cycle  */
2373   0, /* fp_simd_load_general_ops  */
2374   1 /* fp_simd_store_general_ops  */
2375 };
2376 
2377 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2378 {
2379   {
2380     3, /* loads_stores_per_cycle  */
2381     2, /* stores_per_cycle  */
2382     2, /* general_ops_per_cycle  */
2383     0, /* fp_simd_load_general_ops  */
2384     1 /* fp_simd_store_general_ops  */
2385   },
2386   2, /* ld2_st2_general_ops  */
2387   2, /* ld3_st3_general_ops  */
2388   3 /* ld4_st4_general_ops  */
2389 };
2390 
2391 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2392 {
2393   {
2394     {
2395       3, /* loads_per_cycle  */
2396       2, /* stores_per_cycle  */
2397       2, /* general_ops_per_cycle  */
2398       0, /* fp_simd_load_general_ops  */
2399       1 /* fp_simd_store_general_ops  */
2400     },
2401     2, /* ld2_st2_general_ops  */
2402     3, /* ld3_st3_general_ops  */
2403     3 /* ld4_st4_general_ops  */
2404   },
2405   2, /* pred_ops_per_cycle  */
2406   2, /* while_pred_ops  */
2407   2, /* int_cmp_pred_ops  */
2408   1, /* fp_cmp_pred_ops  */
2409   1, /* gather_scatter_pair_general_ops  */
2410   1 /* gather_scatter_pair_pred_ops  */
2411 };
2412 
2413 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2414 {
2415   &neoversen2_scalar_issue_info,
2416   &neoversen2_advsimd_issue_info,
2417   &neoversen2_sve_issue_info
2418 };
2419 
2420 /* Neoverse N2 costs for vector insn classes.  */
2421 static const struct cpu_vector_cost neoversen2_vector_cost =
2422 {
2423   1, /* scalar_int_stmt_cost  */
2424   2, /* scalar_fp_stmt_cost  */
2425   4, /* scalar_load_cost  */
2426   1, /* scalar_store_cost  */
2427   1, /* cond_taken_branch_cost  */
2428   1, /* cond_not_taken_branch_cost  */
2429   &neoversen2_advsimd_vector_cost, /* advsimd  */
2430   &neoversen2_sve_vector_cost, /* sve  */
2431   &neoversen2_vec_issue_info /* issue_info  */
2432 };
2433 
2434 static const struct tune_params neoversen2_tunings =
2435 {
2436   &cortexa76_extra_costs,
2437   &neoversen2_addrcost_table,
2438   &neoversen2_regmove_cost,
2439   &neoversen2_vector_cost,
2440   &generic_branch_cost,
2441   &generic_approx_modes,
2442   SVE_128, /* sve_width  */
2443   { 4, /* load_int.  */
2444     1, /* store_int.  */
2445     6, /* load_fp.  */
2446     2, /* store_fp.  */
2447     6, /* load_pred.  */
2448     1 /* store_pred.  */
2449   }, /* memmov_cost.  */
2450   3, /* issue_rate  */
2451   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2452   "32:16",	/* function_align.  */
2453   "4",		/* jump_align.  */
2454   "32:16",	/* loop_align.  */
2455   2,	/* int_reassoc_width.  */
2456   4,	/* fp_reassoc_width.  */
2457   2,	/* vec_reassoc_width.  */
2458   2,	/* min_div_recip_mul_sf.  */
2459   2,	/* min_div_recip_mul_df.  */
2460   0,	/* max_case_values.  */
2461   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2462   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2463    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2464    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2465    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
2466   &generic_prefetch_tune
2467 };
2468 
2469 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2470 {
2471   2, /* int_stmt_cost  */
2472   2, /* fp_stmt_cost  */
2473   2, /* ld2_st2_permute_cost */
2474   2, /* ld3_st3_permute_cost  */
2475   3, /* ld4_st4_permute_cost  */
2476   3, /* permute_cost  */
2477   4, /* reduc_i8_cost  */
2478   4, /* reduc_i16_cost  */
2479   2, /* reduc_i32_cost  */
2480   2, /* reduc_i64_cost  */
2481   6, /* reduc_f16_cost  */
2482   3, /* reduc_f32_cost  */
2483   2, /* reduc_f64_cost  */
2484   2, /* store_elt_extra_cost  */
2485   /* This value is just inherited from the Cortex-A57 table.  */
2486   8, /* vec_to_scalar_cost  */
2487   /* This depends very much on what the scalar value is and
2488      where it comes from.  E.g. some constants take two dependent
2489      instructions or a load, while others might be moved from a GPR.
2490      4 seems to be a reasonable compromise in practice.  */
2491   4, /* scalar_to_vec_cost  */
2492   4, /* align_load_cost  */
2493   4, /* unalign_load_cost  */
2494   /* Although stores have a latency of 2 and compete for the
2495      vector pipes, in practice it's better not to model that.  */
2496   1, /* unalign_store_cost  */
2497   1  /* store_cost  */
2498 };
2499 
2500 static const sve_vec_cost neoversev2_sve_vector_cost =
2501 {
2502   {
2503     2, /* int_stmt_cost  */
2504     2, /* fp_stmt_cost  */
2505     3, /* ld2_st2_permute_cost  */
2506     3, /* ld3_st3_permute_cost  */
2507     4, /* ld4_st4_permute_cost  */
2508     3, /* permute_cost  */
2509     /* Theoretically, a reduction involving 15 scalar ADDs could
2510        complete in ~3 cycles and would have a cost of 15.  [SU]ADDV
2511        completes in 11 cycles, so give it a cost of 15 + 8.  */
2512     21, /* reduc_i8_cost  */
2513     /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7.  */
2514     14, /* reduc_i16_cost  */
2515     /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4.  */
2516     7, /* reduc_i32_cost  */
2517     /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1.  */
2518     2, /* reduc_i64_cost  */
2519     /* Theoretically, a reduction involving 7 scalar FADDs could
2520        complete in ~6 cycles and would have a cost of 14.  FADDV
2521        completes in 8 cycles, so give it a cost of 14 + 2.  */
2522     16, /* reduc_f16_cost  */
2523     /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2.  */
2524     8, /* reduc_f32_cost  */
2525     /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2.  */
2526     4, /* reduc_f64_cost  */
2527     2, /* store_elt_extra_cost  */
2528     /* This value is just inherited from the Cortex-A57 table.  */
2529     8, /* vec_to_scalar_cost  */
2530     /* See the comment above the Advanced SIMD versions.  */
2531     4, /* scalar_to_vec_cost  */
2532     4, /* align_load_cost  */
2533     4, /* unalign_load_cost  */
2534     /* Although stores have a latency of 2 and compete for the
2535        vector pipes, in practice it's better not to model that.  */
2536     1, /* unalign_store_cost  */
2537     1  /* store_cost  */
2538   },
2539   3, /* clast_cost  */
2540   10, /* fadda_f16_cost  */
2541   6, /* fadda_f32_cost  */
2542   4, /* fadda_f64_cost  */
2543   /* A strided Advanced SIMD x64 load would take two parallel FP loads
2544      (8 cycles) plus an insertion (2 cycles).  Assume a 64-bit SVE gather
2545      is 1 cycle more.  The Advanced SIMD version is costed as 2 scalar loads
2546      (cost 8) and a vec_construct (cost 2).  Add a full vector operation
2547      (cost 2) to that, to avoid the difference being lost in rounding.
2548 
2549      There is no easy comparison between a strided Advanced SIMD x32 load
2550      and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2551      operation more than a 64-bit gather.  */
2552   14, /* gather_load_x32_cost  */
2553   12, /* gather_load_x64_cost  */
2554   3 /* scatter_store_elt_cost  */
2555 };
2556 
2557 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2558 {
2559   3, /* loads_stores_per_cycle  */
2560   2, /* stores_per_cycle  */
2561   6, /* general_ops_per_cycle  */
2562   0, /* fp_simd_load_general_ops  */
2563   1 /* fp_simd_store_general_ops  */
2564 };
2565 
2566 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2567 {
2568   {
2569     3, /* loads_stores_per_cycle  */
2570     2, /* stores_per_cycle  */
2571     4, /* general_ops_per_cycle  */
2572     0, /* fp_simd_load_general_ops  */
2573     1 /* fp_simd_store_general_ops  */
2574   },
2575   2, /* ld2_st2_general_ops  */
2576   2, /* ld3_st3_general_ops  */
2577   3 /* ld4_st4_general_ops  */
2578 };
2579 
2580 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2581 {
2582   {
2583     {
2584       3, /* loads_per_cycle  */
2585       2, /* stores_per_cycle  */
2586       4, /* general_ops_per_cycle  */
2587       0, /* fp_simd_load_general_ops  */
2588       1 /* fp_simd_store_general_ops  */
2589     },
2590     2, /* ld2_st2_general_ops  */
2591     3, /* ld3_st3_general_ops  */
2592     3 /* ld4_st4_general_ops  */
2593   },
2594   2, /* pred_ops_per_cycle  */
2595   2, /* while_pred_ops  */
2596   2, /* int_cmp_pred_ops  */
2597   1, /* fp_cmp_pred_ops  */
2598   1, /* gather_scatter_pair_general_ops  */
2599   1 /* gather_scatter_pair_pred_ops  */
2600 };
2601 
2602 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2603 {
2604   &neoversev2_scalar_issue_info,
2605   &neoversev2_advsimd_issue_info,
2606   &neoversev2_sve_issue_info
2607 };
2608 
2609 /* Neoverse V2 (Demeter) costs for vector insn classes.  */
2610 static const struct cpu_vector_cost neoversev2_vector_cost =
2611 {
2612   1, /* scalar_int_stmt_cost  */
2613   2, /* scalar_fp_stmt_cost  */
2614   4, /* scalar_load_cost  */
2615   1, /* scalar_store_cost  */
2616   1, /* cond_taken_branch_cost  */
2617   1, /* cond_not_taken_branch_cost  */
2618   &neoversev2_advsimd_vector_cost, /* advsimd  */
2619   &neoversev2_sve_vector_cost, /* sve  */
2620   &neoversev2_vec_issue_info /* issue_info  */
2621 };
2622 
2623 static const struct tune_params neoversev2_tunings =
2624 {
2625   &cortexa76_extra_costs,
2626   &neoversev2_addrcost_table,
2627   &neoversev2_regmove_cost,
2628   &neoversev2_vector_cost,
2629   &generic_branch_cost,
2630   &generic_approx_modes,
2631   SVE_128, /* sve_width  */
2632   { 4, /* load_int.  */
2633     2, /* store_int.  */
2634     6, /* load_fp.  */
2635     1, /* store_fp.  */
2636     6, /* load_pred.  */
2637     2 /* store_pred.  */
2638   }, /* memmov_cost.  */
2639   5, /* issue_rate  */
2640   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2641   "32:16",	/* function_align.  */
2642   "4",		/* jump_align.  */
2643   "32:16",	/* loop_align.  */
2644   3,	/* int_reassoc_width.  */
2645   6,	/* fp_reassoc_width.  */
2646   3,	/* vec_reassoc_width.  */
2647   2,	/* min_div_recip_mul_sf.  */
2648   2,	/* min_div_recip_mul_df.  */
2649   0,	/* max_case_values.  */
2650   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2651   (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2652    | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2653    | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2654    | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT),	/* tune_flags.  */
2655   &generic_prefetch_tune
2656 };
2657 
2658 static const struct tune_params a64fx_tunings =
2659 {
2660   &a64fx_extra_costs,
2661   &a64fx_addrcost_table,
2662   &a64fx_regmove_cost,
2663   &a64fx_vector_cost,
2664   &generic_branch_cost,
2665   &generic_approx_modes,
2666   SVE_512, /* sve_width  */
2667   { 4, /* load_int.  */
2668     4, /* store_int.  */
2669     4, /* load_fp.  */
2670     4, /* store_fp.  */
2671     4, /* load_pred.  */
2672     4 /* store_pred.  */
2673   }, /* memmov_cost.  */
2674   7, /* issue_rate  */
2675   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops  */
2676   "32",	/* function_align.  */
2677   "16",	/* jump_align.  */
2678   "32",	/* loop_align.  */
2679   4,	/* int_reassoc_width.  */
2680   2,	/* fp_reassoc_width.  */
2681   2,	/* vec_reassoc_width.  */
2682   2,	/* min_div_recip_mul_sf.  */
2683   2,	/* min_div_recip_mul_df.  */
2684   0,	/* max_case_values.  */
2685   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
2686   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
2687   &a64fx_prefetch_tune
2688 };
2689 
2690 /* Support for fine-grained override of the tuning structures.  */
2691 struct aarch64_tuning_override_function
2692 {
2693   const char* name;
2694   void (*parse_override)(const char*, struct tune_params*);
2695 };
2696 
2697 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2698 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2699 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2700 
2701 static const struct aarch64_tuning_override_function
2702 aarch64_tuning_override_functions[] =
2703 {
2704   { "fuse", aarch64_parse_fuse_string },
2705   { "tune", aarch64_parse_tune_string },
2706   { "sve_width", aarch64_parse_sve_width_string },
2707   { NULL, NULL }
2708 };
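
/* Illustrative sketch of how these overrides are reached (the option
   spellings below are an assumption rather than taken from this file):
   the -moverride string is split into name=value pairs, so for example

     -moverride=sve_width=256
     -moverride=tune=cheap_shift_extend

   would be dispatched to aarch64_parse_sve_width_string and
   aarch64_parse_tune_string through the table above.  */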
2709 
2710 /* A processor implementing AArch64.  */
2711 struct processor
2712 {
2713   const char *const name;
2714   enum aarch64_processor ident;
2715   enum aarch64_processor sched_core;
2716   enum aarch64_arch arch;
2717   unsigned architecture_version;
2718   const uint64_t flags;
2719   const struct tune_params *const tune;
2720 };
2721 
2722 /* Architectures implementing AArch64.  */
2723 static const struct processor all_architectures[] =
2724 {
2725 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
2726   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
2727 #include "aarch64-arches.def"
2728   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
2729 };
2730 
2731 /* Processor cores implementing AArch64.  */
2732 static const struct processor all_cores[] =
2733 {
2734 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
2735   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
2736   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
2737   FLAGS, &COSTS##_tunings},
2738 #include "aarch64-cores.def"
2739   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
2740     AARCH64_FL_FOR_ARCH8, &generic_tunings},
2741   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
2742 };
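
/* As a worked example of the AARCH64_CORE expansion above, a hypothetical
   entry (not taken from aarch64-cores.def)

     AARCH64_CORE ("example-cpu", examplecpu, cortexa57, 8_2A,
                   AARCH64_FL_FOR_ARCH8_2, examplecpu, 0x00, 0x000, -1)

   would produce the initializer

     {"example-cpu", examplecpu, cortexa57, AARCH64_ARCH_8_2A,
      all_architectures[AARCH64_ARCH_8_2A].architecture_version,
      AARCH64_FL_FOR_ARCH8_2, &examplecpu_tunings},

   the IMP/PART/VARIANT fields are used elsewhere (e.g. by -mcpu=native
   detection) and are dropped here.  */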
2743 
2744 
2745 /* Target specification.  These are populated by the -march, -mtune, -mcpu
2746    handling code or by target attributes.  */
2747 static const struct processor *selected_arch;
2748 static const struct processor *selected_cpu;
2749 static const struct processor *selected_tune;
2750 
2751 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
2752 
2753 /* The current tuning set.  */
2754 struct tune_params aarch64_tune_params = generic_tunings;
2755 
2756 /* Check whether an 'aarch64_vector_pcs' attribute is valid.  */
2757 
2758 static tree
2759 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2760 				     int, bool *no_add_attrs)
2761 {
2762   /* Since we set fn_type_req to true, the caller should have checked
2763      this for us.  */
2764   gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2765   switch ((arm_pcs) fntype_abi (*node).id ())
2766     {
2767     case ARM_PCS_AAPCS64:
2768     case ARM_PCS_SIMD:
2769       return NULL_TREE;
2770 
2771     case ARM_PCS_SVE:
2772       error ("the %qE attribute cannot be applied to an SVE function type",
2773 	     name);
2774       *no_add_attrs = true;
2775       return NULL_TREE;
2776 
2777     case ARM_PCS_TLSDESC:
2778     case ARM_PCS_UNKNOWN:
2779       break;
2780     }
2781   gcc_unreachable ();
2782 }
2783 
2784 /* Table of machine attributes.  */
2785 static const struct attribute_spec aarch64_attribute_table[] =
2786 {
2787   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2788        affects_type_identity, handler, exclude } */
2789   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,
2790 			  handle_aarch64_vector_pcs_attribute, NULL },
2791   { "arm_sve_vector_bits", 1, 1, false, true,  false, true,
2792 			  aarch64_sve::handle_arm_sve_vector_bits_attribute,
2793 			  NULL },
2794   { "Advanced SIMD type", 1, 1, false, true,  false, true,  NULL, NULL },
2795   { "SVE type",		  3, 3, false, true,  false, true,  NULL, NULL },
2796   { "SVE sizeless type",  0, 0, false, true,  false, true,  NULL, NULL },
2797   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
2798 };
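
/* Illustrative uses of the user-visible attributes above (a sketch,
   assuming the usual ACLE spellings rather than anything in this file):

     void vec_fn (void) __attribute__ ((aarch64_vector_pcs));

     #if __ARM_FEATURE_SVE_BITS == 256
     typedef svint32_t fixed_int32_t
         __attribute__ ((arm_sve_vector_bits (256)));
     #endif

   The "Advanced SIMD type", "SVE type" and "SVE sizeless type" entries are
   internal markers attached by the compiler itself; their names contain
   spaces, so they cannot be written in user code.  */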
2799 
2800 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
2801 
2802 /* An ISA extension in the co-processor and main instruction set space.  */
2803 struct aarch64_option_extension
2804 {
2805   const char *const name;
2806   const unsigned long flags_on;
2807   const unsigned long flags_off;
2808 };
2809 
2810 typedef enum aarch64_cond_code
2811 {
2812   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2813   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2814   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2815 }
2816 aarch64_cc;
2817 
2818 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
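
/* The XOR with 1 works because the enumeration above lists each condition
   next to its inverse, e.g. AARCH64_EQ (0) ^ 1 == AARCH64_NE (1),
   AARCH64_GE (10) ^ 1 == AARCH64_LT (11) and
   AARCH64_GT (12) ^ 1 == AARCH64_LE (13).  */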
2819 
2820 struct aarch64_branch_protect_type
2821 {
2822   /* The type's name that the user passes to the branch-protection option
2823     string.  */
2824   const char* name;
2825   /* Function to handle the protection type and set global variables.
2826     First argument is the string token corresponding with this type and the
2827     second argument is the next token in the option string.
2828     Return values:
2829     * AARCH64_PARSE_OK: Handling was successful.
2830     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and
2831       the caller should print an error.
2832     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
2833       prints its own error.  */
2834   enum aarch64_parse_opt_result (*handler)(char*, char*);
2835   /* A list of types that can follow this type in the option string.  */
2836   const aarch64_branch_protect_type* subtypes;
2837   unsigned int num_subtypes;
2838 };
2839 
2840 static enum aarch64_parse_opt_result
2841 aarch64_handle_no_branch_protection (char* str, char* rest)
2842 {
2843   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2844   aarch64_enable_bti = 0;
2845   if (rest)
2846     {
2847       error ("unexpected %<%s%> after %<%s%>", rest, str);
2848       return AARCH64_PARSE_INVALID_FEATURE;
2849     }
2850   return AARCH64_PARSE_OK;
2851 }
2852 
2853 static enum aarch64_parse_opt_result
2854 aarch64_handle_standard_branch_protection (char* str, char* rest)
2855 {
2856   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2857   aarch64_ra_sign_key = AARCH64_KEY_A;
2858   aarch64_enable_bti = 1;
2859   if (rest)
2860     {
2861       error ("unexpected %<%s%> after %<%s%>", rest, str);
2862       return AARCH64_PARSE_INVALID_FEATURE;
2863     }
2864   return AARCH64_PARSE_OK;
2865 }
2866 
2867 static enum aarch64_parse_opt_result
2868 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2869 				    char* rest ATTRIBUTE_UNUSED)
2870 {
2871   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2872   aarch64_ra_sign_key = AARCH64_KEY_A;
2873   return AARCH64_PARSE_OK;
2874 }
2875 
2876 static enum aarch64_parse_opt_result
2877 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2878 			      char* rest ATTRIBUTE_UNUSED)
2879 {
2880   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2881   return AARCH64_PARSE_OK;
2882 }
2883 
2884 static enum aarch64_parse_opt_result
2885 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2886 			      char* rest ATTRIBUTE_UNUSED)
2887 {
2888   aarch64_ra_sign_key = AARCH64_KEY_B;
2889   return AARCH64_PARSE_OK;
2890 }
2891 
2892 static enum aarch64_parse_opt_result
2893 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2894 				    char* rest ATTRIBUTE_UNUSED)
2895 {
2896   aarch64_enable_bti = 1;
2897   return AARCH64_PARSE_OK;
2898 }
2899 
2900 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2901   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2902   { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2903   { NULL, NULL, NULL, 0 }
2904 };
2905 
2906 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2907   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2908   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2909   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2910     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2911   { "bti", aarch64_handle_bti_protection, NULL, 0 },
2912   { NULL, NULL, NULL, 0 }
2913 };
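
/* Illustrative -mbranch-protection strings accepted by the tables above
   (the combinations shown are an assumption based on the tables, not an
   exhaustive list):

     -mbranch-protection=none
     -mbranch-protection=standard
     -mbranch-protection=bti
     -mbranch-protection=pac-ret+leaf
     -mbranch-protection=pac-ret+b-key+bti

   "pac-ret" is the only type with subtypes ("leaf" and "b-key"), which
   follow it in the option string.  */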
2914 
2915 /* The condition codes of the processor, and the inverse function.  */
2916 static const char * const aarch64_condition_codes[] =
2917 {
2918   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2919   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2920 };
2921 
2922 /* The preferred condition codes for SVE conditions.  */
2923 static const char *const aarch64_sve_condition_codes[] =
2924 {
2925   "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2926   "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2927 };
2928 
2929 /* Return the assembly token for svpattern value PATTERN.  */
2930 
2931 static const char *
2932 svpattern_token (enum aarch64_svpattern pattern)
2933 {
2934   switch (pattern)
2935     {
2936 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2937     AARCH64_FOR_SVPATTERN (CASE)
2938 #undef CASE
2939     case AARCH64_NUM_SVPATTERNS:
2940       break;
2941     }
2942   gcc_unreachable ();
2943 }
2944 
2945 /* Return the location of a piece that is known to be passed or returned
2946    in registers.  FIRST_ZR is the first unused vector argument register
2947    and FIRST_PR is the first unused predicate argument register.  */
2948 
2949 rtx
2950 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2951 					 unsigned int first_pr) const
2952 {
2953   gcc_assert (VECTOR_MODE_P (mode)
2954 	      && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2955 	      && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2956 
2957   if (num_zr > 0 && num_pr == 0)
2958     return gen_rtx_REG (mode, first_zr);
2959 
2960   if (num_zr == 0 && num_pr == 1)
2961     return gen_rtx_REG (mode, first_pr);
2962 
2963   gcc_unreachable ();
2964 }
2965 
2966 /* Return the total number of vector registers required by the PST.  */
2967 
2968 unsigned int
2969 pure_scalable_type_info::num_zr () const
2970 {
2971   unsigned int res = 0;
2972   for (unsigned int i = 0; i < pieces.length (); ++i)
2973     res += pieces[i].num_zr;
2974   return res;
2975 }
2976 
2977 /* Return the total number of predicate registers required by the PST.  */
2978 
2979 unsigned int
2980 pure_scalable_type_info::num_pr () const
2981 {
2982   unsigned int res = 0;
2983   for (unsigned int i = 0; i < pieces.length (); ++i)
2984     res += pieces[i].num_pr;
2985   return res;
2986 }
2987 
2988 /* Return the location of a PST that is known to be passed or returned
2989    in registers.  FIRST_ZR is the first unused vector argument register
2990    and FIRST_PR is the first unused predicate argument register.  */
2991 
2992 rtx
2993 pure_scalable_type_info::get_rtx (machine_mode mode,
2994 				  unsigned int first_zr,
2995 				  unsigned int first_pr) const
2996 {
2997   /* Try to return a single REG if possible.  This leads to better
2998      code generation; it isn't required for correctness.  */
2999   if (mode == pieces[0].mode)
3000     {
3001       gcc_assert (pieces.length () == 1);
3002       return pieces[0].get_rtx (first_zr, first_pr);
3003     }
3004 
3005   /* Build up a PARALLEL that contains the individual pieces.  */
3006   rtvec rtxes = rtvec_alloc (pieces.length ());
3007   for (unsigned int i = 0; i < pieces.length (); ++i)
3008     {
3009       rtx reg = pieces[i].get_rtx (first_zr, first_pr);
3010       rtx offset = gen_int_mode (pieces[i].offset, Pmode);
3011       RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
3012       first_zr += pieces[i].num_zr;
3013       first_pr += pieces[i].num_pr;
3014     }
3015   return gen_rtx_PARALLEL (mode, rtxes);
3016 }
3017 
3018 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
3019    in the AAPCS64.  */
3020 
3021 pure_scalable_type_info::analysis_result
3022 pure_scalable_type_info::analyze (const_tree type)
3023 {
3024   /* Prevent accidental reuse.  */
3025   gcc_assert (pieces.is_empty ());
3026 
3027   /* No code will be generated for erroneous types, so we won't establish
3028      an ABI mapping.  */
3029   if (type == error_mark_node)
3030     return NO_ABI_IDENTITY;
3031 
3032   /* Zero-sized types disappear in the language->ABI mapping.  */
3033   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3034     return NO_ABI_IDENTITY;
3035 
3036   /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs.  */
3037   piece p = {};
3038   if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
3039     {
3040       machine_mode mode = TYPE_MODE_RAW (type);
3041       gcc_assert (VECTOR_MODE_P (mode)
3042 		  && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
3043 
3044       p.mode = p.orig_mode = mode;
3045       add_piece (p);
3046       return IS_PST;
3047     }
3048 
3049   /* Check for user-defined PSTs.  */
3050   if (TREE_CODE (type) == ARRAY_TYPE)
3051     return analyze_array (type);
3052   if (TREE_CODE (type) == RECORD_TYPE)
3053     return analyze_record (type);
3054 
3055   return ISNT_PST;
3056 }
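
/* A small illustration of the classification above, assuming the usual
   arm_sve.h ACLE type names (which are not defined in this file):

     svfloat32_t    -> IS_PST, one piece using one Z register
     svbool_t       -> IS_PST, one piece using one P register
     svfloat32x3_t  -> IS_PST, one piece spanning three Z registers
     int            -> ISNT_PST (has an ABI identity, but is not scalable)

   Arrays and records built purely from such pieces are handled by
   analyze_array and analyze_record below.  */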
3057 
3058 /* Analyze a type that is known not to be passed or returned in memory.
3059    Return true if it has an ABI identity and is a Pure Scalable Type.  */
3060 
3061 bool
3062 pure_scalable_type_info::analyze_registers (const_tree type)
3063 {
3064   analysis_result result = analyze (type);
3065   gcc_assert (result != DOESNT_MATTER);
3066   return result == IS_PST;
3067 }
3068 
3069 /* Subroutine of analyze for handling ARRAY_TYPEs.  */
3070 
3071 pure_scalable_type_info::analysis_result
3072 pure_scalable_type_info::analyze_array (const_tree type)
3073 {
3074   /* Analyze the element type.  */
3075   pure_scalable_type_info element_info;
3076   analysis_result result = element_info.analyze (TREE_TYPE (type));
3077   if (result != IS_PST)
3078     return result;
3079 
3080   /* An array of unknown, flexible or variable length will be passed and
3081      returned by reference whatever we do.  */
3082   tree nelts_minus_one = array_type_nelts (type);
3083   if (!tree_fits_uhwi_p (nelts_minus_one))
3084     return DOESNT_MATTER;
3085 
3086   /* Likewise if the array is constant-sized but too big to be interesting.
3087      The double checks against MAX_PIECES are to protect against overflow.  */
3088   unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
3089   if (count > MAX_PIECES)
3090     return DOESNT_MATTER;
3091   count += 1;
3092   if (count * element_info.pieces.length () > MAX_PIECES)
3093     return DOESNT_MATTER;
3094 
3095   /* The above checks should have weeded out elements of unknown size.  */
3096   poly_uint64 element_bytes;
3097   if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3098     gcc_unreachable ();
3099 
3100   /* Build up the list of individual vectors and predicates.  */
3101   gcc_assert (!element_info.pieces.is_empty ());
3102   for (unsigned int i = 0; i < count; ++i)
3103     for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3104       {
3105 	piece p = element_info.pieces[j];
3106 	p.offset += i * element_bytes;
3107 	add_piece (p);
3108       }
3109   return IS_PST;
3110 }
3111 
3112 /* Subroutine of analyze for handling RECORD_TYPEs.  */
3113 
3114 pure_scalable_type_info::analysis_result
3115 pure_scalable_type_info::analyze_record (const_tree type)
3116 {
3117   for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3118     {
3119       if (TREE_CODE (field) != FIELD_DECL)
3120 	continue;
3121 
3122       /* Zero-sized fields disappear in the language->ABI mapping.  */
3123       if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3124 	continue;
3125 
3126       /* All fields with an ABI identity must be PSTs for the record as
3127 	 a whole to be a PST.  If any individual field is too big to be
3128 	 interesting then the record is too.  */
3129       pure_scalable_type_info field_info;
3130       analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3131       if (subresult == NO_ABI_IDENTITY)
3132 	continue;
3133       if (subresult != IS_PST)
3134 	return subresult;
3135 
3136       /* Since all previous fields are PSTs, we ought to be able to track
3137 	 the field offset using poly_ints.  */
3138       tree bitpos = bit_position (field);
3139       gcc_assert (poly_int_tree_p (bitpos));
3140 
3141       /* For the same reason, it shouldn't be possible to create a PST field
3142 	 whose offset isn't byte-aligned.  */
3143       poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3144 						BITS_PER_UNIT);
3145 
3146       /* Punt if the record is too big to be interesting.  */
3147       poly_uint64 bytepos;
3148       if (!wide_bytepos.to_uhwi (&bytepos)
3149 	  || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3150 	return DOESNT_MATTER;
3151 
3152       /* Add the individual vectors and predicates in the field to the
3153 	 record's list.  */
3154       gcc_assert (!field_info.pieces.is_empty ());
3155       for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3156 	{
3157 	  piece p = field_info.pieces[i];
3158 	  p.offset += bytepos;
3159 	  add_piece (p);
3160 	}
3161     }
3162   /* Empty structures disappear in the language->ABI mapping.  */
3163   return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3164 }
3165 
3166 /* Add P to the list of pieces in the type.  */
3167 
3168 void
3169 pure_scalable_type_info::add_piece (const piece &p)
3170 {
3171   /* Try to fold the new piece into the previous one to form a
3172      single-mode PST.  For example, if we see three consecutive vectors
3173      of the same mode, we can represent them using the corresponding
3174      3-tuple mode.
3175 
3176      This is purely an optimization.  */
3177   if (!pieces.is_empty ())
3178     {
3179       piece &prev = pieces.last ();
3180       gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3181       unsigned int nelems1, nelems2;
3182       if (prev.orig_mode == p.orig_mode
3183 	  && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3184 	  && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3185 				  GET_MODE_NUNITS (p.orig_mode), &nelems1)
3186 	  && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3187 				  GET_MODE_NUNITS (p.orig_mode), &nelems2)
3188 	  && targetm.array_mode (p.orig_mode,
3189 				 nelems1 + nelems2).exists (&prev.mode))
3190 	{
3191 	  prev.num_zr += p.num_zr;
3192 	  prev.num_pr += p.num_pr;
3193 	  return;
3194 	}
3195     }
3196   pieces.quick_push (p);
3197 }
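
/* As a concrete instance of the folding above (the mode names are an
   assumption for illustration): two consecutive pieces whose original mode
   is the single-vector mode VNx4SF, at adjacent offsets, can be merged
   into one piece whose mode is the corresponding 2-vector array mode
   (VNx8SF), provided targetm.array_mode supplies one.  */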
3198 
3199 /* Return true if at least one possible value of type TYPE includes at
3200    least one object of Pure Scalable Type, in the sense of the AAPCS64.
3201 
3202    This is a relatively expensive test for some types, so it should
3203    generally be made as late as possible.  */
3204 
3205 static bool
3206 aarch64_some_values_include_pst_objects_p (const_tree type)
3207 {
3208   if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3209     return false;
3210 
3211   if (aarch64_sve::builtin_type_p (type))
3212     return true;
3213 
3214   if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3215     return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3216 
3217   if (RECORD_OR_UNION_TYPE_P (type))
3218     for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3219       if (TREE_CODE (field) == FIELD_DECL
3220 	  && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3221 	return true;
3222 
3223   return false;
3224 }
3225 
3226 /* Return the descriptor of the SIMD ABI.  */
3227 
3228 static const predefined_function_abi &
3229 aarch64_simd_abi (void)
3230 {
3231   predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3232   if (!simd_abi.initialized_p ())
3233     {
3234       HARD_REG_SET full_reg_clobbers
3235 	= default_function_abi.full_reg_clobbers ();
3236       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3237 	if (FP_SIMD_SAVED_REGNUM_P (regno))
3238 	  CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3239       simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3240     }
3241   return simd_abi;
3242 }
3243 
3244 /* Return the descriptor of the SVE PCS.  */
3245 
3246 static const predefined_function_abi &
3247 aarch64_sve_abi (void)
3248 {
3249   predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3250   if (!sve_abi.initialized_p ())
3251     {
3252       HARD_REG_SET full_reg_clobbers
3253 	= default_function_abi.full_reg_clobbers ();
3254       for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3255 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3256       for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3257 	CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3258       sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3259     }
3260   return sve_abi;
3261 }
3262 
3263 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3264    wraps, otherwise return X itself.  */
3265 
3266 static rtx
3267 strip_salt (rtx x)
3268 {
3269   rtx search = x;
3270   if (GET_CODE (search) == CONST)
3271     search = XEXP (search, 0);
3272   if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3273     x = XVECEXP (search, 0, 0);
3274   return x;
3275 }
3276 
3277 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3278    expression.  */
3279 
3280 static rtx
3281 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3282 {
3283   return strip_salt (strip_offset (addr, offset));
3284 }
3285 
3286 /* Generate code to enable conditional branches in functions over 1 MiB.  */
3287 const char *
3288 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3289 			const char * branch_format)
3290 {
3291     rtx_code_label * tmp_label = gen_label_rtx ();
3292     char label_buf[256];
3293     char buffer[128];
3294     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3295 				 CODE_LABEL_NUMBER (tmp_label));
3296     const char *label_ptr = targetm.strip_name_encoding (label_buf);
3297     rtx dest_label = operands[pos_label];
3298     operands[pos_label] = tmp_label;
3299 
3300     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3301     output_asm_insn (buffer, operands);
3302 
3303     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3304     operands[pos_label] = dest_label;
3305     output_asm_insn (buffer, operands);
3306     return "";
3307 }
3308 
3309 void
3310 aarch64_err_no_fpadvsimd (machine_mode mode)
3311 {
3312   if (TARGET_GENERAL_REGS_ONLY)
3313     if (FLOAT_MODE_P (mode))
3314       error ("%qs is incompatible with the use of floating-point types",
3315 	     "-mgeneral-regs-only");
3316     else
3317       error ("%qs is incompatible with the use of vector types",
3318 	     "-mgeneral-regs-only");
3319   else
3320     if (FLOAT_MODE_P (mode))
3321       error ("%qs feature modifier is incompatible with the use of"
3322 	     " floating-point types", "+nofp");
3323     else
3324       error ("%qs feature modifier is incompatible with the use of"
3325 	     " vector types", "+nofp");
3326 }
3327 
3328 /* Report when we try to do something that requires SVE when SVE is disabled.
3329    This is an error of last resort and isn't very high-quality.  It usually
3330    involves attempts to measure the vector length in some way.  */
3331 static void
3332 aarch64_report_sve_required (void)
3333 {
3334   static bool reported_p = false;
3335 
3336   /* Avoid reporting a slew of messages for a single oversight.  */
3337   if (reported_p)
3338     return;
3339 
3340   error ("this operation requires the SVE ISA extension");
3341   inform (input_location, "you can enable SVE using the command-line"
3342 	  " option %<-march%>, or by using the %<target%>"
3343 	  " attribute or pragma");
3344   reported_p = true;
3345 }
3346 
3347 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3348    registers.  */
3349 inline bool
3350 pr_or_ffr_regnum_p (unsigned int regno)
3351 {
3352   return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3353 }
3354 
3355 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3356    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3357    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3358    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3359    and GENERAL_REGS is lower than the memory cost (in this case the best class
3360    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
3361    cost results in bad allocations with many redundant int<->FP moves which
3362    are expensive on various cores.
3363    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3364    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
3365    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
3366    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
3367    The result of this is that it is no longer inefficient to have a higher
3368    memory move cost than the register move cost.
3369 */
3370 
3371 static reg_class_t
3372 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3373 					 reg_class_t best_class)
3374 {
3375   machine_mode mode;
3376 
3377   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3378       || !reg_class_subset_p (FP_REGS, allocno_class))
3379     return allocno_class;
3380 
3381   if (!reg_class_subset_p (GENERAL_REGS, best_class)
3382       || !reg_class_subset_p (FP_REGS, best_class))
3383     return best_class;
3384 
3385   mode = PSEUDO_REGNO_MODE (regno);
3386   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3387 }
3388 
3389 static unsigned int
3390 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3391 {
3392   if (GET_MODE_UNIT_SIZE (mode) == 4)
3393     return aarch64_tune_params.min_div_recip_mul_sf;
3394   return aarch64_tune_params.min_div_recip_mul_df;
3395 }
3396 
3397 /* Return the reassociation width of treeop OPC with mode MODE.  */
3398 static int
3399 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3400 {
3401   if (VECTOR_MODE_P (mode))
3402     return aarch64_tune_params.vec_reassoc_width;
3403   if (INTEGRAL_MODE_P (mode))
3404     return aarch64_tune_params.int_reassoc_width;
3405   /* Avoid reassociating floating point addition so we emit more FMAs.  */
3406   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
3407     return aarch64_tune_params.fp_reassoc_width;
3408   return 1;
3409 }
3410 
3411 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
3412 unsigned
3413 aarch64_dbx_register_number (unsigned regno)
3414 {
3415    if (GP_REGNUM_P (regno))
3416      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3417    else if (regno == SP_REGNUM)
3418      return AARCH64_DWARF_SP;
3419    else if (FP_REGNUM_P (regno))
3420      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3421    else if (PR_REGNUM_P (regno))
3422      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3423    else if (regno == VG_REGNUM)
3424      return AARCH64_DWARF_VG;
3425 
3426    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3427       equivalent DWARF register.  */
3428    return DWARF_FRAME_REGISTERS;
3429 }
3430 
3431 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3432    integer, otherwise return X unmodified.  */
3433 static rtx
3434 aarch64_bit_representation (rtx x)
3435 {
3436   if (CONST_DOUBLE_P (x))
3437     x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3438   return x;
3439 }
3440 
3441 /* Return an estimate for the number of quadwords in an SVE vector.  This is
3442    equivalent to the number of Advanced SIMD vectors in an SVE vector.  */
3443 static unsigned int
3444 aarch64_estimated_sve_vq ()
3445 {
3446   return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3447 }
3448 
3449 /* Return true if MODE is an SVE predicate mode.  */
3450 static bool
3451 aarch64_sve_pred_mode_p (machine_mode mode)
3452 {
3453   return (TARGET_SVE
3454 	  && (mode == VNx16BImode
3455 	      || mode == VNx8BImode
3456 	      || mode == VNx4BImode
3457 	      || mode == VNx2BImode));
3458 }
3459 
3460 /* Three mutually-exclusive flags describing a vector or predicate type.  */
3461 const unsigned int VEC_ADVSIMD  = 1;
3462 const unsigned int VEC_SVE_DATA = 2;
3463 const unsigned int VEC_SVE_PRED = 4;
3464 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3465    a structure of 2, 3 or 4 vectors.  */
3466 const unsigned int VEC_STRUCT   = 8;
3467 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3468    vector has fewer significant bytes than a full SVE vector.  */
3469 const unsigned int VEC_PARTIAL  = 16;
3470 /* Useful combinations of the above.  */
3471 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
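
/* For example, V4SImode classifies as VEC_ADVSIMD, VNx4SImode as
   VEC_SVE_DATA, VNx2SImode as VEC_SVE_DATA | VEC_PARTIAL (32-bit elements
   in 64-bit containers), VNx8SImode as VEC_SVE_DATA | VEC_STRUCT (an SVE
   2-tuple) and VNx4BImode as VEC_SVE_PRED, assuming the relevant target
   features are enabled.  */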
3472 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3473 
3474 /* Return a set of flags describing the vector properties of mode MODE.
3475    Ignore modes that are not supported by the current target.  */
3476 static unsigned int
3477 aarch64_classify_vector_mode (machine_mode mode)
3478 {
3479   if (aarch64_sve_pred_mode_p (mode))
3480     return VEC_SVE_PRED;
3481 
3482   /* Make the decision based on the mode's enum value rather than its
3483      properties, so that we keep the correct classification regardless
3484      of -msve-vector-bits.  */
3485   switch (mode)
3486     {
3487     /* Partial SVE QI vectors.  */
3488     case E_VNx2QImode:
3489     case E_VNx4QImode:
3490     case E_VNx8QImode:
3491     /* Partial SVE HI vectors.  */
3492     case E_VNx2HImode:
3493     case E_VNx4HImode:
3494     /* Partial SVE SI vector.  */
3495     case E_VNx2SImode:
3496     /* Partial SVE HF vectors.  */
3497     case E_VNx2HFmode:
3498     case E_VNx4HFmode:
3499     /* Partial SVE BF vectors.  */
3500     case E_VNx2BFmode:
3501     case E_VNx4BFmode:
3502     /* Partial SVE SF vector.  */
3503     case E_VNx2SFmode:
3504       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3505 
3506     case E_VNx16QImode:
3507     case E_VNx8HImode:
3508     case E_VNx4SImode:
3509     case E_VNx2DImode:
3510     case E_VNx8BFmode:
3511     case E_VNx8HFmode:
3512     case E_VNx4SFmode:
3513     case E_VNx2DFmode:
3514       return TARGET_SVE ? VEC_SVE_DATA : 0;
3515 
3516     /* x2 SVE vectors.  */
3517     case E_VNx32QImode:
3518     case E_VNx16HImode:
3519     case E_VNx8SImode:
3520     case E_VNx4DImode:
3521     case E_VNx16BFmode:
3522     case E_VNx16HFmode:
3523     case E_VNx8SFmode:
3524     case E_VNx4DFmode:
3525     /* x3 SVE vectors.  */
3526     case E_VNx48QImode:
3527     case E_VNx24HImode:
3528     case E_VNx12SImode:
3529     case E_VNx6DImode:
3530     case E_VNx24BFmode:
3531     case E_VNx24HFmode:
3532     case E_VNx12SFmode:
3533     case E_VNx6DFmode:
3534     /* x4 SVE vectors.  */
3535     case E_VNx64QImode:
3536     case E_VNx32HImode:
3537     case E_VNx16SImode:
3538     case E_VNx8DImode:
3539     case E_VNx32BFmode:
3540     case E_VNx32HFmode:
3541     case E_VNx16SFmode:
3542     case E_VNx8DFmode:
3543       return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3544 
3545     case E_OImode:
3546     case E_CImode:
3547     case E_XImode:
3548       return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3549 
3550     /* Structures of 64-bit Advanced SIMD vectors.  */
3551     case E_V2x8QImode:
3552     case E_V2x4HImode:
3553     case E_V2x2SImode:
3554     case E_V2x1DImode:
3555     case E_V2x4BFmode:
3556     case E_V2x4HFmode:
3557     case E_V2x2SFmode:
3558     case E_V2x1DFmode:
3559     case E_V3x8QImode:
3560     case E_V3x4HImode:
3561     case E_V3x2SImode:
3562     case E_V3x1DImode:
3563     case E_V3x4BFmode:
3564     case E_V3x4HFmode:
3565     case E_V3x2SFmode:
3566     case E_V3x1DFmode:
3567     case E_V4x8QImode:
3568     case E_V4x4HImode:
3569     case E_V4x2SImode:
3570     case E_V4x1DImode:
3571     case E_V4x4BFmode:
3572     case E_V4x4HFmode:
3573     case E_V4x2SFmode:
3574     case E_V4x1DFmode:
3575       return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3576 
3577     /* Structures of 128-bit Advanced SIMD vectors.  */
3578     case E_V2x16QImode:
3579     case E_V2x8HImode:
3580     case E_V2x4SImode:
3581     case E_V2x2DImode:
3582     case E_V2x8BFmode:
3583     case E_V2x8HFmode:
3584     case E_V2x4SFmode:
3585     case E_V2x2DFmode:
3586     case E_V3x16QImode:
3587     case E_V3x8HImode:
3588     case E_V3x4SImode:
3589     case E_V3x2DImode:
3590     case E_V3x8BFmode:
3591     case E_V3x8HFmode:
3592     case E_V3x4SFmode:
3593     case E_V3x2DFmode:
3594     case E_V4x16QImode:
3595     case E_V4x8HImode:
3596     case E_V4x4SImode:
3597     case E_V4x2DImode:
3598     case E_V4x8BFmode:
3599     case E_V4x8HFmode:
3600     case E_V4x4SFmode:
3601     case E_V4x2DFmode:
3602       return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
3603 
3604     /* 64-bit Advanced SIMD vectors.  */
3605     case E_V8QImode:
3606     case E_V4HImode:
3607     case E_V2SImode:
3608     /* ...E_V1DImode doesn't exist.  */
3609     case E_V4HFmode:
3610     case E_V4BFmode:
3611     case E_V2SFmode:
3612     case E_V1DFmode:
3613     /* 128-bit Advanced SIMD vectors.  */
3614     case E_V16QImode:
3615     case E_V8HImode:
3616     case E_V4SImode:
3617     case E_V2DImode:
3618     case E_V8HFmode:
3619     case E_V8BFmode:
3620     case E_V4SFmode:
3621     case E_V2DFmode:
3622       return TARGET_SIMD ? VEC_ADVSIMD : 0;
3623 
3624     default:
3625       return 0;
3626     }
3627 }
3628 
3629 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
3630 bool
3631 aarch64_advsimd_struct_mode_p (machine_mode mode)
3632 {
3633   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3634   return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3635 }
3636 
3637 /* Return true if MODE is an Advanced SIMD D-register structure mode.  */
3638 static bool
3639 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3640 {
3641   return (aarch64_classify_vector_mode (mode)
3642 	  == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3643 }
3644 
3645 /* Return true if MODE is an Advanced SIMD Q-register structure mode.  */
3646 static bool
3647 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3648 {
3649   return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3650 }
3651 
3652 /* Return true if MODE is any of the data vector modes, including
3653    structure modes.  */
3654 static bool
3655 aarch64_vector_data_mode_p (machine_mode mode)
3656 {
3657   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3658 }
3659 
3660 /* Return true if MODE is any form of SVE mode, including predicates,
3661    vectors and structures.  */
3662 bool
3663 aarch64_sve_mode_p (machine_mode mode)
3664 {
3665   return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3666 }
3667 
3668 /* Return true if MODE is an SVE data vector mode; either a single vector
3669    or a structure of vectors.  */
3670 static bool
3671 aarch64_sve_data_mode_p (machine_mode mode)
3672 {
3673   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3674 }
3675 
3676 /* Return the number of defined bytes in one constituent vector of
3677    SVE mode MODE, which has vector flags VEC_FLAGS.  */
3678 static poly_int64
3679 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3680 {
3681   if (vec_flags & VEC_PARTIAL)
3682     /* A single partial vector.  */
3683     return GET_MODE_SIZE (mode);
3684 
3685   if (vec_flags & VEC_SVE_DATA)
3686     /* A single vector or a tuple.  */
3687     return BYTES_PER_SVE_VECTOR;
3688 
3689   /* A single predicate.  */
3690   gcc_assert (vec_flags & VEC_SVE_PRED);
3691   return BYTES_PER_SVE_PRED;
3692 }
3693 
3694 /* If MODE holds an array of vectors, return the number of vectors
3695    in the array, otherwise return 1.  */
3696 
3697 static unsigned int
3698 aarch64_ldn_stn_vectors (machine_mode mode)
3699 {
3700   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3701   if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3702     return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3703   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3704     return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3705   if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3706     return exact_div (GET_MODE_SIZE (mode),
3707 		      BYTES_PER_SVE_VECTOR).to_constant ();
3708   return 1;
3709 }
3710 
3711 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3712    corresponding vector structure mode.  */
3713 static opt_machine_mode
3714 aarch64_advsimd_vector_array_mode (machine_mode mode,
3715 				   unsigned HOST_WIDE_INT nelems)
3716 {
3717   unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3718   if (known_eq (GET_MODE_SIZE (mode), 8))
3719     flags |= VEC_PARTIAL;
3720 
3721   machine_mode struct_mode;
3722   FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3723     if (aarch64_classify_vector_mode (struct_mode) == flags
3724 	&& GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3725 	&& known_eq (GET_MODE_NUNITS (struct_mode),
3726 	     GET_MODE_NUNITS (mode) * nelems))
3727       return struct_mode;
3728   return opt_machine_mode ();
3729 }
3730 
3731 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE.  */
3732 
3733 opt_machine_mode
3734 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3735 {
3736   enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3737 			    ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3738   machine_mode mode;
3739   FOR_EACH_MODE_IN_CLASS (mode, mclass)
3740     if (inner_mode == GET_MODE_INNER (mode)
3741 	&& known_eq (nunits, GET_MODE_NUNITS (mode))
3742 	&& aarch64_sve_data_mode_p (mode))
3743       return mode;
3744   return opt_machine_mode ();
3745 }
3746 
3747 /* Implement target hook TARGET_ARRAY_MODE.  */
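/* For example, aarch64_array_mode (VNx4SImode, 3) should yield the SVE
   3-tuple mode VNx12SImode, and aarch64_array_mode (V16QImode, 2) the
   Advanced SIMD structure mode V2x16QImode; other combinations return an
   empty opt_machine_mode and fall back to the generic handling.  */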
3748 static opt_machine_mode
3749 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3750 {
3751   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3752       && IN_RANGE (nelems, 2, 4))
3753     return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3754 				  GET_MODE_NUNITS (mode) * nelems);
3755   if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3756       && IN_RANGE (nelems, 2, 4))
3757     return aarch64_advsimd_vector_array_mode (mode, nelems);
3758 
3759   return opt_machine_mode ();
3760 }
3761 
3762 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
3763 static bool
3764 aarch64_array_mode_supported_p (machine_mode mode,
3765 				unsigned HOST_WIDE_INT nelems)
3766 {
3767   if (TARGET_SIMD
3768       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3769 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
3770       && (nelems >= 2 && nelems <= 4))
3771     return true;
3772 
3773   return false;
3774 }
3775 
3776 /* MODE is some form of SVE vector mode.  For data modes, return the number
3777    of vector register bits that each element of MODE occupies, such as 64
3778    for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3779    in a 64-bit container).  For predicate modes, return the number of
3780    data bits controlled by each significant predicate bit.  */
3781 
3782 static unsigned int
3783 aarch64_sve_container_bits (machine_mode mode)
3784 {
3785   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3786   poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3787 			     ? BITS_PER_SVE_VECTOR
3788 			     : GET_MODE_BITSIZE (mode));
3789   return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3790 }
3791 
3792 /* Return the SVE predicate mode to use for elements that have
3793    ELEM_NBYTES bytes, if such a mode exists.  */
3794 
3795 opt_machine_mode
3796 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3797 {
3798   if (TARGET_SVE)
3799     {
3800       if (elem_nbytes == 1)
3801 	return VNx16BImode;
3802       if (elem_nbytes == 2)
3803 	return VNx8BImode;
3804       if (elem_nbytes == 4)
3805 	return VNx4BImode;
3806       if (elem_nbytes == 8)
3807 	return VNx2BImode;
3808     }
3809   return opt_machine_mode ();
3810 }
3811 
3812 /* Return the SVE predicate mode that should be used to control
3813    SVE mode MODE.  */
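
/* For example, the full vector mode VNx4SImode uses 32-bit containers and
   so is controlled by VNx4BImode, whereas the partial vector mode
   VNx2SImode stores each element in a 64-bit container and is therefore
   controlled by VNx2BImode.  */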
3814 
3815 machine_mode
3816 aarch64_sve_pred_mode (machine_mode mode)
3817 {
3818   unsigned int bits = aarch64_sve_container_bits (mode);
3819   return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3820 }
3821 
3822 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
3823 
3824 static opt_machine_mode
3825 aarch64_get_mask_mode (machine_mode mode)
3826 {
3827   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3828   if (vec_flags & VEC_SVE_DATA)
3829     return aarch64_sve_pred_mode (mode);
3830 
3831   return default_get_mask_mode (mode);
3832 }
3833 
3834 /* Return the integer element mode associated with SVE mode MODE.  */
3835 
3836 static scalar_int_mode
3837 aarch64_sve_element_int_mode (machine_mode mode)
3838 {
3839   poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3840 			     ? BITS_PER_SVE_VECTOR
3841 			     : GET_MODE_BITSIZE (mode));
3842   unsigned int elt_bits = vector_element_size (vector_bits,
3843 					       GET_MODE_NUNITS (mode));
3844   return int_mode_for_size (elt_bits, 0).require ();
3845 }
3846 
3847 /* Return an integer element mode that contains exactly
3848    aarch64_sve_container_bits (MODE) bits.  This is wider than
3849    aarch64_sve_element_int_mode if MODE is a partial vector,
3850    otherwise it's the same.  */
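
/* For example, for VNx2SImode the element int mode above is SImode while
   the container int mode here is DImode, since each 32-bit element
   occupies a 64-bit container.  */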
3851 
3852 static scalar_int_mode
3853 aarch64_sve_container_int_mode (machine_mode mode)
3854 {
3855   return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3856 }
3857 
3858 /* Return the integer vector mode associated with SVE mode MODE.
3859    Unlike related_int_vector_mode, this can handle the case in which
3860    MODE is a predicate (and thus has a different total size).  */
3861 
3862 machine_mode
3863 aarch64_sve_int_mode (machine_mode mode)
3864 {
3865   scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3866   return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3867 }
3868 
3869 /* Implement TARGET_VECTORIZE_RELATED_MODE.  */
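/* For example, for an SVE VECTOR_MODE of VNx4SImode, an ELEMENT_MODE of
   HImode and a NUNITS of zero, the code below would choose the full
   vector mode VNx8HImode.  For an Advanced SIMD VECTOR_MODE of V4HImode
   and an ELEMENT_MODE of SImode it would prefer the single 128-bit vector
   V4SImode over two 64-bit vectors.  */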
3870 
3871 static opt_machine_mode
3872 aarch64_vectorize_related_mode (machine_mode vector_mode,
3873 				scalar_mode element_mode,
3874 				poly_uint64 nunits)
3875 {
3876   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3877 
3878   /* If we're operating on SVE vectors, try to return an SVE mode.  */
3879   poly_uint64 sve_nunits;
3880   if ((vec_flags & VEC_SVE_DATA)
3881       && multiple_p (BYTES_PER_SVE_VECTOR,
3882 		     GET_MODE_SIZE (element_mode), &sve_nunits))
3883     {
3884       machine_mode sve_mode;
3885       if (maybe_ne (nunits, 0U))
3886 	{
3887 	  /* Try to find a full or partial SVE mode with exactly
3888 	     NUNITS units.  */
3889 	  if (multiple_p (sve_nunits, nunits)
3890 	      && aarch64_sve_data_mode (element_mode,
3891 					nunits).exists (&sve_mode))
3892 	    return sve_mode;
3893 	}
3894       else
3895 	{
3896 	  /* Take the preferred number of units from the number of bytes
3897 	     that fit in VECTOR_MODE.  We always start by "autodetecting"
3898 	     a full vector mode with preferred_simd_mode, so vectors
3899 	     chosen here will also be full vector modes.  Then
3900 	     autovectorize_vector_modes tries smaller starting modes
3901 	     and thus smaller preferred numbers of units.  */
3902 	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3903 	  if (aarch64_sve_data_mode (element_mode,
3904 				     sve_nunits).exists (&sve_mode))
3905 	    return sve_mode;
3906 	}
3907     }
3908 
3909   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
3910   if ((vec_flags & VEC_ADVSIMD)
3911       && known_eq (nunits, 0U)
3912       && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3913       && maybe_ge (GET_MODE_BITSIZE (element_mode)
3914 		   * GET_MODE_NUNITS (vector_mode), 128U))
3915     {
3916       machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3917       if (VECTOR_MODE_P (res))
3918 	return res;
3919     }
3920 
3921   return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3922 }
3923 
3924 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
3925    prefer to use the first arithmetic operand as the else value if
3926    the else value doesn't matter, since that exactly matches the SVE
3927    destructive merging form.  For ternary operations we could either
3928    pick the first operand and use FMAD-like instructions or the last
3929    operand and use FMLA-like instructions; the latter seems more
3930    natural.  */
3931 
3932 static tree
3933 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3934 {
3935   return nops == 3 ? ops[2] : ops[0];
3936 }
3937 
3938 /* Implement TARGET_HARD_REGNO_NREGS.  */
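/* For example, TImode in general registers needs CEIL (16, UNITS_PER_WORD)
   == 2 registers, the SVE 2-tuple mode VNx32QImode needs 2 FP registers,
   and the Advanced SIMD structure mode V2x8QImode needs 2 D registers
   (16 bytes / 8).  */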
3939 
3940 static unsigned int
3941 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3942 {
3943   /* ??? Logically we should only need to provide a value when
3944      HARD_REGNO_MODE_OK says that the combination is valid,
3945      but at the moment we need to handle all modes.  Just ignore
3946      any runtime parts for registers that can't store them.  */
3947   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3948   switch (aarch64_regno_regclass (regno))
3949     {
3950     case FP_REGS:
3951     case FP_LO_REGS:
3952     case FP_LO8_REGS:
3953       {
3954 	unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3955 	if (vec_flags & VEC_SVE_DATA)
3956 	  return exact_div (GET_MODE_SIZE (mode),
3957 			    aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3958 	if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3959 	  return GET_MODE_SIZE (mode).to_constant () / 8;
3960 	return CEIL (lowest_size, UNITS_PER_VREG);
3961       }
3962     case PR_REGS:
3963     case PR_LO_REGS:
3964     case PR_HI_REGS:
3965     case FFR_REGS:
3966     case PR_AND_FFR_REGS:
3967       return 1;
3968     default:
3969       return CEIL (lowest_size, UNITS_PER_WORD);
3970     }
3971   gcc_unreachable ();
3972 }
3973 
3974 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
3975 
3976 static bool
3977 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3978 {
3979   if (mode == V8DImode)
3980     return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3981            && multiple_p (regno - R0_REGNUM, 2);
3982 
3983   if (GET_MODE_CLASS (mode) == MODE_CC)
3984     return regno == CC_REGNUM;
3985 
3986   if (regno == VG_REGNUM)
3987     /* This must have the same size as _Unwind_Word.  */
3988     return mode == DImode;
3989 
3990   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3991   if (vec_flags & VEC_SVE_PRED)
3992     return pr_or_ffr_regnum_p (regno);
3993 
3994   if (pr_or_ffr_regnum_p (regno))
3995     return false;
3996 
3997   if (regno == SP_REGNUM)
3998     /* The purpose of comparing with ptr_mode is to support the
3999        global register variable associated with the stack pointer
4000        register via the syntax of asm ("wsp") in ILP32.  */
4001     return mode == Pmode || mode == ptr_mode;
4002 
4003   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
4004     return mode == Pmode;
4005 
4006   if (GP_REGNUM_P (regno))
4007     {
4008       if (vec_flags & VEC_ANY_SVE)
4009 	return false;
4010       if (known_le (GET_MODE_SIZE (mode), 8))
4011 	return true;
4012       if (known_le (GET_MODE_SIZE (mode), 16))
4013 	return (regno & 1) == 0;
4014     }
4015   else if (FP_REGNUM_P (regno))
4016     {
4017       if (vec_flags & VEC_STRUCT)
4018 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
4019       else
4020 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
4021     }
4022 
4023   return false;
4024 }
4025 
4026 /* Return true if a function with type FNTYPE returns its value in
4027    SVE vector or predicate registers.  */
4028 
4029 static bool
4030 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
4031 {
4032   tree return_type = TREE_TYPE (fntype);
4033 
4034   pure_scalable_type_info pst_info;
4035   switch (pst_info.analyze (return_type))
4036     {
4037     case pure_scalable_type_info::IS_PST:
4038       return (pst_info.num_zr () <= NUM_FP_ARG_REGS
4039 	      && pst_info.num_pr () <= NUM_PR_ARG_REGS);
4040 
4041     case pure_scalable_type_info::DOESNT_MATTER:
4042       gcc_assert (aarch64_return_in_memory_1 (return_type));
4043       return false;
4044 
4045     case pure_scalable_type_info::NO_ABI_IDENTITY:
4046     case pure_scalable_type_info::ISNT_PST:
4047       return false;
4048     }
4049   gcc_unreachable ();
4050 }
4051 
4052 /* Return true if a function with type FNTYPE takes arguments in
4053    SVE vector or predicate registers.  */
4054 
4055 static bool
4056 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
4057 {
4058   CUMULATIVE_ARGS args_so_far_v;
4059   aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
4060 				NULL_TREE, 0, true);
4061   cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4062 
4063   for (tree chain = TYPE_ARG_TYPES (fntype);
4064        chain && chain != void_list_node;
4065        chain = TREE_CHAIN (chain))
4066     {
4067       tree arg_type = TREE_VALUE (chain);
4068       if (arg_type == error_mark_node)
4069 	return false;
4070 
4071       function_arg_info arg (arg_type, /*named=*/true);
4072       apply_pass_by_reference_rules (&args_so_far_v, arg);
4073       pure_scalable_type_info pst_info;
4074       if (pst_info.analyze_registers (arg.type))
4075 	{
4076 	  unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4077 	  unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4078 	  gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4079 	  return true;
4080 	}
4081 
4082       targetm.calls.function_arg_advance (args_so_far, arg);
4083     }
4084   return false;
4085 }
4086 
4087 /* Implement TARGET_FNTYPE_ABI.  */
4088 
4089 static const predefined_function_abi &
4090 aarch64_fntype_abi (const_tree fntype)
4091 {
4092   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4093     return aarch64_simd_abi ();
4094 
4095   if (aarch64_returns_value_in_sve_regs_p (fntype)
4096       || aarch64_takes_arguments_in_sve_regs_p (fntype))
4097     return aarch64_sve_abi ();
4098 
4099   return default_function_abi;
4100 }
4101 
4102 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P.  */
4103 
4104 static bool
4105 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4106 {
4107   return (aarch64_sve::builtin_type_p (type1)
4108 	  == aarch64_sve::builtin_type_p (type2));
4109 }
4110 
4111 /* Return true if we should emit CFI for register REGNO.  */
4112 
4113 static bool
4114 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4115 {
4116   return (GP_REGNUM_P (regno)
4117 	  || !default_function_abi.clobbers_full_reg_p (regno));
4118 }
4119 
4120 /* Return the mode we should use to save and restore register REGNO.  */
4121 
4122 static machine_mode
4123 aarch64_reg_save_mode (unsigned int regno)
4124 {
4125   if (GP_REGNUM_P (regno))
4126     return DImode;
4127 
4128   if (FP_REGNUM_P (regno))
4129     switch (crtl->abi->id ())
4130       {
4131       case ARM_PCS_AAPCS64:
4132 	/* Only the low 64 bits are saved by the base PCS.  */
4133 	return DFmode;
4134 
4135       case ARM_PCS_SIMD:
4136 	/* The vector PCS saves the low 128 bits (which is the full
4137 	   register on non-SVE targets).  */
4138 	return V16QImode;
4139 
4140       case ARM_PCS_SVE:
4141 	/* Use vectors of DImode for registers that need frame
4142 	   information, so that the first 64 bits of the save slot
4143 	   are always the equivalent of what storing D<n> would give.  */
4144 	if (aarch64_emit_cfi_for_reg_p (regno))
4145 	  return VNx2DImode;
4146 
4147 	/* Use vectors of bytes otherwise, so that the layout is
4148 	   endian-agnostic, and so that we can use LDR and STR for
4149 	   big-endian targets.  */
4150 	return VNx16QImode;
4151 
4152       case ARM_PCS_TLSDESC:
4153       case ARM_PCS_UNKNOWN:
4154 	break;
4155       }
4156 
4157   if (PR_REGNUM_P (regno))
4158     /* Save the full predicate register.  */
4159     return VNx16BImode;
4160 
4161   gcc_unreachable ();
4162 }
4163 
4164 /* Implement TARGET_INSN_CALLEE_ABI.  */
4165 
4166 const predefined_function_abi &
4167 aarch64_insn_callee_abi (const rtx_insn *insn)
4168 {
4169   rtx pat = PATTERN (insn);
4170   gcc_assert (GET_CODE (pat) == PARALLEL);
4171   rtx unspec = XVECEXP (pat, 0, 1);
4172   gcc_assert (GET_CODE (unspec) == UNSPEC
4173 	      && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4174   return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4175 }
4176 
4177 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
4178    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
4179    clobbers the top 64 bits when restoring the bottom 64 bits.  */
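
/* For example, under the base PCS a V16QImode value held in an FP register
   is partially call-clobbered (16 bytes per register exceeds the 8
   callee-saved bytes), whereas a DFmode value is not; under ARM_PCS_SIMD
   the threshold is 16 bytes, so V16QImode would be preserved there.  */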
4180 
4181 static bool
4182 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4183 					unsigned int regno,
4184 					machine_mode mode)
4185 {
4186   if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4187     {
4188       poly_int64 per_register_size = GET_MODE_SIZE (mode);
4189       unsigned int nregs = hard_regno_nregs (regno, mode);
4190       if (nregs > 1)
4191 	per_register_size = exact_div (per_register_size, nregs);
4192       if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4193 	return maybe_gt (per_register_size, 16);
4194       return maybe_gt (per_register_size, 8);
4195     }
4196   return false;
4197 }
4198 
4199 /* Implement REGMODE_NATURAL_SIZE.  */
4200 poly_uint64
4201 aarch64_regmode_natural_size (machine_mode mode)
4202 {
4203   /* The natural size for SVE data modes is one SVE data vector,
4204      and similarly for predicates.  We can't independently modify
4205      anything smaller than that.  */
4206   /* ??? For now, only do this for variable-width SVE registers.
4207      Doing it for constant-sized registers breaks lower-subreg.cc.  */
4208   /* ??? And once that's fixed, we should probably have similar
4209      code for Advanced SIMD.  */
4210   if (!aarch64_sve_vg.is_constant ())
4211     {
4212       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4213       if (vec_flags & VEC_SVE_PRED)
4214 	return BYTES_PER_SVE_PRED;
4215       if (vec_flags & VEC_SVE_DATA)
4216 	return BYTES_PER_SVE_VECTOR;
4217     }
4218   return UNITS_PER_WORD;
4219 }
4220 
4221 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
4222 machine_mode
4223 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4224 				     machine_mode mode)
4225 {
4226   /* The predicate mode determines which bits are significant and
4227      which are "don't care".  Decreasing the number of lanes would
4228      lose data while increasing the number of lanes would make bits
4229      unnecessarily significant.  */
4230   if (PR_REGNUM_P (regno))
4231     return mode;
4232   if (known_ge (GET_MODE_SIZE (mode), 4))
4233     return mode;
4234   else
4235     return SImode;
4236 }
4237 
4238 /* Return true if I's bits are consecutive ones from the MSB.  */
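/* For example, 0xffffffffffff0000 passes this test because its negation,
   0x10000, is a power of two, while 0 and 0x00ff0000 do not.  */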
4239 bool
4240 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4241 {
4242   return exact_log2 (-i) != HOST_WIDE_INT_M1;
4243 }
4244 
4245 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
4246    that strcpy from constants will be faster.  */
4247 
4248 static HOST_WIDE_INT
4249 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4250 {
4251   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4252     return MAX (align, BITS_PER_WORD);
4253   return align;
4254 }
4255 
4256 /* Return true if calls to DECL should be treated as
4257    long-calls (ie called via a register).  */
4258 static bool
4259 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4260 {
4261   return false;
4262 }
4263 
4264 /* Return true if calls to symbol-ref SYM should be treated as
4265    long-calls (ie called via a register).  */
4266 bool
4267 aarch64_is_long_call_p (rtx sym)
4268 {
4269   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4270 }
4271 
4272 /* Return true if calls to symbol-ref SYM should not go through
4273    plt stubs.  */
4274 
4275 bool
4276 aarch64_is_noplt_call_p (rtx sym)
4277 {
4278   const_tree decl = SYMBOL_REF_DECL (sym);
4279 
4280   if (flag_pic
4281       && decl
4282       && (!flag_plt
4283 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4284       && !targetm.binds_local_p (decl))
4285     return true;
4286 
4287   return false;
4288 }
4289 
4290 /* Emit an insn that's a simple single-set.  Both the operands must be
4291    known to be valid.  */
4292 inline static rtx_insn *
4293 emit_set_insn (rtx x, rtx y)
4294 {
4295   return emit_insn (gen_rtx_SET (x, y));
4296 }
4297 
4298 /* X and Y are two things to compare using CODE.  Emit the compare insn and
4299    return the rtx for register 0 in the proper mode.  */
4300 rtx
4301 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4302 {
4303   machine_mode cmp_mode = GET_MODE (x);
4304   machine_mode cc_mode;
4305   rtx cc_reg;
4306 
4307   if (cmp_mode == TImode)
4308     {
4309       gcc_assert (code == NE);
4310 
4311       cc_mode = CCmode;
4312       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4313 
4314       rtx x_lo = operand_subword (x, 0, 0, TImode);
4315       rtx y_lo = operand_subword (y, 0, 0, TImode);
4316       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4317 
4318       rtx x_hi = operand_subword (x, 1, 0, TImode);
4319       rtx y_hi = operand_subword (y, 1, 0, TImode);
4320       emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4321 			       gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4322 			       GEN_INT (AARCH64_EQ)));
4323     }
4324   else
4325     {
4326       cc_mode = SELECT_CC_MODE (code, x, y);
4327       cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4328       emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4329     }
4330   return cc_reg;
4331 }
4332 
4333 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
4334 
4335 static rtx
4336 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4337                                   machine_mode y_mode)
4338 {
4339   if (y_mode == E_QImode || y_mode == E_HImode)
4340     {
4341       if (CONST_INT_P (y))
4342 	{
4343 	  y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4344 	  y_mode = SImode;
4345 	}
4346       else
4347 	{
4348 	  rtx t, cc_reg;
4349 	  machine_mode cc_mode;
4350 
4351 	  t = gen_rtx_ZERO_EXTEND (SImode, y);
4352 	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4353 	  cc_mode = CC_SWPmode;
4354 	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4355 	  emit_set_insn (cc_reg, t);
4356 	  return cc_reg;
4357 	}
4358     }
4359 
4360   if (!aarch64_plus_operand (y, y_mode))
4361     y = force_reg (y_mode, y);
4362 
4363   return aarch64_gen_compare_reg (code, x, y);
4364 }
4365 
4366 /* Consider the operation:
4367 
4368      OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4369 
4370    where:
4371 
4372    - CODE is [SU]MAX or [SU]MIN
4373    - OPERANDS[2] and OPERANDS[3] are constant integers
4374    - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4375    - all operands have mode MODE
4376 
4377    Decide whether it is possible to implement the operation using:
4378 
4379      SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4380      or
4381      ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4382 
4383    followed by:
4384 
4385      <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4386 
4387    where <insn> is one of CSEL, CSINV or CSINC.  Return true if so.
4388    If GENERATE_P is true, also update OPERANDS as follows:
4389 
4390      OPERANDS[4] = -OPERANDS[3]
4391      OPERANDS[5] = the rtl condition representing <cond>
4392      OPERANDS[6] = <tmp>
4393      OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC.  */
4394 bool
4395 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4396 {
4397   signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4398   rtx dst = operands[0];
4399   rtx maxmin_op = operands[2];
4400   rtx add_op = operands[3];
4401   machine_mode mode = GET_MODE (dst);
4402 
4403   /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4404 		    == (x >= y ? x : y) - z
4405 		    == (x > y ? x : y) - z
4406 		    == (x > y - 1 ? x : y) - z
4407 
4408      min (x, y) - z == (x <= y - 1 ? x : y) - z
4409 		    == (x <= y ? x : y) - z
4410 		    == (x < y ? x : y) - z
4411 		    == (x < y + 1 ? x : y) - z
4412 
4413      Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4414      which x is compared with z.  Set DIFF to y - z.  Thus the supported
4415      combinations are as follows, with DIFF being the value after the ":":
4416 
4417      max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1   [z == y + 1]
4418 		    == x >= y ? x - y : 0              [z == y]
4419 		    == x > y ? x - y : 0               [z == y]
4420 		    == x > y - 1 ? x - (y - 1) : 1     [z == y - 1]
4421 
4422      min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1    [z == y - 1]
4423 		    == x <= y ? x - y : 0              [z == y]
4424 		    == x < y ? x - y : 0               [z == y]
4425 		    == x < y + 1 ? x - (y + 1) : -1    [z == y + 1].  */
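  /* Worked example: for SMAX (x, 3) - 4 we have y == 3 and z == 4 == y + 1,
     so DIFF == -1 and the comparison form is GE.  The expansion would be

	subs	tmp, x, #4
	csinv	dst, tmp, xzr, ge

     which yields x - 4 when x >= 4 and -1 otherwise, matching the table
     above.  */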
4426   auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4427   auto add_val = rtx_mode_t (add_op, mode);
4428   auto sub_val = wi::neg (add_val);
4429   auto diff = wi::sub (maxmin_val, sub_val);
4430   if (!(diff == 0
4431 	|| (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4432 	|| (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4433     return false;
4434 
4435   if (!generate_p)
4436     return true;
4437 
4438   rtx_code cmp;
4439   switch (code)
4440     {
4441     case SMAX:
4442       cmp = diff == 1 ? GT : GE;
4443       break;
4444     case UMAX:
4445       cmp = diff == 1 ? GTU : GEU;
4446       break;
4447     case SMIN:
4448       cmp = diff == -1 ? LT : LE;
4449       break;
4450     case UMIN:
4451       cmp = diff == -1 ? LTU : LEU;
4452       break;
4453     default:
4454       gcc_unreachable ();
4455     }
4456   rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4457 
4458   operands[4] = immed_wide_int_const (sub_val, mode);
4459   operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4460   if (can_create_pseudo_p ())
4461     operands[6] = gen_reg_rtx (mode);
4462   else
4463     operands[6] = dst;
4464   operands[7] = immed_wide_int_const (diff, mode);
4465 
4466   return true;
4467 }
4468 
4469 
4470 /* Build the SYMBOL_REF for __tls_get_addr.  */
4471 
4472 static GTY(()) rtx tls_get_addr_libfunc;
4473 
4474 rtx
4475 aarch64_tls_get_addr (void)
4476 {
4477   if (!tls_get_addr_libfunc)
4478     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4479   return tls_get_addr_libfunc;
4480 }
4481 
4482 /* Return the TLS model to use for ADDR.  */
4483 
4484 static enum tls_model
4485 tls_symbolic_operand_type (rtx addr)
4486 {
4487   enum tls_model tls_kind = TLS_MODEL_NONE;
4488   poly_int64 offset;
4489   addr = strip_offset_and_salt (addr, &offset);
4490   if (SYMBOL_REF_P (addr))
4491     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4492 
4493   return tls_kind;
4494 }
4495 
4496 /* We'll allow lo_sums in addresses among our legitimate addresses,
4497    so that combine can take care of combining addresses where
4498    necessary, but for generation purposes we'll generate the
4499    address as:
4500    RTL                               Absolute
4501    tmp = hi (symbol_ref);            adrp  x1, foo
4502    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
4503                                      nop
4504 
4505    PIC                               TLS
4506    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
4507    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
4508                                      bl   __tls_get_addr
4509                                      nop
4510 
4511    Load TLS symbol, depending on TLS mechanism and TLS access model.
4512 
4513    Global Dynamic - Traditional TLS:
4514    adrp tmp, :tlsgd:imm
4515    add  dest, tmp, #:tlsgd_lo12:imm
4516    bl   __tls_get_addr
4517 
4518    Global Dynamic - TLS Descriptors:
4519    adrp dest, :tlsdesc:imm
4520    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
4521    add  dest, dest, #:tlsdesc_lo12:imm
4522    blr  tmp
4523    mrs  tp, tpidr_el0
4524    add  dest, dest, tp
4525 
4526    Initial Exec:
4527    mrs  tp, tpidr_el0
4528    adrp tmp, :gottprel:imm
4529    ldr  dest, [tmp, #:gottprel_lo12:imm]
4530    add  dest, dest, tp
4531 
4532    Local Exec:
4533    mrs  tp, tpidr_el0
4534    add  t0, tp, #:tprel_hi12:imm, lsl #12
4535    add  t0, t0, #:tprel_lo12_nc:imm
4536 */
4537 
4538 static void
4539 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4540 				   enum aarch64_symbol_type type)
4541 {
4542   switch (type)
4543     {
4544     case SYMBOL_SMALL_ABSOLUTE:
4545       {
4546 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
4547 	rtx tmp_reg = dest;
4548 	machine_mode mode = GET_MODE (dest);
4549 
4550 	gcc_assert (mode == Pmode || mode == ptr_mode);
4551 
4552 	if (can_create_pseudo_p ())
4553 	  tmp_reg = gen_reg_rtx (mode);
4554 
4555 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4556 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
4557 	return;
4558       }
4559 
4560     case SYMBOL_TINY_ABSOLUTE:
4561       emit_insn (gen_rtx_SET (dest, imm));
4562       return;
4563 
4564     case SYMBOL_SMALL_GOT_28K:
4565       {
4566 	machine_mode mode = GET_MODE (dest);
4567 	rtx gp_rtx = pic_offset_table_rtx;
4568 	rtx insn;
4569 	rtx mem;
4570 
4571 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4572 	   here before RTL expansion.  Tree IVOPTS will generate RTL patterns
4573 	   to decide rtx costs, in which case pic_offset_table_rtx is not
4574 	   initialized.  In that case there is no need to generate the first
4575 	   adrp instruction, as the final cost for a global variable access
4576 	   is one instruction.  */
4577 	if (gp_rtx != NULL)
4578 	  {
4579 	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
4580 	       we use the page base as the GOT base, the first page may be
4581 	       wasted; in the worst case only 28K of space is left for the GOT).
4582 
4583 	       The generated instruction sequence for accessing a global
4584 	       variable is:
4585 
4586 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4587 
4588 	       Only one instruction is needed.  But we must initialize
4589 	       pic_offset_table_rtx properly.  We generate an initialization
4590 	       insn for every global access, and allow CSE to remove the
4591 
4592 	       redundant ones.  The final instruction sequence will look like
4593 	       the following when accessing multiple global variables:
4594 
4595 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4596 
4597 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4598 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4599 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4600 		 ...  */
4601 
4602 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4603 	    crtl->uses_pic_offset_table = 1;
4604 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4605 
4606 	    if (mode != GET_MODE (gp_rtx))
4607 	      gp_rtx = gen_lowpart (mode, gp_rtx);
4608 
4609 	  }
4610 
4611 	if (mode == ptr_mode)
4612 	  {
4613 	    if (mode == DImode)
4614 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4615 	    else
4616 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4617 
4618 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
4619 	  }
4620 	else
4621 	  {
4622 	    gcc_assert (mode == Pmode);
4623 
4624 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4625 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4626 	  }
4627 
4628 	/* The operand is expected to be MEM.  Whenever the related insn
4629 	   pattern changed, above code which calculate mem should be
4630 	   updated.  */
4631 	gcc_assert (MEM_P (mem));
4632 	MEM_READONLY_P (mem) = 1;
4633 	MEM_NOTRAP_P (mem) = 1;
4634 	emit_insn (insn);
4635 	return;
4636       }
4637 
4638     case SYMBOL_SMALL_GOT_4G:
4639       emit_insn (gen_rtx_SET (dest, imm));
4640       return;
4641 
4642     case SYMBOL_SMALL_TLSGD:
4643       {
4644 	rtx_insn *insns;
4645 	/* The return type of __tls_get_addr is the C pointer type
4646 	   so use ptr_mode.  */
4647 	rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4648 	rtx tmp_reg = dest;
4649 
4650 	if (GET_MODE (dest) != ptr_mode)
4651 	  tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4652 
4653 	start_sequence ();
4654 	if (ptr_mode == SImode)
4655 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4656 	else
4657 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4658 	insns = get_insns ();
4659 	end_sequence ();
4660 
4661 	RTL_CONST_CALL_P (insns) = 1;
4662 	emit_libcall_block (insns, tmp_reg, result, imm);
4663 	/* Convert back to the mode of the dest adding a zero_extend
4664 	   from SImode (ptr_mode) to DImode (Pmode). */
4665 	if (dest != tmp_reg)
4666 	  convert_move (dest, tmp_reg, true);
4667 	return;
4668       }
4669 
4670     case SYMBOL_SMALL_TLSDESC:
4671       {
4672 	machine_mode mode = GET_MODE (dest);
4673 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4674 	rtx tp;
4675 
4676 	gcc_assert (mode == Pmode || mode == ptr_mode);
4677 
4678 	/* In ILP32, the got entry is always of SImode size.  Unlike
4679 	   small GOT, the dest is fixed at reg 0.  */
4680 	if (TARGET_ILP32)
4681 	  emit_insn (gen_tlsdesc_small_si (imm));
4682 	else
4683 	  emit_insn (gen_tlsdesc_small_di (imm));
4684 	tp = aarch64_load_tp (NULL);
4685 
4686 	if (mode != Pmode)
4687 	  tp = gen_lowpart (mode, tp);
4688 
4689 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4690 	if (REG_P (dest))
4691 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4692 	return;
4693       }
4694 
4695     case SYMBOL_SMALL_TLSIE:
4696       {
4697 	/* In ILP32, the mode of dest can be either SImode or DImode,
4698 	   while the got entry is always of SImode size.  The mode of
4699 	   dest depends on how dest is used: if dest is assigned to a
4700 	   pointer (e.g. in the memory), it has SImode; it may have
4701 	   DImode if dest is dereferenced to access the memory.
4702 	   This is why we have to handle three different tlsie_small
4703 	   patterns here (two patterns for ILP32).  */
4704 	machine_mode mode = GET_MODE (dest);
4705 	rtx tmp_reg = gen_reg_rtx (mode);
4706 	rtx tp = aarch64_load_tp (NULL);
4707 
4708 	if (mode == ptr_mode)
4709 	  {
4710 	    if (mode == DImode)
4711 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4712 	    else
4713 	      {
4714 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4715 		tp = gen_lowpart (mode, tp);
4716 	      }
4717 	  }
4718 	else
4719 	  {
4720 	    gcc_assert (mode == Pmode);
4721 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4722 	  }
4723 
4724 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4725 	if (REG_P (dest))
4726 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4727 	return;
4728       }
4729 
4730     case SYMBOL_TLSLE12:
4731     case SYMBOL_TLSLE24:
4732     case SYMBOL_TLSLE32:
4733     case SYMBOL_TLSLE48:
4734       {
4735 	machine_mode mode = GET_MODE (dest);
4736 	rtx tp = aarch64_load_tp (NULL);
4737 
4738 	if (mode != Pmode)
4739 	  tp = gen_lowpart (mode, tp);
4740 
4741 	switch (type)
4742 	  {
4743 	  case SYMBOL_TLSLE12:
4744 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4745 			(dest, tp, imm));
4746 	    break;
4747 	  case SYMBOL_TLSLE24:
4748 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4749 			(dest, tp, imm));
4750 	    break;
4751 	  case SYMBOL_TLSLE32:
4752 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4753 			(dest, imm));
4754 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4755 			(dest, dest, tp));
4756 	    break;
4757 	  case SYMBOL_TLSLE48:
4758 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4759 			(dest, imm));
4760 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4761 			(dest, dest, tp));
4762 	    break;
4763 	  default:
4764 	    gcc_unreachable ();
4765 	  }
4766 
4767 	if (REG_P (dest))
4768 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4769 	return;
4770       }
4771 
4772     case SYMBOL_TINY_GOT:
4773       {
4774 	rtx insn;
4775 	machine_mode mode = GET_MODE (dest);
4776 
4777 	if (mode == ptr_mode)
4778 	  insn = gen_ldr_got_tiny (mode, dest, imm);
4779 	else
4780 	  {
4781 	    gcc_assert (mode == Pmode);
4782 	    insn = gen_ldr_got_tiny_sidi (dest, imm);
4783 	  }
4784 
4785 	emit_insn (insn);
4786 	return;
4787       }
4788 
4789     case SYMBOL_TINY_TLSIE:
4790       {
4791 	machine_mode mode = GET_MODE (dest);
4792 	rtx tp = aarch64_load_tp (NULL);
4793 
4794 	if (mode == ptr_mode)
4795 	  {
4796 	    if (mode == DImode)
4797 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4798 	    else
4799 	      {
4800 		tp = gen_lowpart (mode, tp);
4801 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4802 	      }
4803 	  }
4804 	else
4805 	  {
4806 	    gcc_assert (mode == Pmode);
4807 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4808 	  }
4809 
4810 	if (REG_P (dest))
4811 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4812 	return;
4813       }
4814 
4815     default:
4816       gcc_unreachable ();
4817     }
4818 }
4819 
4820 /* Emit a move from SRC to DEST.  Assume that the move expanders can
4821    handle all moves if !can_create_pseudo_p ().  The distinction is
4822    important because, unlike emit_move_insn, the move expanders know
4823    how to force Pmode objects into the constant pool even when the
4824    constant pool address is not itself legitimate.  */
4825 static rtx
4826 aarch64_emit_move (rtx dest, rtx src)
4827 {
4828   return (can_create_pseudo_p ()
4829 	  ? emit_move_insn (dest, src)
4830 	  : emit_move_insn_1 (dest, src));
4831 }
4832 
4833 /* Apply UNOPTAB to OP and store the result in DEST.  */
4834 
4835 static void
4836 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4837 {
4838   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4839   if (dest != tmp)
4840     emit_move_insn (dest, tmp);
4841 }
4842 
4843 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
4844 
4845 static void
4846 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4847 {
4848   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4849 			  OPTAB_DIRECT);
4850   if (dest != tmp)
4851     emit_move_insn (dest, tmp);
4852 }
4853 
4854 /* Split a 128-bit move operation into two 64-bit move operations,
4855    taking care to handle partial overlap of register to register
4856    copies.  Special cases are needed when moving between GP regs and
4857    FP regs.  SRC can be a register, constant or memory; DST a register
4858    or memory.  If either operand is memory it must not have any side
4859    effects.  */
4860 void
4861 aarch64_split_128bit_move (rtx dst, rtx src)
4862 {
4863   rtx dst_lo, dst_hi;
4864   rtx src_lo, src_hi;
4865 
4866   machine_mode mode = GET_MODE (dst);
4867 
4868   gcc_assert (mode == TImode || mode == TFmode);
4869   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4870   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4871 
4872   if (REG_P (dst) && REG_P (src))
4873     {
4874       int src_regno = REGNO (src);
4875       int dst_regno = REGNO (dst);
4876 
4877       /* Handle FP <-> GP regs.  */
4878       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4879 	{
4880 	  src_lo = gen_lowpart (word_mode, src);
4881 	  src_hi = gen_highpart (word_mode, src);
4882 
4883 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4884 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4885 	  return;
4886 	}
4887       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4888 	{
4889 	  dst_lo = gen_lowpart (word_mode, dst);
4890 	  dst_hi = gen_highpart (word_mode, dst);
4891 
4892 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4893 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4894 	  return;
4895 	}
4896     }
4897 
4898   dst_lo = gen_lowpart (word_mode, dst);
4899   dst_hi = gen_highpart (word_mode, dst);
4900   src_lo = gen_lowpart (word_mode, src);
4901   src_hi = gen_highpart_mode (word_mode, mode, src);
4902 
4903   /* At most one pairing may overlap.  */
4904   if (reg_overlap_mentioned_p (dst_lo, src_hi))
4905     {
4906       aarch64_emit_move (dst_hi, src_hi);
4907       aarch64_emit_move (dst_lo, src_lo);
4908     }
4909   else
4910     {
4911       aarch64_emit_move (dst_lo, src_lo);
4912       aarch64_emit_move (dst_hi, src_hi);
4913     }
4914 }
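
/* As an illustrative example of the overlap handling above (assuming the
   usual little-endian word ordering, where the low half of a TImode value
   lives in the lower-numbered register): copying x1:x2 into x2:x3 would
   clobber the source high half if the low halves were moved first, so the
   high move (x3 <- x2) is emitted before the low move (x2 <- x1).  In the
   non-overlapping direction the low-then-high order is used.  */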
4915 
4916 /* Return true if we should split a move from 128-bit value SRC
4917    to 128-bit register DEST.  */
4918 
4919 bool
4920 aarch64_split_128bit_move_p (rtx dst, rtx src)
4921 {
4922   if (FP_REGNUM_P (REGNO (dst)))
4923     return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4924   /* All moves to GPRs need to be split.  */
4925   return true;
4926 }
4927 
4928 /* Split a complex SIMD move.  */
4929 
4930 void
4931 aarch64_split_simd_move (rtx dst, rtx src)
4932 {
4933   machine_mode src_mode = GET_MODE (src);
4934   machine_mode dst_mode = GET_MODE (dst);
4935 
4936   gcc_assert (VECTOR_MODE_P (dst_mode));
4937 
4938   if (REG_P (dst) && REG_P (src))
4939     {
4940       gcc_assert (VECTOR_MODE_P (src_mode));
4941       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
4942     }
4943 }
4944 
4945 bool
4946 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4947 			      machine_mode ymode, rtx y)
4948 {
4949   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4950   gcc_assert (r != NULL);
4951   return rtx_equal_p (x, r);
4952 }
4953 
4954 /* Return TARGET if it is nonnull and a register of mode MODE.
4955    Otherwise, return a fresh register of mode MODE if we can,
4956    or TARGET reinterpreted as MODE if we can't.  */
4957 
4958 static rtx
4959 aarch64_target_reg (rtx target, machine_mode mode)
4960 {
4961   if (target && REG_P (target) && GET_MODE (target) == mode)
4962     return target;
4963   if (!can_create_pseudo_p ())
4964     {
4965       gcc_assert (target);
4966       return gen_lowpart (mode, target);
4967     }
4968   return gen_reg_rtx (mode);
4969 }
4970 
4971 /* Return a register that contains the constant in BUILDER, given that
4972    the constant is a legitimate move operand.  Use TARGET as the register
4973    if it is nonnull and convenient.  */
4974 
4975 static rtx
4976 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4977 {
4978   rtx src = builder.build ();
4979   target = aarch64_target_reg (target, GET_MODE (src));
4980   emit_insn (gen_rtx_SET (target, src));
4981   return target;
4982 }
4983 
4984 static rtx
4985 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4986 {
4987   if (can_create_pseudo_p ())
4988     return force_reg (mode, value);
4989   else
4990     {
4991       gcc_assert (x);
4992       aarch64_emit_move (x, value);
4993       return x;
4994     }
4995 }
4996 
4997 /* Return true if predicate value X is a constant in which every element
4998    is a CONST_INT.  When returning true, describe X in BUILDER as a VNx16BI
4999    value, i.e. as a predicate in which all bits are significant.  */
5000 
5001 static bool
5002 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
5003 {
5004   if (!CONST_VECTOR_P (x))
5005     return false;
5006 
5007   unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
5008 					     GET_MODE_NUNITS (GET_MODE (x)));
5009   unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
5010   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
5011   builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
5012 
5013   unsigned int nelts = const_vector_encoded_nelts (x);
5014   for (unsigned int i = 0; i < nelts; ++i)
5015     {
5016       rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
5017       if (!CONST_INT_P (elt))
5018 	return false;
5019 
5020       builder.quick_push (elt);
5021       for (unsigned int j = 1; j < factor; ++j)
5022 	builder.quick_push (const0_rtx);
5023     }
5024   builder.finalize ();
5025   return true;
5026 }
5027 
5028 /* BUILDER contains a predicate constant of mode VNx16BI.  Return the
5029    widest predicate element size it can have (that is, the largest size
5030    for which each element would still be 0 or 1).  */
5031 
5032 unsigned int
5033 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
5034 {
5035   /* Start with the most optimistic assumption: that we only need
5036      one bit per pattern.  This is what we will use if only the first
5037      bit in each pattern is ever set.  */
5038   unsigned int mask = GET_MODE_SIZE (DImode);
5039   mask |= builder.npatterns ();
5040 
5041   /* Look for set bits.  */
5042   unsigned int nelts = builder.encoded_nelts ();
5043   for (unsigned int i = 1; i < nelts; ++i)
5044     if (INTVAL (builder.elt (i)) != 0)
5045       {
5046 	if (i & 1)
5047 	  return 1;
5048 	mask |= i;
5049       }
5050   return mask & -mask;
5051 }
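
/* For example, the VNx16BI image of a PTRUE for .S elements is encoded
   with four patterns, {1, 0, 0, 0}: MASK becomes 8 | 4, no set bit is
   found at a higher index, and MASK & -MASK yields 4, i.e. an element
   size of 4 bytes.  A set bit at any odd index would force the result
   down to 1.  */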
5052 
5053 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
5054    return that predicate mode, otherwise return opt_machine_mode ().  */
5055 
5056 opt_machine_mode
5057 aarch64_ptrue_all_mode (rtx x)
5058 {
5059   gcc_assert (GET_MODE (x) == VNx16BImode);
5060   if (!CONST_VECTOR_P (x)
5061       || !CONST_VECTOR_DUPLICATE_P (x)
5062       || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5063       || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5064     return opt_machine_mode ();
5065 
5066   unsigned int nelts = const_vector_encoded_nelts (x);
5067   for (unsigned int i = 1; i < nelts; ++i)
5068     if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5069       return opt_machine_mode ();
5070 
5071   return aarch64_sve_pred_mode (nelts);
5072 }
5073 
5074 /* BUILDER is a predicate constant of mode VNx16BI.  Consider the value
5075    that the constant would have with predicate element size ELT_SIZE
5076    (ignoring the upper bits in each element) and return:
5077 
5078    * -1 if all bits are set
5079    * N if the predicate has N leading set bits followed by all clear bits
5080    * 0 if the predicate does not have any of these forms.  */
5081 
5082 int
5083 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5084 			      unsigned int elt_size)
5085 {
5086   /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5087      followed by set bits.  */
5088   if (builder.nelts_per_pattern () == 3)
5089     return 0;
5090 
5091   /* Skip over leading set bits.  */
5092   unsigned int nelts = builder.encoded_nelts ();
5093   unsigned int i = 0;
5094   for (; i < nelts; i += elt_size)
5095     if (INTVAL (builder.elt (i)) == 0)
5096       break;
5097   unsigned int vl = i / elt_size;
5098 
5099   /* Check for the all-true case.  */
5100   if (i == nelts)
5101     return -1;
5102 
5103   /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5104      repeating pattern of set bits followed by clear bits.  */
5105   if (builder.nelts_per_pattern () != 2)
5106     return 0;
5107 
5108   /* We have a "foreground" value and a duplicated "background" value.
5109      If the background might repeat and the last set bit belongs to it,
5110      we might have set bits followed by clear bits followed by set bits.  */
5111   if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5112     return 0;
5113 
5114   /* Make sure that the rest are all clear.  */
5115   for (; i < nelts; i += elt_size)
5116     if (INTVAL (builder.elt (i)) != 0)
5117       return 0;
5118 
5119   return vl;
5120 }
5121 
5122 /* See if there is an svpattern that encodes an SVE predicate of mode
5123    PRED_MODE in which the first VL bits are set and the rest are clear.
5124    Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5125    A VL of -1 indicates an all-true vector.  */
5126 
5127 aarch64_svpattern
5128 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5129 {
5130   if (vl < 0)
5131     return AARCH64_SV_ALL;
5132 
5133   if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5134     return AARCH64_NUM_SVPATTERNS;
5135 
5136   if (vl >= 1 && vl <= 8)
5137     return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5138 
5139   if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5140     return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5141 
5142   int max_vl;
5143   if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5144     {
5145       if (vl == (max_vl / 3) * 3)
5146 	return AARCH64_SV_MUL3;
5147       /* These would only trigger for non-power-of-2 lengths.  */
5148       if (vl == (max_vl & -4))
5149 	return AARCH64_SV_MUL4;
5150       if (vl == (1 << floor_log2 (max_vl)))
5151 	return AARCH64_SV_POW2;
5152       if (vl == max_vl)
5153 	return AARCH64_SV_ALL;
5154     }
5155   return AARCH64_NUM_SVPATTERNS;
5156 }
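
/* For example, with PRED_MODE == VNx16BImode: a VL of -1 maps to
   AARCH64_SV_ALL, a VL of 6 to AARCH64_SV_VL6 and a VL of 16 to
   AARCH64_SV_VL16.  A VL of 9 is not encodable (it is neither in [1, 8],
   nor a power of 2 in [16, 256], nor one of the MUL3/MUL4/POW2/ALL
   counts), so AARCH64_NUM_SVPATTERNS is returned.  */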
5157 
5158 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5159    bits has the lowest bit set and the upper bits clear.  This is the
5160    VNx16BImode equivalent of a PTRUE for controlling elements of
5161    ELT_SIZE bytes.  However, because the constant is VNx16BImode,
5162    all bits are significant, even the upper zeros.  */
5163 
5164 rtx
5165 aarch64_ptrue_all (unsigned int elt_size)
5166 {
5167   rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5168   builder.quick_push (const1_rtx);
5169   for (unsigned int i = 1; i < elt_size; ++i)
5170     builder.quick_push (const0_rtx);
5171   return builder.build ();
5172 }
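
/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BI
   constant {1, 0, 0, 0, 1, 0, 0, 0, ...}, the all-bits-significant view
   of a PTRUE for 4-byte (.S) elements.  */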
5173 
5174 /* Return an all-true predicate register of mode MODE.  */
5175 
5176 rtx
5177 aarch64_ptrue_reg (machine_mode mode)
5178 {
5179   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5180   rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5181   return gen_lowpart (mode, reg);
5182 }
5183 
5184 /* Return an all-false predicate register of mode MODE.  */
5185 
5186 rtx
5187 aarch64_pfalse_reg (machine_mode mode)
5188 {
5189   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5190   rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5191   return gen_lowpart (mode, reg);
5192 }
5193 
5194 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5195    for it.  PRED2[0] is the predicate for the instruction whose result
5196    is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5197    for it.  Return true if we can prove that the two predicates are
5198    equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5199    with PRED1[0] without changing behavior.  */
5200 
5201 bool
5202 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5203 {
5204   machine_mode mode = GET_MODE (pred1[0]);
5205   gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5206 	      && mode == GET_MODE (pred2[0])
5207 	      && aarch64_sve_ptrue_flag (pred1[1], SImode)
5208 	      && aarch64_sve_ptrue_flag (pred2[1], SImode));
5209 
5210   bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5211 		   || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5212   bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5213 		   || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5214   return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5215 }
5216 
5217 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
5218    DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5219    Use TARGET as the target register if nonnull and convenient.  */
5220 
5221 static rtx
5222 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5223 			  machine_mode data_mode, rtx op1, rtx op2)
5224 {
5225   insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5226   expand_operand ops[5];
5227   create_output_operand (&ops[0], target, pred_mode);
5228   create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5229   create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5230   create_input_operand (&ops[3], op1, data_mode);
5231   create_input_operand (&ops[4], op2, data_mode);
5232   expand_insn (icode, 5, ops);
5233   return ops[0].value;
5234 }
5235 
5236 /* Use a comparison to convert integer vector SRC into MODE, which is
5237    the corresponding SVE predicate mode.  Use TARGET for the result
5238    if it's nonnull and convenient.  */
5239 
5240 rtx
5241 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5242 {
5243   machine_mode src_mode = GET_MODE (src);
5244   return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5245 				   src, CONST0_RTX (src_mode));
5246 }
5247 
5248 /* Return the assembly token for svprfop value PRFOP.  */
5249 
5250 static const char *
5251 svprfop_token (enum aarch64_svprfop prfop)
5252 {
5253   switch (prfop)
5254     {
5255 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5256     AARCH64_FOR_SVPRFOP (CASE)
5257 #undef CASE
5258     case AARCH64_NUM_SVPRFOPS:
5259       break;
5260     }
5261   gcc_unreachable ();
5262 }
5263 
5264 /* Return the assembly string for an SVE prefetch operation with
5265    mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5266    and that SUFFIX is the format for the remaining operands.  */
5267 
5268 char *
5269 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5270 			     const char *suffix)
5271 {
5272   static char buffer[128];
5273   aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5274   unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5275 				   mnemonic, svprfop_token (prfop), suffix);
5276   gcc_assert (written < sizeof (buffer));
5277   return buffer;
5278 }
5279 
5280 /* Check whether we can calculate the number of elements in PATTERN
5281    at compile time, given that there are NELTS_PER_VQ elements per
5282    128-bit block.  Return the value if so, otherwise return -1.  */
5283 
5284 HOST_WIDE_INT
5285 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5286 {
5287   unsigned int vl, const_vg;
5288   if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5289     vl = 1 + (pattern - AARCH64_SV_VL1);
5290   else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5291     vl = 16 << (pattern - AARCH64_SV_VL16);
5292   else if (aarch64_sve_vg.is_constant (&const_vg))
5293     {
5294       /* There are two vector granules per quadword.  */
5295       unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5296       switch (pattern)
5297 	{
5298 	case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5299 	case AARCH64_SV_MUL4: return nelts & -4;
5300 	case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5301 	case AARCH64_SV_ALL: return nelts;
5302 	default: gcc_unreachable ();
5303 	}
5304     }
5305   else
5306     return -1;
5307 
5308   /* There are two vector granules per quadword.  */
5309   poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5310   if (known_le (vl, nelts_all))
5311     return vl;
5312 
5313   /* Requesting more elements than are available results in a PFALSE.  */
5314   if (known_gt (vl, nelts_all))
5315     return 0;
5316 
5317   return -1;
5318 }
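
/* For example, with NELTS_PER_VQ == 4 (.S elements), AARCH64_SV_VL4
   folds to 4 for any vector length, since even the minimum-length vector
   has four .S elements.  By contrast, AARCH64_SV_VL16 with
   NELTS_PER_VQ == 2 (.D elements) only folds when the vector length is
   fixed: it then gives 16 if the vector has at least sixteen .D elements
   and 0 (a PFALSE) otherwise, and -1 when the length is unknown.  */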
5319 
5320 /* Return true if we can move VALUE into a register using a single
5321    CNT[BHWD] instruction.  */
5322 
5323 static bool
5324 aarch64_sve_cnt_immediate_p (poly_int64 value)
5325 {
5326   HOST_WIDE_INT factor = value.coeffs[0];
5327   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
5328   return (value.coeffs[1] == factor
5329 	  && IN_RANGE (factor, 2, 16 * 16)
5330 	  && (factor & 1) == 0
5331 	  && factor <= 16 * (factor & -factor));
5332 }
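
/* For example, the poly_int64 (2, 2) -- the number of .D elements in a
   vector -- is accepted (a plain CNTD), as is (256, 256) (CNTB with
   MUL #16, the largest case).  (3, 3) is rejected because the factor is
   odd and (512, 512) because it would need a multiplier above 16.  */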
5333 
5334 /* Likewise for rtx X.  */
5335 
5336 bool
5337 aarch64_sve_cnt_immediate_p (rtx x)
5338 {
5339   poly_int64 value;
5340   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
5341 }
5342 
5343 /* Return the asm string for an instruction with a CNT-like vector size
5344    operand (a vector pattern followed by a multiplier in the range [1, 16]).
5345    PREFIX is the mnemonic without the size suffix and OPERANDS is the
5346    first part of the operands template (the part that comes before the
5347    vector size itself).  PATTERN is the pattern to use.  FACTOR is the
5348    number of quadwords.  NELTS_PER_VQ, if nonzero, is the number of elements
5349    in each quadword.  If it is zero, we can use any element size.  */
5350 
5351 static char *
5352 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5353 				  aarch64_svpattern pattern,
5354 				  unsigned int factor,
5355 				  unsigned int nelts_per_vq)
5356 {
5357   static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5358 
5359   if (nelts_per_vq == 0)
5360     /* There is some overlap in the ranges of the four CNT instructions.
5361        Here we always use the smallest possible element size, so that the
5362        multiplier is 1 wherever possible.  */
5363     nelts_per_vq = factor & -factor;
5364   int shift = std::min (exact_log2 (nelts_per_vq), 4);
5365   gcc_assert (IN_RANGE (shift, 1, 4));
5366   char suffix = "dwhb"[shift - 1];
5367 
5368   factor >>= shift;
5369   unsigned int written;
5370   if (pattern == AARCH64_SV_ALL && factor == 1)
5371     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5372 			prefix, suffix, operands);
5373   else if (factor == 1)
5374     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5375 			prefix, suffix, operands, svpattern_token (pattern));
5376   else
5377     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5378 			prefix, suffix, operands, svpattern_token (pattern),
5379 			factor);
5380   gcc_assert (written < sizeof (buffer));
5381   return buffer;
5382 }
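
/* For example, with OPERANDS == "%x0" and an ALL pattern: a factor of 8
   with NELTS_PER_VQ == 0 prints as "cnth\t%x0" (eight halfwords per
   quadword, multiplier 1), while a factor of 32 prints as
   "cntb\t%x0, all, mul #2", using the smallest element size so that the
   multiplier stays in range.  */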
5383 
5384 /* Return the asm string for an instruction with a CNT-like vector size
5385    operand (a vector pattern followed by a multiplier in the range [1, 16]).
5386    PREFIX is the mnemonic without the size suffix and OPERANDS is the
5387    first part of the operands template (the part that comes before the
5388    vector size itself).  X is the value of the vector size operand,
5389    as a polynomial integer rtx; we need to convert this into an "all"
5390    pattern with a multiplier.  */
5391 
5392 char *
5393 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5394 				  rtx x)
5395 {
5396   poly_int64 value = rtx_to_poly_int64 (x);
5397   gcc_assert (aarch64_sve_cnt_immediate_p (value));
5398   return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5399 					   value.coeffs[1], 0);
5400 }
5401 
5402 /* Return the asm string for an instruction with a CNT-like vector size
5403    operand (a vector pattern followed by a multiplier in the range [1, 16]).
5404    PREFIX is the mnemonic without the size suffix and OPERANDS is the
5405    first part of the operands template (the part that comes before the
5406    vector size itself).  CNT_PAT[0..2] are the operands of the
5407    UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details.  */
5408 
5409 char *
5410 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5411 				      const char *operands, rtx *cnt_pat)
5412 {
5413   aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5414   unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5415   unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5416   return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5417 					   factor, nelts_per_vq);
5418 }
5419 
5420 /* Return true if we can add X using a single SVE INC or DEC instruction.  */
5421 
5422 bool
5423 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5424 {
5425   poly_int64 value;
5426   return (poly_int_rtx_p (x, &value)
5427 	  && (aarch64_sve_cnt_immediate_p (value)
5428 	      || aarch64_sve_cnt_immediate_p (-value)));
5429 }
5430 
5431 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5432    operand 0.  */
5433 
5434 char *
5435 aarch64_output_sve_scalar_inc_dec (rtx offset)
5436 {
5437   poly_int64 offset_value = rtx_to_poly_int64 (offset);
5438   gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5439   if (offset_value.coeffs[1] > 0)
5440     return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5441 					     offset_value.coeffs[1], 0);
5442   else
5443     return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5444 					     -offset_value.coeffs[1], 0);
5445 }
5446 
5447 /* Return true if we can add VALUE to a register using a single ADDVL
5448    or ADDPL instruction.  */
5449 
5450 static bool
5451 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5452 {
5453   HOST_WIDE_INT factor = value.coeffs[0];
5454   if (factor == 0 || value.coeffs[1] != factor)
5455     return false;
5456   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5457      and a value of 16 is one vector width.  */
5458   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5459 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5460 }
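
/* For example, (48, 48) is accepted as ADDVL #3 (three vector lengths)
   and (6, 6) as ADDPL #3 (three predicate lengths), whereas (1, 1) is
   rejected (finer than a predicate width) and (512, 512) is rejected
   (more than 31 vector lengths).  */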
5461 
5462 /* Likewise for rtx X.  */
5463 
5464 bool
5465 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5466 {
5467   poly_int64 value;
5468   return (poly_int_rtx_p (x, &value)
5469 	  && aarch64_sve_addvl_addpl_immediate_p (value));
5470 }
5471 
5472 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5473    to operand 1 and storing the result in operand 0.  */
5474 
5475 char *
5476 aarch64_output_sve_addvl_addpl (rtx offset)
5477 {
5478   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5479   poly_int64 offset_value = rtx_to_poly_int64 (offset);
5480   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5481 
5482   int factor = offset_value.coeffs[1];
5483   if ((factor & 15) == 0)
5484     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5485   else
5486     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5487   return buffer;
5488 }
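
/* For example, an offset of (48, 48) prints as "addvl\t%x0, %x1, #3"
   and an offset of (-6, -6) prints as "addpl\t%x0, %x1, #-3".  */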
5489 
5490 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5491    instruction.  If it is, store the number of elements in each vector
5492    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5493    factor in *FACTOR_OUT (if nonnull).  */
5494 
5495 bool
5496 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5497 					unsigned int *nelts_per_vq_out)
5498 {
5499   rtx elt;
5500   poly_int64 value;
5501 
5502   if (!const_vec_duplicate_p (x, &elt)
5503       || !poly_int_rtx_p (elt, &value))
5504     return false;
5505 
5506   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5507   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5508     /* There's no vector INCB.  */
5509     return false;
5510 
5511   HOST_WIDE_INT factor = value.coeffs[0];
5512   if (value.coeffs[1] != factor)
5513     return false;
5514 
5515   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
5516   if ((factor % nelts_per_vq) != 0
5517       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5518     return false;
5519 
5520   if (factor_out)
5521     *factor_out = factor;
5522   if (nelts_per_vq_out)
5523     *nelts_per_vq_out = nelts_per_vq;
5524   return true;
5525 }
5526 
5527 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5528    instruction.  */
5529 
5530 bool
5531 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5532 {
5533   return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5534 }
5535 
5536 /* Return the asm template for an SVE vector INC or DEC instruction.
5537    OPERANDS gives the operands before the vector count and X is the
5538    value of the vector count operand itself.  */
5539 
5540 char *
5541 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5542 {
5543   int factor;
5544   unsigned int nelts_per_vq;
5545   if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5546     gcc_unreachable ();
5547   if (factor < 0)
5548     return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5549 					     -factor, nelts_per_vq);
5550   else
5551     return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5552 					     factor, nelts_per_vq);
5553 }
5554 
5555 static int
5556 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5557 				scalar_int_mode mode)
5558 {
5559   int i;
5560   unsigned HOST_WIDE_INT val, val2, mask;
5561   int one_match, zero_match;
5562   int num_insns;
5563 
5564   val = INTVAL (imm);
5565 
5566   if (aarch64_move_imm (val, mode))
5567     {
5568       if (generate)
5569 	emit_insn (gen_rtx_SET (dest, imm));
5570       return 1;
5571     }
5572 
5573   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
5574      (with XXXX non-zero). In that case check to see if the move can be done in
5575      a smaller mode.  */
5576   val2 = val & 0xffffffff;
5577   if (mode == DImode
5578       && aarch64_move_imm (val2, SImode)
5579       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
5580     {
5581       if (generate)
5582 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5583 
5584       /* Check if we have to emit a second instruction by checking to see
5585          if any of the upper 32 bits of the original DI mode value is set.  */
5586       if (val == val2)
5587 	return 1;
5588 
5589       i = (val >> 48) ? 48 : 32;
5590 
5591       if (generate)
5592 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5593 				    GEN_INT ((val >> i) & 0xffff)));
5594 
5595       return 2;
5596     }
5597 
5598   if ((val >> 32) == 0 || mode == SImode)
5599     {
5600       if (generate)
5601 	{
5602 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5603 	  if (mode == SImode)
5604 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5605 				       GEN_INT ((val >> 16) & 0xffff)));
5606 	  else
5607 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5608 				       GEN_INT ((val >> 16) & 0xffff)));
5609 	}
5610       return 2;
5611     }
5612 
5613   /* Remaining cases are all for DImode.  */
5614 
5615   mask = 0xffff;
5616   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5617     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5618   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5619     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5620 
5621   if (zero_match != 2 && one_match != 2)
5622     {
5623       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
5624 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
5625 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
5626 	 try using 16 bits from the other 32-bit half of val.  */
5627 
5628       for (i = 0; i < 64; i += 16, mask <<= 16)
5629 	{
5630 	  val2 = val & ~mask;
5631 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
5632 	    break;
5633 	  val2 = val | mask;
5634 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
5635 	    break;
5636 	  val2 = val2 & ~mask;
5637 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
5638 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
5639 	    break;
5640 	}
5641       if (i != 64)
5642 	{
5643 	  if (generate)
5644 	    {
5645 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5646 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5647 					 GEN_INT ((val >> i) & 0xffff)));
5648 	    }
5649 	  return 2;
5650 	}
5651     }
5652 
5653   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5654      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
5655      otherwise skip zero bits.  */
5656 
5657   num_insns = 1;
5658   mask = 0xffff;
5659   val2 = one_match > zero_match ? ~val : val;
5660   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5661 
5662   if (generate)
5663     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5664 					   ? (val | ~(mask << i))
5665 					   : (val & (mask << i)))));
5666   for (i += 16; i < 64; i += 16)
5667     {
5668       if ((val2 & (mask << i)) == 0)
5669 	continue;
5670       if (generate)
5671 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5672 				   GEN_INT ((val >> i) & 0xffff)));
5673       num_insns ++;
5674     }
5675 
5676   return num_insns;
5677 }
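
/* As a worked example of the function above: 0x1234567800000000 is not
   a valid MOVZ, MOVN or bitmask immediate and has two all-zero 16-bit
   chunks, so it takes the final path and is built in two instructions,
   roughly:

	mov	x0, #0x567800000000	// MOVZ of the low nonzero chunk
	movk	x0, #0x1234, lsl #48

   whereas 0x1234567812345678, which has no all-zero or all-ones chunk
   and no usable bitmask variant, needs the full MOV plus three MOVKs.  */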
5678 
5679 /* Return whether imm is a 128-bit immediate which is simple enough to
5680    expand inline.  */
5681 bool
5682 aarch64_mov128_immediate (rtx imm)
5683 {
5684   if (CONST_INT_P (imm))
5685     return true;
5686 
5687   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5688 
5689   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5690   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5691 
5692   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5693 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5694 }
5695 
5696 
5697 /* Return the number of temporary registers that aarch64_add_offset_1
5698    would need to add OFFSET to a register.  */
5699 
5700 static unsigned int
5701 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5702 {
5703   return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5704 }
5705 
5706 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
5707    a non-polynomial OFFSET.  MODE is the mode of the addition.
5708    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5709    be set and CFA adjustments added to the generated instructions.
5710 
5711    TEMP1, if nonnull, is a register of mode MODE that can be used as a
5712    temporary if register allocation is already complete.  This temporary
5713    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
5714    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5715    the immediate again.
5716 
5717    Since this function may be used to adjust the stack pointer, we must
5718    ensure that it cannot cause transient stack deallocation (for example
5719    by first incrementing SP and then decrementing when adjusting by a
5720    large immediate).  */
5721 
5722 static void
5723 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5724 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
5725 		      bool frame_related_p, bool emit_move_imm)
5726 {
5727   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5728   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5729 
5730   unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5731   rtx_insn *insn;
5732 
5733   if (!moffset)
5734     {
5735       if (!rtx_equal_p (dest, src))
5736 	{
5737 	  insn = emit_insn (gen_rtx_SET (dest, src));
5738 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
5739 	}
5740       return;
5741     }
5742 
5743   /* Single instruction adjustment.  */
5744   if (aarch64_uimm12_shift (moffset))
5745     {
5746       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5747       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5748       return;
5749     }
5750 
5751   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5752      and either:
5753 
5754      a) the offset cannot be loaded by a 16-bit move or
5755      b) there is no spare register into which we can move it.  */
5756   if (moffset < 0x1000000
5757       && ((!temp1 && !can_create_pseudo_p ())
5758 	  || !aarch64_move_imm (moffset, mode)))
5759     {
5760       HOST_WIDE_INT low_off = moffset & 0xfff;
5761 
5762       low_off = offset < 0 ? -low_off : low_off;
5763       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5764       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5765       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
5766       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5767       return;
5768     }
5769 
5770   /* Emit a move immediate if required and an addition/subtraction.  */
5771   if (emit_move_imm)
5772     {
5773       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
5774       temp1 = aarch64_force_temporary (mode, temp1,
5775 				       gen_int_mode (moffset, mode));
5776     }
5777   insn = emit_insn (offset < 0
5778 		    ? gen_sub3_insn (dest, src, temp1)
5779 		    : gen_add3_insn (dest, src, temp1));
5780   if (frame_related_p)
5781     {
5782       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5783       rtx adj = plus_constant (mode, src, offset);
5784       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
5785     }
5786 }
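
/* For example, adding 0x123456: the offset fits in 24 bits but is not a
   valid move immediate, so the two-addition path above is taken and
   emits roughly "add dest, src, #0x456" followed by
   "add dest, dest, #0x123, lsl #12"; both adjustments are in the same
   direction, so no transient stack deallocation can occur.  */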
5787 
5788 /* Return the number of temporary registers that aarch64_add_offset
5789    would need to move OFFSET into a register or add OFFSET to a register;
5790    ADD_P is true if we want the latter rather than the former.  */
5791 
5792 static unsigned int
5793 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5794 {
5795   /* This follows the same structure as aarch64_add_offset.  */
5796   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5797     return 0;
5798 
5799   unsigned int count = 0;
5800   HOST_WIDE_INT factor = offset.coeffs[1];
5801   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5802   poly_int64 poly_offset (factor, factor);
5803   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5804     /* Need one register for the ADDVL/ADDPL result.  */
5805     count += 1;
5806   else if (factor != 0)
5807     {
5808       factor = abs (factor);
5809       if (factor > 16 * (factor & -factor))
5810 	/* Need one register for the CNT result and one for the multiplication
5811 	   factor.  If necessary, the second temporary can be reused for the
5812 	   constant part of the offset.  */
5813 	return 2;
5814       /* Need one register for the CNT result (which might then
5815 	 be shifted).  */
5816       count += 1;
5817     }
5818   return count + aarch64_add_offset_1_temporaries (constant);
5819 }
5820 
5821 /* If X can be represented as a poly_int64, return the number
5822    of temporaries that are required to add it to a register.
5823    Return -1 otherwise.  */
5824 
5825 int
5826 aarch64_add_offset_temporaries (rtx x)
5827 {
5828   poly_int64 offset;
5829   if (!poly_int_rtx_p (x, &offset))
5830     return -1;
5831   return aarch64_offset_temporaries (true, offset);
5832 }
5833 
5834 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
5835    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5836    be set and CFA adjustments added to the generated instructions.
5837 
5838    TEMP1, if nonnull, is a register of mode MODE that can be used as a
5839    temporary if register allocation is already complete.  This temporary
5840    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5841    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
5842    false to avoid emitting the immediate again.
5843 
5844    TEMP2, if nonnull, is a second temporary register that doesn't
5845    overlap either DEST or REG.
5846 
5847    Since this function may be used to adjust the stack pointer, we must
5848    ensure that it cannot cause transient stack deallocation (for example
5849    by first incrementing SP and then decrementing when adjusting by a
5850    large immediate).  */
5851 
5852 static void
5853 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5854 		    poly_int64 offset, rtx temp1, rtx temp2,
5855 		    bool frame_related_p, bool emit_move_imm = true)
5856 {
5857   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5858   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5859   gcc_assert (temp1 == NULL_RTX
5860 	      || !frame_related_p
5861 	      || !reg_overlap_mentioned_p (temp1, dest));
5862   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
5863 
5864   /* Try using ADDVL or ADDPL to add the whole value.  */
5865   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
5866     {
5867       rtx offset_rtx = gen_int_mode (offset, mode);
5868       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5869       RTX_FRAME_RELATED_P (insn) = frame_related_p;
5870       return;
5871     }
5872 
5873   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
5874      SVE vector register, over and above the minimum size of 128 bits.
5875      This is equivalent to half the value returned by CNTD with a
5876      vector shape of ALL.  */
5877   HOST_WIDE_INT factor = offset.coeffs[1];
5878   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5879 
5880   /* Try using ADDVL or ADDPL to add the VG-based part.  */
5881   poly_int64 poly_offset (factor, factor);
5882   if (src != const0_rtx
5883       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5884     {
5885       rtx offset_rtx = gen_int_mode (poly_offset, mode);
5886       if (frame_related_p)
5887 	{
5888 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
5889 	  RTX_FRAME_RELATED_P (insn) = true;
5890 	  src = dest;
5891 	}
5892       else
5893 	{
5894 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
5895 	  src = aarch64_force_temporary (mode, temp1, addr);
5896 	  temp1 = temp2;
5897 	  temp2 = NULL_RTX;
5898 	}
5899     }
5900   /* Otherwise use a CNT-based sequence.  */
5901   else if (factor != 0)
5902     {
5903       /* Use a subtraction if we have a negative factor.  */
5904       rtx_code code = PLUS;
5905       if (factor < 0)
5906 	{
5907 	  factor = -factor;
5908 	  code = MINUS;
5909 	}
5910 
5911       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
5912 	 into the multiplication.  */
5913       rtx val;
5914       int shift = 0;
5915       if (factor & 1)
5916 	/* Use a right shift by 1.  */
5917 	shift = -1;
5918       else
5919 	factor /= 2;
5920       HOST_WIDE_INT low_bit = factor & -factor;
5921       if (factor <= 16 * low_bit)
5922 	{
5923 	  if (factor > 16 * 8)
5924 	    {
5925 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
5926 		 the value with the minimum multiplier and shift it into
5927 		 position.  */
5928 	      int extra_shift = exact_log2 (low_bit);
5929 	      shift += extra_shift;
5930 	      factor >>= extra_shift;
5931 	    }
5932 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
5933 	}
5934       else
5935 	{
5936 	  /* Base the factor on LOW_BIT if we can calculate LOW_BIT
5937 	     directly, since that should increase the chances of being
5938 	     able to use a shift and add sequence.  If LOW_BIT itself
5939 	     is out of range, just use CNTD.  */
5940 	  if (low_bit <= 16 * 8)
5941 	    factor /= low_bit;
5942 	  else
5943 	    low_bit = 1;
5944 
5945 	  val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
5946 	  val = aarch64_force_temporary (mode, temp1, val);
5947 
5948 	  if (can_create_pseudo_p ())
5949 	    {
5950 	      rtx coeff1 = gen_int_mode (factor, mode);
5951 	      val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
5952 	    }
5953 	  else
5954 	    {
5955 	      /* Go back to using a negative multiplication factor if we have
5956 		 no register from which to subtract.  */
5957 	      if (code == MINUS && src == const0_rtx)
5958 		{
5959 		  factor = -factor;
5960 		  code = PLUS;
5961 		}
5962 	      rtx coeff1 = gen_int_mode (factor, mode);
5963 	      coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
5964 	      val = gen_rtx_MULT (mode, val, coeff1);
5965 	    }
5966 	}
5967 
5968       if (shift > 0)
5969 	{
5970 	  /* Multiply by 1 << SHIFT.  */
5971 	  val = aarch64_force_temporary (mode, temp1, val);
5972 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5973 	}
5974       else if (shift == -1)
5975 	{
5976 	  /* Divide by 2.  */
5977 	  val = aarch64_force_temporary (mode, temp1, val);
5978 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5979 	}
5980 
5981       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
5982       if (src != const0_rtx)
5983 	{
5984 	  val = aarch64_force_temporary (mode, temp1, val);
5985 	  val = gen_rtx_fmt_ee (code, mode, src, val);
5986 	}
5987       else if (code == MINUS)
5988 	{
5989 	  val = aarch64_force_temporary (mode, temp1, val);
5990 	  val = gen_rtx_NEG (mode, val);
5991 	}
5992 
5993       if (constant == 0 || frame_related_p)
5994 	{
5995 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5996 	  if (frame_related_p)
5997 	    {
5998 	      RTX_FRAME_RELATED_P (insn) = true;
5999 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
6000 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
6001 							      poly_offset)));
6002 	    }
6003 	  src = dest;
6004 	  if (constant == 0)
6005 	    return;
6006 	}
6007       else
6008 	{
6009 	  src = aarch64_force_temporary (mode, temp1, val);
6010 	  temp1 = temp2;
6011 	  temp2 = NULL_RTX;
6012 	}
6013 
6014       emit_move_imm = true;
6015     }
6016 
6017   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6018 			frame_related_p, emit_move_imm);
6019 }
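
/* For example, adding the poly_int64 (32, 16) -- one vector length plus
   16 bytes -- to a register uses a single ADDVL #1 for the VG-based part
   (16, 16) and a single ADD #16 for the constant part.  The CNT-based
   sequence is only needed when the VG multiple is outside the
   ADDVL/ADDPL range or when the offset has to be formed from scratch
   rather than added to a register.  */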
6020 
6021 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6022    than a poly_int64.  */
6023 
6024 void
6025 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6026 			  rtx offset_rtx, rtx temp1, rtx temp2)
6027 {
6028   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6029 		      temp1, temp2, false);
6030 }
6031 
6032 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6033    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
6034    if TEMP1 already contains abs (DELTA).  */
6035 
6036 static inline void
6037 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6038 {
6039   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6040 		      temp1, temp2, true, emit_move_imm);
6041 }
6042 
6043 /* Subtract DELTA from the stack pointer, marking the instructions
6044    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
6045    if nonnull.  */
6046 
6047 static inline void
6048 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6049 		bool emit_move_imm = true)
6050 {
6051   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6052 		      temp1, temp2, frame_related_p, emit_move_imm);
6053 }
6054 
6055 /* Set DEST to (vec_series BASE STEP).  */
6056 
6057 static void
6058 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6059 {
6060   machine_mode mode = GET_MODE (dest);
6061   scalar_mode inner = GET_MODE_INNER (mode);
6062 
6063   /* Each operand can be a register or an immediate in the range [-16, 15].  */
6064   if (!aarch64_sve_index_immediate_p (base))
6065     base = force_reg (inner, base);
6066   if (!aarch64_sve_index_immediate_p (step))
6067     step = force_reg (inner, step);
6068 
6069   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
6070 }
6071 
6072 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6073    register of mode MODE.  Use TARGET for the result if it's nonnull
6074    and convenient.
6075 
6076    The two vector modes must have the same element mode.  The behavior
6077    is to duplicate architectural lane N of SRC into architectural lanes
6078    N + I * STEP of the result.  On big-endian targets, architectural
6079    lane 0 of an Advanced SIMD vector is the last element of the vector
6080    in memory layout, so for big-endian targets this operation has the
6081    effect of reversing SRC before duplicating it.  Callers need to
6082    account for this.  */
6083 
6084 rtx
6085 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6086 {
6087   machine_mode src_mode = GET_MODE (src);
6088   gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6089   insn_code icode = (BYTES_BIG_ENDIAN
6090 		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
6091 		     : code_for_aarch64_vec_duplicate_vq_le (mode));
6092 
6093   unsigned int i = 0;
6094   expand_operand ops[3];
6095   create_output_operand (&ops[i++], target, mode);
6096   create_output_operand (&ops[i++], src, src_mode);
6097   if (BYTES_BIG_ENDIAN)
6098     {
6099       /* Create a PARALLEL describing the reversal of SRC.  */
6100       unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6101       rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6102 						  nelts_per_vq - 1, -1);
6103       create_fixed_operand (&ops[i++], sel);
6104     }
6105   expand_insn (icode, i, ops);
6106   return ops[0].value;
6107 }
6108 
6109 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6110    the memory image into DEST.  Return true on success.  */
6111 
6112 static bool
6113 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6114 {
6115   src = force_const_mem (GET_MODE (src), src);
6116   if (!src)
6117     return false;
6118 
6119   /* Make sure that the address is legitimate.  */
6120   if (!aarch64_sve_ld1rq_operand_p (src))
6121     {
6122       rtx addr = force_reg (Pmode, XEXP (src, 0));
6123       src = replace_equiv_address (src, addr);
6124     }
6125 
6126   machine_mode mode = GET_MODE (dest);
6127   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6128   rtx ptrue = aarch64_ptrue_reg (pred_mode);
6129   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6130   return true;
6131 }
6132 
6133 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6134    by N "background" values.  Try to move it into TARGET using:
6135 
6136       PTRUE PRED.<T>, VL<N>
6137       MOV TRUE.<T>, #<foreground>
6138       MOV FALSE.<T>, #<background>
6139       SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6140 
6141    The PTRUE is always a single instruction but the MOVs might need a
6142    longer sequence.  If the background value is zero (as it often is),
6143    the sequence can sometimes collapse to a PTRUE followed by a
6144    zero-predicated move.
6145 
6146    Return the target on success, otherwise return null.  */
6147 
6148 static rtx
6149 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6150 {
6151   gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6152 
6153   /* Make sure that the PTRUE is valid.  */
6154   machine_mode mode = GET_MODE (src);
6155   machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6156   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6157   if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6158       == AARCH64_NUM_SVPATTERNS)
6159     return NULL_RTX;
6160 
6161   rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6162   rtx_vector_builder true_builder (mode, npatterns, 1);
6163   rtx_vector_builder false_builder (mode, npatterns, 1);
6164   for (unsigned int i = 0; i < npatterns; ++i)
6165     {
6166       true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6167       pred_builder.quick_push (CONST1_RTX (BImode));
6168     }
6169   for (unsigned int i = 0; i < npatterns; ++i)
6170     {
6171       false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6172       pred_builder.quick_push (CONST0_RTX (BImode));
6173     }
6174   expand_operand ops[4];
6175   create_output_operand (&ops[0], target, mode);
6176   create_input_operand (&ops[1], true_builder.build (), mode);
6177   create_input_operand (&ops[2], false_builder.build (), mode);
6178   create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6179   expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6180   return target;
6181 }
6182 
6183 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6184    SVE data mode and isn't a legitimate constant.  Use TARGET for the
6185    result if convenient.
6186 
6187    The returned register can have whatever mode seems most natural
6188    given the contents of SRC.  */
6189 
6190 static rtx
6191 aarch64_expand_sve_const_vector (rtx target, rtx src)
6192 {
6193   machine_mode mode = GET_MODE (src);
6194   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6195   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6196   scalar_mode elt_mode = GET_MODE_INNER (mode);
6197   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6198   unsigned int container_bits = aarch64_sve_container_bits (mode);
6199   unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6200 
6201   if (nelts_per_pattern == 1
6202       && encoded_bits <= 128
6203       && container_bits != elt_bits)
6204     {
6205       /* We have a partial vector mode and a constant whose full-vector
6206 	 equivalent would occupy a repeating 128-bit sequence.  Build that
6207 	 full-vector equivalent instead, so that we have the option of
6208 	 using LD1RQ and Advanced SIMD operations.  */
6209       unsigned int repeat = container_bits / elt_bits;
6210       machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6211       rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6212       for (unsigned int i = 0; i < npatterns; ++i)
6213 	for (unsigned int j = 0; j < repeat; ++j)
6214 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6215       target = aarch64_target_reg (target, full_mode);
6216       return aarch64_expand_sve_const_vector (target, builder.build ());
6217     }
6218 
6219   if (nelts_per_pattern == 1 && encoded_bits == 128)
6220     {
6221       /* The constant is a duplicated quadword but can't be narrowed
6222 	 beyond a quadword.  Get the memory image of the first quadword
6223 	 as a 128-bit vector and try using LD1RQ to load it from memory.
6224 
6225 	 The effect for both endiannesses is to load memory lane N into
6226 	 architectural lanes N + I * STEP of the result.  On big-endian
6227 	 targets, the layout of the 128-bit vector in an Advanced SIMD
6228 	 register would be different from its layout in an SVE register,
6229 	 but this 128-bit vector is a memory value only.  */
6230       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6231       rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6232       if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6233 	return target;
6234     }
6235 
6236   if (nelts_per_pattern == 1 && encoded_bits < 128)
6237     {
6238       /* The vector is a repeating sequence of 64 bits or fewer.
6239 	 See if we can load them using an Advanced SIMD move and then
6240 	 duplicate it to fill a vector.  This is better than using a GPR
6241 	 move because it keeps everything in the same register file.  */
6242       machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6243       rtx_vector_builder builder (vq_mode, npatterns, 1);
6244       for (unsigned int i = 0; i < npatterns; ++i)
6245 	{
6246 	  /* We want memory lane N to go into architectural lane N,
6247 	     so reverse for big-endian targets.  The DUP .Q pattern
6248 	     has a compensating reverse built-in.  */
6249 	  unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6250 	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6251 	}
6252       rtx vq_src = builder.build ();
6253       if (aarch64_simd_valid_immediate (vq_src, NULL))
6254 	{
6255 	  vq_src = force_reg (vq_mode, vq_src);
6256 	  return aarch64_expand_sve_dupq (target, mode, vq_src);
6257 	}
6258 
6259       /* Get an integer representation of the repeating part of Advanced
6260 	 SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
6261 	 which for big-endian targets is lane-swapped wrt a normal
6262 	 Advanced SIMD vector.  This means that for both endiannesses,
6263 	 memory lane N of SVE vector SRC corresponds to architectural
6264 	 lane N of a register holding VQ_SRC.  This in turn means that
6265 	 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6266 	 as a single 128-bit value) and thus that memory lane 0 of SRC is
6267 	 in the lsb of the integer.  Duplicating the integer therefore
6268 	 ensures that memory lane N of SRC goes into architectural lane
6269 	 N + I * INDEX of the SVE register.  */
6270       scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6271       rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6272       if (elt_value)
6273 	{
6274 	  /* Pretend that we had a vector of INT_MODE to start with.  */
6275 	  elt_mode = int_mode;
6276 	  mode = aarch64_full_sve_mode (int_mode).require ();
6277 
6278 	  /* If the integer can be moved into a general register by a
6279 	     single instruction, do that and duplicate the result.  */
6280 	  if (CONST_INT_P (elt_value)
6281 	      && aarch64_move_imm (INTVAL (elt_value), elt_mode))
6282 	    {
6283 	      elt_value = force_reg (elt_mode, elt_value);
6284 	      return expand_vector_broadcast (mode, elt_value);
6285 	    }
6286 	}
6287       else if (npatterns == 1)
6288 	/* We're duplicating a single value, but can't do better than
6289 	   force it to memory and load from there.  This handles things
6290 	   like symbolic constants.  */
6291 	elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6292 
6293       if (elt_value)
6294 	{
6295 	  /* Load the element from memory if we can, otherwise move it into
6296 	     a register and use a DUP.  */
6297 	  rtx op = force_const_mem (elt_mode, elt_value);
6298 	  if (!op)
6299 	    op = force_reg (elt_mode, elt_value);
6300 	  return expand_vector_broadcast (mode, op);
6301 	}
6302     }
6303 
6304   /* Try using INDEX.  */
6305   rtx base, step;
6306   if (const_vec_series_p (src, &base, &step))
6307     {
6308       aarch64_expand_vec_series (target, base, step);
6309       return target;
6310     }
6311 
6312   /* From here on, it's better to force the whole constant to memory
6313      if we can.  */
6314   if (GET_MODE_NUNITS (mode).is_constant ())
6315     return NULL_RTX;
6316 
6317   if (nelts_per_pattern == 2)
6318     if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6319       return res;
6320 
6321   /* Expand each pattern individually.  */
6322   gcc_assert (npatterns > 1);
6323   rtx_vector_builder builder;
6324   auto_vec<rtx, 16> vectors (npatterns);
6325   for (unsigned int i = 0; i < npatterns; ++i)
6326     {
6327       builder.new_vector (mode, 1, nelts_per_pattern);
6328       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6329 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6330       vectors.quick_push (force_reg (mode, builder.build ()));
6331     }
6332 
6333   /* Use permutes to interleave the separate vectors.  */
6334   while (npatterns > 1)
6335     {
6336       npatterns /= 2;
6337       for (unsigned int i = 0; i < npatterns; ++i)
6338 	{
6339 	  rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6340 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6341 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6342 	  vectors[i] = tmp;
6343 	}
6344     }
6345   gcc_assert (vectors[0] == target);
6346   return target;
6347 }
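
/* A minimal standalone sketch, not part of the GCC sources, of the ZIP1
   interleaving loop above.  It uses plain int arrays in place of RTL
   vectors and a hypothetical sketch_zip1 helper.  With four patterns
   holding elements {0,4}, {1,5}, {2,6} and {3,7}, two rounds of zipping
   reconstruct the original order 0..7.  */

/* Interleave A and B element-wise into OUT: { a[0], b[0], a[1], b[1], ... }.  */
static void
sketch_zip1 (const int *a, const int *b, int *out, int n)
{
  for (int i = 0; i < n; ++i)
    {
      out[2 * i] = a[i];
      out[2 * i + 1] = b[i];
    }
}

static void
sketch_interleave_4_patterns (const int pat[4][2], int out[8])
{
  int tmp0[4], tmp1[4];
  /* Round 1: npatterns 4 -> 2, pairing pattern I with pattern I + 2.  */
  sketch_zip1 (pat[0], pat[2], tmp0, 2);
  sketch_zip1 (pat[1], pat[3], tmp1, 2);
  /* Round 2: npatterns 2 -> 1 gives the final interleaved vector.  */
  sketch_zip1 (tmp0, tmp1, out, 4);
}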
6348 
6349 /* Use WHILE to set a predicate register of mode MODE in which the first
6350    VL bits are set and the rest are clear.  Use TARGET for the register
6351    if it's nonnull and convenient.  */
6352 
6353 static rtx
6354 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6355 				 unsigned int vl)
6356 {
6357   rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6358   target = aarch64_target_reg (target, mode);
6359   emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6360 			target, const0_rtx, limit));
6361   return target;
6362 }
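
/* A minimal standalone sketch, not part of the GCC sources, of the
   predicate produced by WHILELO with base 0 and limit VL: lane I is
   active exactly when I < VL, so the first VL lanes are set and the
   rest are clear.  */
static void
sketch_whilelo_mask (unsigned char *pred, unsigned int nlanes,
		     unsigned int vl)
{
  for (unsigned int i = 0; i < nlanes; ++i)
    pred[i] = (i < vl);
}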
6363 
6364 static rtx
6365 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6366 
6367 /* BUILDER is a constant predicate in which the index of every set bit
6368    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
6369    by inverting every element at a multiple of ELT_SIZE and EORing the
6370    result with an ELT_SIZE PTRUE.
6371 
6372    Return a register that contains the constant on success, otherwise
6373    return null.  Use TARGET as the register if it is nonnull and
6374    convenient.  */
6375 
6376 static rtx
6377 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6378 				   unsigned int elt_size)
6379 {
6380   /* Invert every element at a multiple of ELT_SIZE, keeping the
6381      other bits zero.  */
6382   rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6383 				  builder.nelts_per_pattern ());
6384   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6385     if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6386       inv_builder.quick_push (const1_rtx);
6387     else
6388       inv_builder.quick_push (const0_rtx);
6389   inv_builder.finalize ();
6390 
6391   /* See if we can load the constant cheaply.  */
6392   rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6393   if (!inv)
6394     return NULL_RTX;
6395 
6396   /* EOR the result with an ELT_SIZE PTRUE.  */
6397   rtx mask = aarch64_ptrue_all (elt_size);
6398   mask = force_reg (VNx16BImode, mask);
6399   inv = gen_lowpart (VNx16BImode, inv);
6400   target = aarch64_target_reg (target, VNx16BImode);
6401   emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6402   return target;
6403 }
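
/* A minimal standalone bit-level sketch, not part of the GCC sources, of
   the identity used above.  INV holds the inverse of the original
   constant at every multiple of ELT_SIZE and zeros elsewhere; EORing it
   with a PTRUE that is set at every multiple of ELT_SIZE recovers the
   original constant, given that every set bit of the original is at a
   multiple of ELT_SIZE.  */
static void
sketch_pred_eor (const unsigned char *orig_bits, unsigned char *out,
		 unsigned int nbits, unsigned int elt_size)
{
  for (unsigned int i = 0; i < nbits; ++i)
    {
      unsigned char ptrue = (i % elt_size == 0);
      unsigned char inv = ptrue && !orig_bits[i];
      out[i] = inv ^ ptrue;	/* Equals orig_bits[i] under the precondition.  */
    }
}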
6404 
6405 /* BUILDER is a constant predicate in which the index of every set bit
6406    is a multiple of ELT_SIZE (which is <= 8).  Try to load the constant
6407    using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE.  Return the
6408    register on success, otherwise return null.  Use TARGET as the register
6409    if nonnull and convenient.  */
6410 
6411 static rtx
6412 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6413 				   unsigned int elt_size,
6414 				   unsigned int permute_size)
6415 {
6416   /* We're going to split the constant into two new constants A and B,
6417      with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6418      and into B otherwise.  E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6419 
6420      A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6421      B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6422 
6423      where _ indicates elements that will be discarded by the permute.
6424 
6425      First calculate the ELT_SIZEs for A and B.  */
6426   unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6427   unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6428   for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6429     if (INTVAL (builder.elt (i)) != 0)
6430       {
6431 	if (i & permute_size)
6432 	  b_elt_size |= i - permute_size;
6433 	else
6434 	  a_elt_size |= i;
6435       }
6436   a_elt_size &= -a_elt_size;
6437   b_elt_size &= -b_elt_size;
6438 
6439   /* Now construct the vectors themselves.  */
6440   rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6441 				builder.nelts_per_pattern ());
6442   rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6443 				builder.nelts_per_pattern ());
6444   unsigned int nelts = builder.encoded_nelts ();
6445   for (unsigned int i = 0; i < nelts; ++i)
6446     if (i & (elt_size - 1))
6447       {
6448 	a_builder.quick_push (const0_rtx);
6449 	b_builder.quick_push (const0_rtx);
6450       }
6451     else if ((i & permute_size) == 0)
6452       {
6453 	/* The A and B elements are significant.  */
6454 	a_builder.quick_push (builder.elt (i));
6455 	b_builder.quick_push (builder.elt (i + permute_size));
6456       }
6457     else
6458       {
6459 	/* The A and B elements are going to be discarded, so pick whatever
6460 	   is likely to give a nice constant.  We are targeting element
6461 	   sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6462 	   with the aim of each being a sequence of ones followed by
6463 	   a sequence of zeros.  So:
6464 
6465 	   * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6466 	     duplicate the last X_ELT_SIZE element, to extend the
6467 	     current sequence of ones or zeros.
6468 
6469 	   * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6470 	     zero, so that the constant really does have X_ELT_SIZE and
6471 	     not a smaller size.  */
6472 	if (a_elt_size > permute_size)
6473 	  a_builder.quick_push (const0_rtx);
6474 	else
6475 	  a_builder.quick_push (a_builder.elt (i - a_elt_size));
6476 	if (b_elt_size > permute_size)
6477 	  b_builder.quick_push (const0_rtx);
6478 	else
6479 	  b_builder.quick_push (b_builder.elt (i - b_elt_size));
6480       }
6481   a_builder.finalize ();
6482   b_builder.finalize ();
6483 
6484   /* Try loading A into a register.  */
6485   rtx_insn *last = get_last_insn ();
6486   rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6487   if (!a)
6488     return NULL_RTX;
6489 
6490   /* Try loading B into a register.  */
6491   rtx b = a;
6492   if (a_builder != b_builder)
6493     {
6494       b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6495       if (!b)
6496 	{
6497 	  delete_insns_since (last);
6498 	  return NULL_RTX;
6499 	}
6500     }
6501 
6502   /* Emit the TRN1 itself.  We emit a TRN that operates on VNx16BI
6503      operands but permutes them as though they had mode MODE.  */
6504   machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6505   target = aarch64_target_reg (target, GET_MODE (a));
6506   rtx type_reg = CONST0_RTX (mode);
6507   emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6508   return target;
6509 }
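
/* A minimal standalone sketch, not part of the GCC sources, of the bit
   trick used above to pick the A and B element sizes: OR together the
   offsets of all significant elements (seeded with the 64-bit container
   size) and keep only the lowest set bit via X &= -X.  The result is the
   largest power of two that divides every offset, i.e. the coarsest
   element size the new constant still supports.  */
static unsigned int
sketch_common_power_of_two (const unsigned int *offsets, unsigned int n)
{
  unsigned int acc = 8;		/* GET_MODE_SIZE (DImode) seed.  */
  for (unsigned int i = 0; i < n; ++i)
    acc |= offsets[i];
  return acc & -acc;		/* Isolate the lowest set bit.  */
}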
6510 
6511 /* Subroutine of aarch64_expand_sve_const_pred.  Try to load the VNx16BI
6512    constant in BUILDER into an SVE predicate register.  Return the register
6513    on success, otherwise return null.  Use TARGET for the register if
6514    nonnull and convenient.
6515 
6516    ALLOW_RECURSE_P is true if we can use methods that would call this
6517    function recursively.  */
6518 
6519 static rtx
6520 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6521 				 bool allow_recurse_p)
6522 {
6523   if (builder.encoded_nelts () == 1)
6524     /* A PFALSE or a PTRUE .B ALL.  */
6525     return aarch64_emit_set_immediate (target, builder);
6526 
6527   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6528   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6529     {
6530       /* If we can load the constant using PTRUE, use it as-is.  */
6531       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6532       if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6533 	return aarch64_emit_set_immediate (target, builder);
6534 
6535       /* Otherwise use WHILE to set the first VL bits.  */
6536       return aarch64_sve_move_pred_via_while (target, mode, vl);
6537     }
6538 
6539   if (!allow_recurse_p)
6540     return NULL_RTX;
6541 
6542   /* Try inverting the vector in element size ELT_SIZE and then EORing
6543      the result with an ELT_SIZE PTRUE.  */
6544   if (INTVAL (builder.elt (0)) == 0)
6545     if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6546 						     elt_size))
6547       return res;
6548 
6549   /* Try using TRN1 to permute two simpler constants.  */
6550   for (unsigned int i = elt_size; i <= 8; i *= 2)
6551     if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6552 						     elt_size, i))
6553       return res;
6554 
6555   return NULL_RTX;
6556 }
6557 
6558 /* Return an SVE predicate register that contains the VNx16BImode
6559    constant in BUILDER, without going through the move expanders.
6560 
6561    The returned register can have whatever mode seems most natural
6562    given the contents of BUILDER.  Use TARGET for the result if
6563    convenient.  */
6564 
6565 static rtx
6566 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6567 {
6568   /* Try loading the constant using pure predicate operations.  */
6569   if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6570     return res;
6571 
6572   /* Try forcing the constant to memory.  */
6573   if (builder.full_nelts ().is_constant ())
6574     if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6575       {
6576 	target = aarch64_target_reg (target, VNx16BImode);
6577 	emit_move_insn (target, mem);
6578 	return target;
6579       }
6580 
6581   /* The last resort is to load the constant as an integer and then
6582      compare it against zero.  Use -1 for set bits in order to increase
6583      the chances of using SVE DUPM or an Advanced SIMD byte mask.  */
6584   rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6585 				  builder.nelts_per_pattern ());
6586   for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6587     int_builder.quick_push (INTVAL (builder.elt (i))
6588 			    ? constm1_rtx : const0_rtx);
6589   return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6590 					   int_builder.build ());
6591 }
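
/* A minimal standalone sketch, not part of the GCC sources, of the final
   fallback above: materialise the predicate as a byte vector of -1/0
   values (which DUPM or an Advanced SIMD byte mask can often load
   cheaply) and then derive the predicate by comparing each byte against
   zero.  */
static void
sketch_pred_from_bytes (const unsigned char *bits, signed char *bytes,
			unsigned char *pred, unsigned int n)
{
  for (unsigned int i = 0; i < n; ++i)
    bytes[i] = bits[i] ? -1 : 0;
  for (unsigned int i = 0; i < n; ++i)
    pred[i] = (bytes[i] != 0);
}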
6592 
6593 /* Set DEST to immediate IMM.  */
6594 
6595 void
6596 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6597 {
6598   machine_mode mode = GET_MODE (dest);
6599 
6600   /* Check on what type of symbol it is.  */
6601   scalar_int_mode int_mode;
6602   if ((SYMBOL_REF_P (imm)
6603        || LABEL_REF_P (imm)
6604        || GET_CODE (imm) == CONST
6605        || GET_CODE (imm) == CONST_POLY_INT)
6606       && is_a <scalar_int_mode> (mode, &int_mode))
6607     {
6608       rtx mem;
6609       poly_int64 offset;
6610       HOST_WIDE_INT const_offset;
6611       enum aarch64_symbol_type sty;
6612 
6613       /* If we have (const (plus symbol offset)), separate out the offset
6614 	 before we start classifying the symbol.  */
6615       rtx base = strip_offset (imm, &offset);
6616 
6617       /* We must always add an offset involving VL separately, rather than
6618 	 folding it into the relocation.  */
6619       if (!offset.is_constant (&const_offset))
6620 	{
6621 	  if (!TARGET_SVE)
6622 	    {
6623 	      aarch64_report_sve_required ();
6624 	      return;
6625 	    }
6626 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6627 	    emit_insn (gen_rtx_SET (dest, imm));
6628 	  else
6629 	    {
6630 	      /* Do arithmetic on 32-bit values if the result is smaller
6631 		 than that.  */
6632 	      if (partial_subreg_p (int_mode, SImode))
6633 		{
6634 		  /* It is invalid to do symbol calculations in modes
6635 		     narrower than SImode.  */
6636 		  gcc_assert (base == const0_rtx);
6637 		  dest = gen_lowpart (SImode, dest);
6638 		  int_mode = SImode;
6639 		}
6640 	      if (base != const0_rtx)
6641 		{
6642 		  base = aarch64_force_temporary (int_mode, dest, base);
6643 		  aarch64_add_offset (int_mode, dest, base, offset,
6644 				      NULL_RTX, NULL_RTX, false);
6645 		}
6646 	      else
6647 		aarch64_add_offset (int_mode, dest, base, offset,
6648 				    dest, NULL_RTX, false);
6649 	    }
6650 	  return;
6651 	}
6652 
6653       sty = aarch64_classify_symbol (base, const_offset);
6654       switch (sty)
6655 	{
6656 	case SYMBOL_FORCE_TO_MEM:
6657 	  if (int_mode != ptr_mode)
6658 	    imm = convert_memory_address (ptr_mode, imm);
6659 
6660 	  if (const_offset != 0
6661 	      && targetm.cannot_force_const_mem (ptr_mode, imm))
6662 	    {
6663 	      gcc_assert (can_create_pseudo_p ());
6664 	      base = aarch64_force_temporary (int_mode, dest, base);
6665 	      aarch64_add_offset (int_mode, dest, base, const_offset,
6666 				  NULL_RTX, NULL_RTX, false);
6667 	      return;
6668 	    }
6669 
6670 	  mem = force_const_mem (ptr_mode, imm);
6671 	  gcc_assert (mem);
6672 
6673 	  /* If we aren't generating PC relative literals, then
6674 	     we need to expand the literal pool access carefully.
6675 	     This is something that needs to be done in a number
6676 	     of places, so could well live as a separate function.  */
6677 	  if (!aarch64_pcrelative_literal_loads)
6678 	    {
6679 	      gcc_assert (can_create_pseudo_p ());
6680 	      base = gen_reg_rtx (ptr_mode);
6681 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6682 	      if (ptr_mode != Pmode)
6683 		base = convert_memory_address (Pmode, base);
6684 	      mem = gen_rtx_MEM (ptr_mode, base);
6685 	    }
6686 
6687 	  if (int_mode != ptr_mode)
6688 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6689 
6690 	  emit_insn (gen_rtx_SET (dest, mem));
6691 
6692 	  return;
6693 
6694         case SYMBOL_SMALL_TLSGD:
6695         case SYMBOL_SMALL_TLSDESC:
6696 	case SYMBOL_SMALL_TLSIE:
6697 	case SYMBOL_SMALL_GOT_28K:
6698 	case SYMBOL_SMALL_GOT_4G:
6699 	case SYMBOL_TINY_GOT:
6700 	case SYMBOL_TINY_TLSIE:
6701 	  if (const_offset != 0)
6702 	    {
6703 	      gcc_assert (can_create_pseudo_p ());
6704 	      base = aarch64_force_temporary (int_mode, dest, base);
6705 	      aarch64_add_offset (int_mode, dest, base, const_offset,
6706 				  NULL_RTX, NULL_RTX, false);
6707 	      return;
6708 	    }
6709 	  /* FALLTHRU */
6710 
6711 	case SYMBOL_SMALL_ABSOLUTE:
6712 	case SYMBOL_TINY_ABSOLUTE:
6713 	case SYMBOL_TLSLE12:
6714 	case SYMBOL_TLSLE24:
6715 	case SYMBOL_TLSLE32:
6716 	case SYMBOL_TLSLE48:
6717 	  aarch64_load_symref_appropriately (dest, imm, sty);
6718 	  return;
6719 
6720 	default:
6721 	  gcc_unreachable ();
6722 	}
6723     }
6724 
6725   if (!CONST_INT_P (imm))
6726     {
6727       if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6728 	{
6729 	  /* Only the low bit of each .H, .S and .D element is defined,
6730 	     so we can set the upper bits to whatever we like.  If the
6731 	     predicate is all-true in MODE, prefer to set all the undefined
6732 	     bits as well, so that we can share a single .B predicate for
6733 	     all modes.  */
6734 	  if (imm == CONSTM1_RTX (mode))
6735 	    imm = CONSTM1_RTX (VNx16BImode);
6736 
6737 	  /* All methods for constructing predicate modes wider than VNx16BI
6738 	     will set the upper bits of each element to zero.  Expose this
6739 	     by moving such constants as a VNx16BI, so that all bits are
6740 	     significant and so that constants for different modes can be
6741 	     shared.  The wider constant will still be available as a
6742 	     REG_EQUAL note.  */
6743 	  rtx_vector_builder builder;
6744 	  if (aarch64_get_sve_pred_bits (builder, imm))
6745 	    {
6746 	      rtx res = aarch64_expand_sve_const_pred (dest, builder);
6747 	      if (dest != res)
6748 		emit_move_insn (dest, gen_lowpart (mode, res));
6749 	      return;
6750 	    }
6751 	}
6752 
6753       if (GET_CODE (imm) == HIGH
6754 	  || aarch64_simd_valid_immediate (imm, NULL))
6755 	{
6756 	  emit_insn (gen_rtx_SET (dest, imm));
6757 	  return;
6758 	}
6759 
6760       if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6761 	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6762 	  {
6763 	    if (dest != res)
6764 	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6765 	    return;
6766 	  }
6767 
6768       rtx mem = force_const_mem (mode, imm);
6769       gcc_assert (mem);
6770       emit_move_insn (dest, mem);
6771       return;
6772     }
6773 
6774   aarch64_internal_mov_immediate (dest, imm, true,
6775 				  as_a <scalar_int_mode> (mode));
6776 }
6777 
6778 /* Return the MEM rtx that provides the canary value that should be used
6779    for stack-smashing protection.  MODE is the mode of the memory.
6780    For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6781    (__stack_chk_guard), otherwise it has no useful value.  SALT_TYPE
6782    indicates whether the caller is performing a SET or a TEST operation.  */
6783 
6784 rtx
6785 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6786 				  aarch64_salt_type salt_type)
6787 {
6788   rtx addr;
6789   if (aarch64_stack_protector_guard == SSP_GLOBAL)
6790     {
6791       gcc_assert (MEM_P (decl_rtl));
6792       addr = XEXP (decl_rtl, 0);
6793       poly_int64 offset;
6794       rtx base = strip_offset_and_salt (addr, &offset);
6795       if (!SYMBOL_REF_P (base))
6796 	return decl_rtl;
6797 
6798       rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6799       addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6800       addr = gen_rtx_CONST (Pmode, addr);
6801       addr = plus_constant (Pmode, addr, offset);
6802     }
6803   else
6804     {
6805       /* Calculate the address from the system register.  */
6806       rtx salt = GEN_INT (salt_type);
6807       addr = gen_reg_rtx (mode);
6808       if (mode == DImode)
6809 	emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6810       else
6811 	{
6812 	  emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6813 	  addr = convert_memory_address (Pmode, addr);
6814 	}
6815       addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6816     }
6817   return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6818 }
6819 
6820 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
6821    that is known to contain PTRUE.  */
6822 
6823 void
6824 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6825 {
6826   expand_operand ops[3];
6827   machine_mode mode = GET_MODE (dest);
6828   create_output_operand (&ops[0], dest, mode);
6829   create_input_operand (&ops[1], pred, GET_MODE (pred));
6830   create_input_operand (&ops[2], src, mode);
6831   temporary_volatile_ok v (true);
6832   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6833 }
6834 
6835 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6836    operand is in memory.  In this case we need to use the predicated LD1
6837    and ST1 instead of LDR and STR, both for correctness on big-endian
6838    targets and because LD1 and ST1 support a wider range of addressing modes.
6839    PRED_MODE is the mode of the predicate.
6840 
6841    See the comment at the head of aarch64-sve.md for details about the
6842    big-endian handling.  */
6843 
6844 void
6845 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6846 {
6847   machine_mode mode = GET_MODE (dest);
6848   rtx ptrue = aarch64_ptrue_reg (pred_mode);
6849   if (!register_operand (src, mode)
6850       && !register_operand (dest, mode))
6851     {
6852       rtx tmp = gen_reg_rtx (mode);
6853       if (MEM_P (src))
6854 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
6855       else
6856 	emit_move_insn (tmp, src);
6857       src = tmp;
6858     }
6859   aarch64_emit_sve_pred_move (dest, ptrue, src);
6860 }
6861 
6862 /* Called only on big-endian targets.  See whether an SVE vector move
6863    from SRC to DEST is effectively a REV[BHW] instruction, because at
6864    least one operand is a subreg of an SVE vector that has wider or
6865    narrower elements.  Return true and emit the instruction if so.
6866 
6867    For example:
6868 
6869      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6870 
6871    represents a VIEW_CONVERT between the following vectors, viewed
6872    in memory order:
6873 
6874      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
6875      R1: { [0],      [1],      [2],      [3],     ... }
6876 
6877    The high part of lane X in R2 should therefore correspond to lane X*2
6878    of R1, but the register representations are:
6879 
6880          msb                                      lsb
6881      R2: ...... [1].high  [1].low   [0].high  [0].low
6882      R1: ...... [3]       [2]       [1]       [0]
6883 
6884    where the low part of lane X in R2 corresponds to lane X*2 in R1.
6885    We therefore need a reverse operation to swap the high and low values
6886    around.
6887 
6888    This is purely an optimization.  Without it we would spill the
6889    subreg operand to the stack in one mode and reload it in the
6890    other mode, which has the same effect as the REV.  */
6891 
6892 bool
6893 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6894 {
6895   gcc_assert (BYTES_BIG_ENDIAN);
6896 
6897   /* Do not try to optimize subregs that LRA has created for matched
6898      reloads.  These subregs only exist as a temporary measure to make
6899      the RTL well-formed, but they are exempt from the usual
6900      TARGET_CAN_CHANGE_MODE_CLASS rules.
6901 
6902      For example, if we have:
6903 
6904        (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6905 
6906      and the constraints require R1 and R2 to be in the same register,
6907      LRA may need to create RTL such as:
6908 
6909        (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6910        (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6911        (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6912 
6913      which forces both the input and output of the original instruction
6914      to use the same hard register.  But for this to work, the normal
6915      rules have to be suppressed on the subreg input, otherwise LRA
6916      would need to reload that input too, meaning that the process
6917      would never terminate.  To compensate for this, the normal rules
6918      are also suppressed for the subreg output of the first move.
6919      Ignoring the special case and handling the first move normally
6920      would therefore generate wrong code: we would reverse the elements
6921      for the first subreg but not reverse them back for the second subreg.  */
6922   if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6923     dest = SUBREG_REG (dest);
6924   if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6925     src = SUBREG_REG (src);
6926 
6927   /* The optimization handles two single SVE REGs with different element
6928      sizes.  */
6929   if (!REG_P (dest)
6930       || !REG_P (src)
6931       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6932       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6933       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6934 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6935     return false;
6936 
6937   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
6938   rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6939   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6940 			       UNSPEC_REV_SUBREG);
6941   emit_insn (gen_rtx_SET (dest, unspec));
6942   return true;
6943 }
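
/* A minimal standalone sketch, not part of the GCC sources, of the REV
   effect described above for the VNx8HI <-> VNx16QI case on a
   big-endian target: within every 16-bit container the two byte lanes
   swap places, which is what REVB on .H containers does.  */
static void
sketch_revb_within_halfwords (const unsigned char *src, unsigned char *dst,
			      unsigned int nbytes)
{
  for (unsigned int i = 0; i + 1 < nbytes; i += 2)
    {
      dst[i] = src[i + 1];
      dst[i + 1] = src[i];
    }
}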
6944 
6945 /* Return a copy of X with mode MODE, without changing its other
6946    attributes.  Unlike gen_lowpart, this doesn't care whether the
6947    mode change is valid.  */
6948 
6949 rtx
6950 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6951 {
6952   if (GET_MODE (x) == mode)
6953     return x;
6954 
6955   x = shallow_copy_rtx (x);
6956   set_mode_and_regno (x, mode, REGNO (x));
6957   return x;
6958 }
6959 
6960 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6961    stored in wider integer containers.  */
6962 
6963 static unsigned int
6964 aarch64_sve_rev_unspec (machine_mode mode)
6965 {
6966   switch (GET_MODE_UNIT_SIZE (mode))
6967     {
6968     case 1: return UNSPEC_REVB;
6969     case 2: return UNSPEC_REVH;
6970     case 4: return UNSPEC_REVW;
6971     }
6972   gcc_unreachable ();
6973 }
6974 
6975 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6976    operands.  */
6977 
6978 void
6979 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6980 {
6981   /* Decide which REV operation we need.  The mode with wider elements
6982      determines the mode of the operands and the mode with the narrower
6983      elements determines the reverse width.  */
6984   machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6985   machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6986   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6987       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6988     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6989 
6990   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6991   machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6992 
6993   /* Get the operands in the appropriate modes and emit the instruction.  */
6994   ptrue = gen_lowpart (pred_mode, ptrue);
6995   dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6996   src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6997   emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6998 			       dest, ptrue, src));
6999 }
7000 
7001 static bool
7002 aarch64_function_ok_for_sibcall (tree, tree exp)
7003 {
7004   if (crtl->abi->id () != expr_callee_abi (exp).id ())
7005     return false;
7006 
7007   return true;
7008 }
7009 
7010 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7011    passed in SVE registers.  */
7012 
7013 static bool
7014 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7015 			     const function_arg_info &arg)
7016 {
7017   HOST_WIDE_INT size;
7018   machine_mode dummymode;
7019   int nregs;
7020 
7021   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
7022   if (arg.mode == BLKmode && arg.type)
7023     size = int_size_in_bytes (arg.type);
7024   else
7025     /* No frontends can create types with variable-sized modes, so we
7026        shouldn't be asked to pass or return them.  */
7027     size = GET_MODE_SIZE (arg.mode).to_constant ();
7028 
7029   /* Aggregates are passed by reference based on their size.  */
7030   if (arg.aggregate_type_p ())
7031     size = int_size_in_bytes (arg.type);
7032 
7033   /* Variable sized arguments are always returned by reference.  */
7034   /* Variable-sized arguments are always passed by reference.  */
7035     return true;
7036 
7037   /* Can this be a candidate to be passed in fp/simd register(s)?  */
7038   if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7039 					       &dummymode, &nregs, NULL,
7040 					       !pcum || pcum->silent_p))
7041     return false;
7042 
7043   /* Arguments that are variable-sized or larger than 2 registers are
7044      passed by reference unless they are a homogeneous floating-point
7045      aggregate.  */
7046   return size > 2 * UNITS_PER_WORD;
7047 }
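
/* A minimal standalone sketch, not part of the GCC sources, of the core
   rule above, assuming the AArch64 UNITS_PER_WORD of 8 bytes and a
   hypothetical is_hfa_or_hva flag standing in for
   aarch64_vfp_is_call_or_return_candidate.  */
static int
sketch_pass_by_reference (long size, int is_hfa_or_hva)
{
  if (size < 0)			/* Variable-sized: always by reference.  */
    return 1;
  if (is_hfa_or_hva)		/* Candidates for FP/SIMD registers.  */
    return 0;
  return size > 2 * 8;		/* Larger than two GP registers.  */
}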
7048 
7049 /* Implement TARGET_PASS_BY_REFERENCE.  */
7050 
7051 static bool
7052 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7053 			   const function_arg_info &arg)
7054 {
7055   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7056 
7057   if (!arg.type)
7058     return aarch64_pass_by_reference_1 (pcum, arg);
7059 
7060   pure_scalable_type_info pst_info;
7061   switch (pst_info.analyze (arg.type))
7062     {
7063     case pure_scalable_type_info::IS_PST:
7064       if (pcum && !pcum->silent_p && !TARGET_SVE)
7065 	/* We can't gracefully recover at this point, so make this a
7066 	   fatal error.  */
7067 	fatal_error (input_location, "arguments of type %qT require"
7068 		     " the SVE ISA extension", arg.type);
7069 
7070       /* Variadic SVE types are passed by reference.  Normal non-variadic
7071 	 arguments are too if we've run out of registers.  */
7072       return (!arg.named
7073 	      || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7074 	      || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7075 
7076     case pure_scalable_type_info::DOESNT_MATTER:
7077       gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7078       return true;
7079 
7080     case pure_scalable_type_info::NO_ABI_IDENTITY:
7081     case pure_scalable_type_info::ISNT_PST:
7082       return aarch64_pass_by_reference_1 (pcum, arg);
7083     }
7084   gcc_unreachable ();
7085 }
7086 
7087 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
7088 static bool
7089 aarch64_return_in_msb (const_tree valtype)
7090 {
7091   machine_mode dummy_mode;
7092   int dummy_int;
7093 
7094   /* Never happens in little-endian mode.  */
7095   if (!BYTES_BIG_ENDIAN)
7096     return false;
7097 
7098   /* Only composite types smaller than or equal to 16 bytes can
7099      be potentially returned in registers.  */
7100   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7101       || int_size_in_bytes (valtype) <= 0
7102       || int_size_in_bytes (valtype) > 16)
7103     return false;
7104 
7105   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7106      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7107      is always passed/returned in the least significant bits of fp/simd
7108      register(s).  */
7109   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7110 					       &dummy_mode, &dummy_int, NULL,
7111 					       false))
7112     return false;
7113 
7114   /* Likewise pure scalable types for SVE vector and predicate registers.  */
7115   pure_scalable_type_info pst_info;
7116   if (pst_info.analyze_registers (valtype))
7117     return false;
7118 
7119   return true;
7120 }
7121 
7122 /* Implement TARGET_FUNCTION_VALUE.
7123    Define how to find the value returned by a function.  */
7124 
7125 static rtx
7126 aarch64_function_value (const_tree type, const_tree func,
7127 			bool outgoing ATTRIBUTE_UNUSED)
7128 {
7129   machine_mode mode;
7130   int unsignedp;
7131 
7132   mode = TYPE_MODE (type);
7133   if (INTEGRAL_TYPE_P (type))
7134     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7135 
7136   pure_scalable_type_info pst_info;
7137   if (type && pst_info.analyze_registers (type))
7138     return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7139 
7140   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7141      are returned in memory, not by value.  */
7142   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7143   bool sve_p = (vec_flags & VEC_ANY_SVE);
7144 
7145   if (aarch64_return_in_msb (type))
7146     {
7147       HOST_WIDE_INT size = int_size_in_bytes (type);
7148 
7149       if (size % UNITS_PER_WORD != 0)
7150 	{
7151 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7152 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7153 	}
7154     }
7155 
7156   int count;
7157   machine_mode ag_mode;
7158   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7159 					       NULL, false))
7160     {
7161       gcc_assert (!sve_p);
7162       if (!aarch64_composite_type_p (type, mode))
7163 	{
7164 	  gcc_assert (count == 1 && mode == ag_mode);
7165 	  return gen_rtx_REG (mode, V0_REGNUM);
7166 	}
7167       else if (aarch64_advsimd_full_struct_mode_p (mode)
7168 	       && known_eq (GET_MODE_SIZE (ag_mode), 16))
7169 	return gen_rtx_REG (mode, V0_REGNUM);
7170       else if (aarch64_advsimd_partial_struct_mode_p (mode)
7171 	       && known_eq (GET_MODE_SIZE (ag_mode), 8))
7172 	return gen_rtx_REG (mode, V0_REGNUM);
7173       else
7174 	{
7175 	  int i;
7176 	  rtx par;
7177 
7178 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7179 	  for (i = 0; i < count; i++)
7180 	    {
7181 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7182 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7183 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7184 	      XVECEXP (par, 0, i) = tmp;
7185 	    }
7186 	  return par;
7187 	}
7188     }
7189   else
7190     {
7191       if (sve_p)
7192 	{
7193 	  /* Vector types can acquire a partial SVE mode using things like
7194 	     __attribute__((vector_size(N))), and this is potentially useful.
7195 	     However, the choice of mode doesn't affect the type's ABI
7196 	     identity, so we should treat the types as though they had
7197 	     the associated integer mode, just like they did before SVE
7198 	     was introduced.
7199 
7200 	     We know that the vector must be 128 bits or smaller,
7201 	     otherwise we'd have returned it in memory instead.  */
7202 	  gcc_assert (type
7203 		      && (aarch64_some_values_include_pst_objects_p (type)
7204 			  || (vec_flags & VEC_PARTIAL)));
7205 
7206 	  scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7207 	  rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7208 	  rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7209 	  return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7210 	}
7211       return gen_rtx_REG (mode, R0_REGNUM);
7212     }
7213 }
7214 
7215 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7216    Return true if REGNO is the number of a hard register in which the values
7217    of called function may come back.  */
7218 
7219 static bool
7220 aarch64_function_value_regno_p (const unsigned int regno)
7221 {
7222   /* Maximum of 16 bytes can be returned in the general registers.  Examples
7223      of 16-byte return values are: 128-bit integers and 16-byte small
7224      structures (excluding homogeneous floating-point aggregates).  */
7225   if (regno == R0_REGNUM || regno == R1_REGNUM)
7226     return true;
7227 
7228   /* Up to four fp/simd registers can return a function value, e.g. a
7229      homogeneous floating-point aggregate having four members.  */
7230   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7231     return TARGET_FLOAT;
7232 
7233   return false;
7234 }
7235 
7236 /* Subroutine for aarch64_return_in_memory for types that are not returned
7237    in SVE registers.  */
7238 
7239 static bool
7240 aarch64_return_in_memory_1 (const_tree type)
7241 {
7242   HOST_WIDE_INT size;
7243   machine_mode ag_mode;
7244   int count;
7245 
7246   if (!AGGREGATE_TYPE_P (type)
7247       && TREE_CODE (type) != COMPLEX_TYPE
7248       && TREE_CODE (type) != VECTOR_TYPE)
7249     /* Simple scalar types always returned in registers.  */
7250     /* Simple scalar types are always returned in registers.  */
7251 
7252   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7253 					       &ag_mode, &count, NULL, false))
7254     return false;
7255 
7256   /* Types larger than 2 registers are returned in memory.  */
7257   size = int_size_in_bytes (type);
7258   return (size < 0 || size > 2 * UNITS_PER_WORD);
7259 }
7260 
7261 /* Implement TARGET_RETURN_IN_MEMORY.
7262 
7263    If the type T of the result of a function is such that
7264      void func (T arg)
7265    would require that arg be passed as a value in a register (or set of
7266    registers) according to the parameter passing rules, then the result
7267    is returned in the same registers as would be used for such an
7268    argument.  */
7269 
7270 static bool
7271 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7272 {
7273   pure_scalable_type_info pst_info;
7274   switch (pst_info.analyze (type))
7275     {
7276     case pure_scalable_type_info::IS_PST:
7277       return (pst_info.num_zr () > NUM_FP_ARG_REGS
7278 	      || pst_info.num_pr () > NUM_PR_ARG_REGS);
7279 
7280     case pure_scalable_type_info::DOESNT_MATTER:
7281       gcc_assert (aarch64_return_in_memory_1 (type));
7282       return true;
7283 
7284     case pure_scalable_type_info::NO_ABI_IDENTITY:
7285     case pure_scalable_type_info::ISNT_PST:
7286       return aarch64_return_in_memory_1 (type);
7287     }
7288   gcc_unreachable ();
7289 }
7290 
7291 static bool
7292 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7293 			       const_tree type, int *nregs)
7294 {
7295   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7296   return aarch64_vfp_is_call_or_return_candidate (mode, type,
7297 						  &pcum->aapcs_vfp_rmode,
7298 						  nregs, NULL, pcum->silent_p);
7299 }
7300 
7301 /* Given MODE and TYPE of a function argument, return the alignment in
7302    bits.  The idea is to suppress any stronger alignment requested by
7303    the user and opt for the natural alignment (specified in AAPCS64 \S
7304    4.1).  ABI_BREAK is set to the old alignment if the alignment was
7305    incorrectly calculated in versions of GCC prior to GCC-9.  This is
7306    a helper function for local use only.  */
7307 
7308 static unsigned int
7309 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7310 				unsigned int *abi_break)
7311 {
7312   *abi_break = 0;
7313   if (!type)
7314     return GET_MODE_ALIGNMENT (mode);
7315 
7316   if (integer_zerop (TYPE_SIZE (type)))
7317     return 0;
7318 
7319   gcc_assert (TYPE_MODE (type) == mode);
7320 
7321   if (!AGGREGATE_TYPE_P (type))
7322     {
7323       /* The ABI alignment is the natural alignment of the type, without
7324 	 any attributes applied.  Normally this is the alignment of the
7325 	 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
7326 	 For now we just handle the known exceptions explicitly.  */
7327       type = TYPE_MAIN_VARIANT (type);
7328       if (POINTER_TYPE_P (type))
7329 	{
7330 	  gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
7331 	  return POINTER_SIZE;
7332 	}
7333       return TYPE_ALIGN (type);
7334     }
7335 
7336   if (TREE_CODE (type) == ARRAY_TYPE)
7337     return TYPE_ALIGN (TREE_TYPE (type));
7338 
7339   unsigned int alignment = 0;
7340   unsigned int bitfield_alignment = 0;
7341   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7342     if (TREE_CODE (field) == FIELD_DECL)
7343       {
7344 	/* Note that we explicitly consider zero-sized fields here,
7345 	   even though they don't map to AAPCS64 machine types.
7346 	   For example, in:
7347 
7348 	       struct __attribute__((aligned(8))) empty {};
7349 
7350 	       struct s {
7351 		 [[no_unique_address]] empty e;
7352 		 int x;
7353 	       };
7354 
7355 	   "s" contains only one Fundamental Data Type (the int field)
7356 	   but gains 8-byte alignment and size thanks to "e".  */
7357 	alignment = std::max (alignment, DECL_ALIGN (field));
7358 	if (DECL_BIT_FIELD_TYPE (field))
7359 	  bitfield_alignment
7360 	    = std::max (bitfield_alignment,
7361 			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7362       }
7363 
7364   if (bitfield_alignment > alignment)
7365     {
7366       *abi_break = alignment;
7367       return bitfield_alignment;
7368     }
7369 
7370   return alignment;
7371 }
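
/* A minimal standalone sketch, not part of the GCC sources, of the
   aggregate case above, using hypothetical field descriptors in place of
   FIELD_DECL trees.  The result is the maximum field alignment, unless a
   bit-field's declared type is more aligned, in which case that value
   wins and the pre-GCC-9 answer is reported through ABI_BREAK.  */
struct sketch_field
{
  unsigned int align;		/* DECL_ALIGN, in bits.  */
  unsigned int bitfield_align;	/* TYPE_ALIGN of DECL_BIT_FIELD_TYPE, or 0.  */
};

static unsigned int
sketch_aggregate_alignment (const struct sketch_field *fields,
			    unsigned int nfields, unsigned int *abi_break)
{
  unsigned int alignment = 0, bitfield_alignment = 0;
  *abi_break = 0;
  for (unsigned int i = 0; i < nfields; ++i)
    {
      if (fields[i].align > alignment)
	alignment = fields[i].align;
      if (fields[i].bitfield_align > bitfield_alignment)
	bitfield_alignment = fields[i].bitfield_align;
    }
  if (bitfield_alignment > alignment)
    {
      *abi_break = alignment;
      return bitfield_alignment;
    }
  return alignment;
}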
7372 
7373 /* Layout a function argument according to the AAPCS64 rules.  The rule
7374    numbers refer to the rule numbers in the AAPCS64.  ORIG_MODE is the
7375    mode that was originally given to us by the target hook, whereas the
7376    mode in ARG might be the result of replacing partial SVE modes with
7377    the equivalent integer mode.  */
7378 
7379 static void
7380 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7381 {
7382   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7383   tree type = arg.type;
7384   machine_mode mode = arg.mode;
7385   int ncrn, nvrn, nregs;
7386   bool allocate_ncrn, allocate_nvrn;
7387   HOST_WIDE_INT size;
7388   unsigned int abi_break;
7389 
7390   /* We need to do this once per argument.  */
7391   if (pcum->aapcs_arg_processed)
7392     return;
7393 
7394   bool warn_pcs_change
7395     = (warn_psabi
7396        && !pcum->silent_p
7397        && (currently_expanding_function_start
7398 	   || currently_expanding_gimple_stmt));
7399 
7400   unsigned int alignment
7401     = aarch64_function_arg_alignment (mode, type, &abi_break);
7402   gcc_assert (!alignment || abi_break < alignment);
7403 
7404   pcum->aapcs_arg_processed = true;
7405 
7406   pure_scalable_type_info pst_info;
7407   if (type && pst_info.analyze_registers (type))
7408     {
7409       /* aarch64_function_arg_alignment has never had an effect on
7410 	 this case.  */
7411 
7412       /* The PCS says that it is invalid to pass an SVE value to an
7413 	 unprototyped function.  There is no ABI-defined location we
7414 	 can return in this case, so we have no real choice but to raise
7415 	 an error immediately, even though this is only a query function.  */
7416       if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7417 	{
7418 	  gcc_assert (!pcum->silent_p);
7419 	  error ("SVE type %qT cannot be passed to an unprototyped function",
7420 		 arg.type);
7421 	  /* Avoid repeating the message, and avoid tripping the assert
7422 	     below.  */
7423 	  pcum->pcs_variant = ARM_PCS_SVE;
7424 	}
7425 
7426       /* We would have converted the argument into pass-by-reference
7427 	 form if it didn't fit in registers.  */
7428       pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7429       pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7430       gcc_assert (arg.named
7431 		  && pcum->pcs_variant == ARM_PCS_SVE
7432 		  && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7433 		  && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7434       pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7435 					  P0_REGNUM + pcum->aapcs_nprn);
7436       return;
7437     }
7438 
7439   /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7440      are passed by reference, not by value.  */
7441   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7442   bool sve_p = (vec_flags & VEC_ANY_SVE);
7443   if (sve_p)
7444     /* Vector types can acquire a partial SVE mode using things like
7445        __attribute__((vector_size(N))), and this is potentially useful.
7446        However, the choice of mode doesn't affect the type's ABI
7447        identity, so we should treat the types as though they had
7448        the associated integer mode, just like they did before SVE
7449        was introduced.
7450 
7451        We know that the vector must be 128 bits or smaller,
7452        otherwise we'd have passed it in memory instead.  */
7453     gcc_assert (type
7454 		&& (aarch64_some_values_include_pst_objects_p (type)
7455 		    || (vec_flags & VEC_PARTIAL)));
7456 
7457   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
7458   if (type)
7459     size = int_size_in_bytes (type);
7460   else
7461     /* No frontends can create types with variable-sized modes, so we
7462        shouldn't be asked to pass or return them.  */
7463     size = GET_MODE_SIZE (mode).to_constant ();
7464   size = ROUND_UP (size, UNITS_PER_WORD);
7465 
7466   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7467   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7468 						 mode,
7469 						 type,
7470 						 &nregs);
7471   gcc_assert (!sve_p || !allocate_nvrn);
7472 
7473   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
7474      The following code thus handles passing by SIMD/FP registers first.  */
7475 
7476   nvrn = pcum->aapcs_nvrn;
7477 
7478   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
7479   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7480      and homogeneous short-vector aggregates (HVA).  */
7481     {
7482       /* aarch64_function_arg_alignment has never had an effect on
7483 	 this case.  */
7484       if (!pcum->silent_p && !TARGET_FLOAT)
7485 	aarch64_err_no_fpadvsimd (mode);
7486 
7487       if (nvrn + nregs <= NUM_FP_ARG_REGS)
7488 	{
7489 	  pcum->aapcs_nextnvrn = nvrn + nregs;
7490 	  if (!aarch64_composite_type_p (type, mode))
7491 	    {
7492 	      gcc_assert (nregs == 1);
7493 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7494 	    }
7495 	  else if (aarch64_advsimd_full_struct_mode_p (mode)
7496 		   && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7497 	    pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7498 	  else if (aarch64_advsimd_partial_struct_mode_p (mode)
7499 		   && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7500 	    pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7501 	  else
7502 	    {
7503 	      rtx par;
7504 	      int i;
7505 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7506 	      for (i = 0; i < nregs; i++)
7507 		{
7508 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7509 					 V0_REGNUM + nvrn + i);
7510 		  rtx offset = gen_int_mode
7511 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7512 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7513 		  XVECEXP (par, 0, i) = tmp;
7514 		}
7515 	      pcum->aapcs_reg = par;
7516 	    }
7517 	  return;
7518 	}
7519       else
7520 	{
7521 	  /* C.3 NSRN is set to 8.  */
7522 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7523 	  goto on_stack;
7524 	}
7525     }
7526 
7527   ncrn = pcum->aapcs_ncrn;
7528   nregs = size / UNITS_PER_WORD;
7529 
7530   /* C6 - C9, though the sign and zero extension semantics are
7531      handled elsewhere.  This is the case where the argument fits
7532      entirely in general registers.  */
7533   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7534     {
7535       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7536 
7537       /* C.8 if the argument has an alignment of 16 then the NGRN is
7538 	 rounded up to the next even number.  */
7539       if (nregs == 2
7540 	  && ncrn % 2
7541 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7542 	     comparison is there because for > 16 * BITS_PER_UNIT
7543 	     alignment nregs should be > 2 and therefore it should be
7544 	     passed by reference rather than value.  */
7545 	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
7546 	      == 16 * BITS_PER_UNIT))
7547 	{
7548 	  if (warn_pcs_change && abi_break)
7549 	    inform (input_location, "parameter passing for argument of type "
7550 		    "%qT changed in GCC 9.1", type);
7551 	  ++ncrn;
7552 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
7553 	}
7554 
7555       /* If an argument with an SVE mode needs to be shifted up to the
7556 	 high part of the register, treat it as though it had an integer mode.
7557 	 Using the normal (parallel [...]) would suppress the shifting.  */
7558       if (sve_p
7559 	  && BYTES_BIG_ENDIAN
7560 	  && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7561 	  && aarch64_pad_reg_upward (mode, type, false))
7562 	{
7563 	  mode = int_mode_for_mode (mode).require ();
7564 	  sve_p = false;
7565 	}
7566 
7567       /* NREGS can be 0 when e.g. an empty structure is to be passed.
7568 	 A reg is still generated for it, but the caller should be smart
7569 	 enough not to use it.  */
7570       if (nregs == 0
7571 	  || (nregs == 1 && !sve_p)
7572 	  || GET_MODE_CLASS (mode) == MODE_INT)
7573 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7574       else
7575 	{
7576 	  rtx par;
7577 	  int i;
7578 
7579 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7580 	  for (i = 0; i < nregs; i++)
7581 	    {
7582 	      scalar_int_mode reg_mode = word_mode;
7583 	      if (nregs == 1)
7584 		reg_mode = int_mode_for_mode (mode).require ();
7585 	      rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7586 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7587 				       GEN_INT (i * UNITS_PER_WORD));
7588 	      XVECEXP (par, 0, i) = tmp;
7589 	    }
7590 	  pcum->aapcs_reg = par;
7591 	}
7592 
7593       pcum->aapcs_nextncrn = ncrn + nregs;
7594       return;
7595     }
7596 
7597   /* C.11  */
7598   pcum->aapcs_nextncrn = NUM_ARG_REGS;
7599 
7600   /* The argument is passed on stack; record the needed number of words for
7601      this argument and align the total size if necessary.  */
7602 on_stack:
7603   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7604 
7605   if (aarch64_function_arg_alignment (mode, type, &abi_break)
7606       == 16 * BITS_PER_UNIT)
7607     {
7608       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7609       if (pcum->aapcs_stack_size != new_size)
7610 	{
7611 	  if (warn_pcs_change && abi_break)
7612 	    inform (input_location, "parameter passing for argument of type "
7613 		    "%qT changed in GCC 9.1", type);
7614 	  pcum->aapcs_stack_size = new_size;
7615 	}
7616     }
7617   return;
7618 }
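
/* A minimal standalone sketch, not part of the GCC sources, of rule C.8
   as handled above: a two-register general-purpose argument whose type
   requires 16-byte alignment never starts in an odd-numbered register,
   so an odd NGRN is rounded up first.  */
static unsigned int
sketch_first_gp_regno (unsigned int ncrn, unsigned int nregs,
		       unsigned int alignment_bits)
{
  if (nregs == 2 && (ncrn & 1) != 0 && alignment_bits == 16 * 8)
    ++ncrn;
  return ncrn;
}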
7619 
7620 /* Implement TARGET_FUNCTION_ARG.  */
7621 
7622 static rtx
7623 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7624 {
7625   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7626   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7627 	      || pcum->pcs_variant == ARM_PCS_SIMD
7628 	      || pcum->pcs_variant == ARM_PCS_SVE);
7629 
7630   if (arg.end_marker_p ())
7631     return gen_int_mode (pcum->pcs_variant, DImode);
7632 
7633   aarch64_layout_arg (pcum_v, arg);
7634   return pcum->aapcs_reg;
7635 }
7636 
7637 void
7638 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7639 			      const_tree fntype,
7640 			      rtx libname ATTRIBUTE_UNUSED,
7641 			      const_tree fndecl ATTRIBUTE_UNUSED,
7642 			      unsigned n_named ATTRIBUTE_UNUSED,
7643 			      bool silent_p)
7644 {
7645   pcum->aapcs_ncrn = 0;
7646   pcum->aapcs_nvrn = 0;
7647   pcum->aapcs_nprn = 0;
7648   pcum->aapcs_nextncrn = 0;
7649   pcum->aapcs_nextnvrn = 0;
7650   pcum->aapcs_nextnprn = 0;
7651   if (fntype)
7652     pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7653   else
7654     pcum->pcs_variant = ARM_PCS_AAPCS64;
7655   pcum->aapcs_reg = NULL_RTX;
7656   pcum->aapcs_arg_processed = false;
7657   pcum->aapcs_stack_words = 0;
7658   pcum->aapcs_stack_size = 0;
7659   pcum->silent_p = silent_p;
7660 
7661   if (!silent_p
7662       && !TARGET_FLOAT
7663       && fntype && fntype != error_mark_node)
7664     {
7665       const_tree type = TREE_TYPE (fntype);
7666       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
7667       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
7668       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7669 						   &mode, &nregs, NULL, false))
7670 	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7671     }
7672 
7673   if (!silent_p
7674       && !TARGET_SVE
7675       && pcum->pcs_variant == ARM_PCS_SVE)
7676     {
7677       /* We can't gracefully recover at this point, so make this a
7678 	 fatal error.  */
7679       if (fndecl)
7680 	fatal_error (input_location, "%qE requires the SVE ISA extension",
7681 		     fndecl);
7682       else
7683 	fatal_error (input_location, "calls to functions of type %qT require"
7684 		     " the SVE ISA extension", fntype);
7685     }
7686 }
7687 
7688 static void
7689 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7690 			      const function_arg_info &arg)
7691 {
7692   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7693   if (pcum->pcs_variant == ARM_PCS_AAPCS64
7694       || pcum->pcs_variant == ARM_PCS_SIMD
7695       || pcum->pcs_variant == ARM_PCS_SVE)
7696     {
7697       aarch64_layout_arg (pcum_v, arg);
7698       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7699 		  != (pcum->aapcs_stack_words != 0));
7700       pcum->aapcs_arg_processed = false;
7701       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7702       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7703       pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7704       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7705       pcum->aapcs_stack_words = 0;
7706       pcum->aapcs_reg = NULL_RTX;
7707     }
7708 }
7709 
7710 bool
7711 aarch64_function_arg_regno_p (unsigned regno)
7712 {
7713   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7714 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
7715 }
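
/* In other words, this hook treats x0-x7 (NUM_ARG_REGS is 8) and
   v0-v7 (NUM_FP_ARG_REGS is 8) as the argument-passing registers.  */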
7716 
7717 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
7718    PARM_BOUNDARY bits of alignment, but will be given anything up
7719    to STACK_BOUNDARY bits if the type requires it.  This makes sure
7720    that both before and after the layout of each argument, the Next
7721    Stacked Argument Address (NSAA) will have a minimum alignment of
7722    8 bytes.  */
7723 
7724 static unsigned int
7725 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7726 {
7727   unsigned int abi_break;
7728   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7729 							   &abi_break);
7730   alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7731   if (abi_break && warn_psabi)
7732     {
7733       abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
7734       if (alignment != abi_break)
7735 	inform (input_location, "parameter passing for argument of type "
7736 		"%qT changed in GCC 9.1", type);
7737     }
7738   return alignment;
7739 }
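
/* Illustrative effect of the clamping above, assuming the usual AArch64
   values PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128: a char or int
   argument is raised to 64-bit alignment, a 16-byte-aligned type such as
   __int128 keeps 128-bit alignment, and any larger requested alignment
   is capped at 128 bits.  */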
7740 
7741 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
7742 
7743 static fixed_size_mode
7744 aarch64_get_reg_raw_mode (int regno)
7745 {
7746   if (TARGET_SVE && FP_REGNUM_P (regno))
7747     /* Don't use the SVE part of the register for __builtin_apply and
7748        __builtin_return.  The SVE registers aren't used by the normal PCS,
7749        so using them there would be a waste of time.  The PCS extensions
7750        for SVE types are fundamentally incompatible with the
7751        __builtin_return/__builtin_apply interface.  */
7752     return as_a <fixed_size_mode> (V16QImode);
7753   return default_get_reg_raw_mode (regno);
7754 }
7755 
7756 /* Implement TARGET_FUNCTION_ARG_PADDING.
7757 
7758    Small aggregate types are placed in the lowest memory address.
7759 
7760    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
7761 
7762 static pad_direction
7763 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7764 {
7765   /* On little-endian targets, the least significant byte of every stack
7766      argument is passed at the lowest byte address of the stack slot.  */
7767   if (!BYTES_BIG_ENDIAN)
7768     return PAD_UPWARD;
7769 
7770   /* Otherwise, integral, floating-point and pointer types are padded downward:
7771      the least significant byte of a stack argument is passed at the highest
7772      byte address of the stack slot.  */
7773   if (type
7774       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7775 	 || POINTER_TYPE_P (type))
7776       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7777     return PAD_DOWNWARD;
7778 
7779   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
7780   return PAD_UPWARD;
7781 }
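
/* Example of the rule above on a big-endian target: a short passed in an
   8-byte stack slot is placed in the highest-addressed bytes of the slot
   (PAD_DOWNWARD), whereas a small structure starts at the lowest address
   (PAD_UPWARD; see aarch64_pad_reg_upward for the register analogue).  */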
7782 
7783 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7784 
7785    It specifies padding for the last (may also be the only)
7786    element of a block move between registers and memory.  If
7787    assuming the block is in the memory, padding upward means that
7788    the last element is padded after its most significant byte,
7789    while in downward padding, the last element is padded at
7790    its least significant byte side.
7791 
7792    Small aggregates and small complex types are always padded
7793    upwards.
7794 
7795    We don't need to worry about homogeneous floating-point or
7796    short-vector aggregates; their move is not affected by the
7797    padding direction determined here.  Regardless of endianness,
7798    each element of such an aggregate is put in the least
7799    significant bits of a fp/simd register.
7800 
7801    Return !BYTES_BIG_ENDIAN if the least significant byte of the
7802    register has useful data, and return the opposite if the most
7803    significant byte does.  */
7804 
7805 bool
7806 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7807 		     bool first ATTRIBUTE_UNUSED)
7808 {
7809 
7810   /* Aside from pure scalable types, small composite types are always
7811      padded upward.  */
7812   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7813     {
7814       HOST_WIDE_INT size;
7815       if (type)
7816 	size = int_size_in_bytes (type);
7817       else
7818 	/* No frontends can create types with variable-sized modes, so we
7819 	   shouldn't be asked to pass or return them.  */
7820 	size = GET_MODE_SIZE (mode).to_constant ();
7821       if (size < 2 * UNITS_PER_WORD)
7822 	{
7823 	  pure_scalable_type_info pst_info;
7824 	  if (pst_info.analyze_registers (type))
7825 	    return false;
7826 	  return true;
7827 	}
7828     }
7829 
7830   /* Otherwise, use the default padding.  */
7831   return !BYTES_BIG_ENDIAN;
7832 }
7833 
7834 static scalar_int_mode
7835 aarch64_libgcc_cmp_return_mode (void)
7836 {
7837   return SImode;
7838 }
7839 
7840 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7841 
7842 /* We use the 12-bit shifted immediate arithmetic instructions so values
7843    must be multiple of (1 << 12), i.e. 4096.  */
7844 #define ARITH_FACTOR 4096
7845 
7846 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7847 #error Cannot use simple address calculation for stack probing
7848 #endif
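
/* With the generic default of STACK_CHECK_PROBE_INTERVAL_EXP (12),
   PROBE_INTERVAL is 4096 and is an exact multiple of ARITH_FACTOR, so
   the check above passes; a probe interval that is not a multiple of
   4096 would trigger the #error.  */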
7849 
7850 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7851    inclusive.  These are offsets from the current stack pointer.  */
7852 
7853 static void
7854 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7855 {
7856   HOST_WIDE_INT size;
7857   if (!poly_size.is_constant (&size))
7858     {
7859       sorry ("stack probes for SVE frames");
7860       return;
7861     }
7862 
7863   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7864 
7865   /* See the same assertion on PROBE_INTERVAL above.  */
7866   gcc_assert ((first % ARITH_FACTOR) == 0);
7867 
7868   /* See if we have a constant small number of probes to generate.  If so,
7869      that's the easy case.  */
7870   if (size <= PROBE_INTERVAL)
7871     {
7872       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7873 
7874       emit_set_insn (reg1,
7875 		     plus_constant (Pmode,
7876 				    stack_pointer_rtx, -(first + base)));
7877       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7878     }
7879 
7880   /* The run-time loop is made up of 8 insns in the generic case while the
7881      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
7882   else if (size <= 4 * PROBE_INTERVAL)
7883     {
7884       HOST_WIDE_INT i, rem;
7885 
7886       emit_set_insn (reg1,
7887 		     plus_constant (Pmode,
7888 				    stack_pointer_rtx,
7889 				    -(first + PROBE_INTERVAL)));
7890       emit_stack_probe (reg1);
7891 
7892       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7893 	 it exceeds SIZE.  If only two probes are needed, this will not
7894 	 generate any code.  Then probe at FIRST + SIZE.  */
7895       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7896 	{
7897 	  emit_set_insn (reg1,
7898 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7899 	  emit_stack_probe (reg1);
7900 	}
7901 
7902       rem = size - (i - PROBE_INTERVAL);
7903       if (rem > 256)
7904 	{
7905 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7906 
7907 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7908 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7909 	}
7910       else
7911 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7912     }
7913 
7914   /* Otherwise, do the same as above, but in a loop.  Note that we must be
7915      extra careful with variables wrapping around because we might be at
7916      the very top (or the very bottom) of the address space and we have
7917      to be able to handle this case properly; in particular, we use an
7918      equality test for the loop condition.  */
7919   else
7920     {
7921       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7922 
7923       /* Step 1: round SIZE to the previous multiple of the interval.  */
7924 
7925       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7926 
7927 
7928       /* Step 2: compute initial and final value of the loop counter.  */
7929 
7930       /* TEST_ADDR = SP + FIRST.  */
7931       emit_set_insn (reg1,
7932 		     plus_constant (Pmode, stack_pointer_rtx, -first));
7933 
7934       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
7935       HOST_WIDE_INT adjustment = - (first + rounded_size);
7936       if (! aarch64_uimm12_shift (adjustment))
7937 	{
7938 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7939 					  true, Pmode);
7940 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7941 	}
7942       else
7943 	emit_set_insn (reg2,
7944 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
7945 
7946       /* Step 3: the loop
7947 
7948 	 do
7949 	   {
7950 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7951 	     probe at TEST_ADDR
7952 	   }
7953 	 while (TEST_ADDR != LAST_ADDR)
7954 
7955 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7956 	 until it is equal to ROUNDED_SIZE.  */
7957 
7958       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7959 
7960 
7961       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7962 	 that SIZE is equal to ROUNDED_SIZE.  */
7963 
7964       if (size != rounded_size)
7965 	{
7966 	  HOST_WIDE_INT rem = size - rounded_size;
7967 
7968 	  if (rem > 256)
7969 	    {
7970 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7971 
7972 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7973 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7974 	    }
7975 	  else
7976 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7977 	}
7978     }
7979 
7980   /* Make sure nothing is scheduled before we are done.  */
7981   emit_insn (gen_blockage ());
7982 }
7983 
7984 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
7985    absolute addresses.  */
7986 
7987 const char *
7988 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7989 {
7990   static int labelno = 0;
7991   char loop_lab[32];
7992   rtx xops[2];
7993 
7994   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7995 
7996   /* Loop.  */
7997   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7998 
7999   HOST_WIDE_INT stack_clash_probe_interval
8000     = 1 << param_stack_clash_protection_guard_size;
8001 
8002   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
8003   xops[0] = reg1;
8004   HOST_WIDE_INT interval;
8005   if (flag_stack_clash_protection)
8006     interval = stack_clash_probe_interval;
8007   else
8008     interval = PROBE_INTERVAL;
8009 
8010   gcc_assert (aarch64_uimm12_shift (interval));
8011   xops[1] = GEN_INT (interval);
8012 
8013   output_asm_insn ("sub\t%0, %0, %1", xops);
8014 
8015   /* If doing stack clash protection then we probe up by the ABI specified
8016      amount.  We do this because we're dropping full pages at a time in the
8017      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
8018   if (flag_stack_clash_protection)
8019     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8020   else
8021     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8022 
8023   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
8024      by this amount for each iteration.  */
8025   output_asm_insn ("str\txzr, [%0, %1]", xops);
8026 
8027   /* Test if TEST_ADDR == LAST_ADDR.  */
8028   xops[1] = reg2;
8029   output_asm_insn ("cmp\t%0, %1", xops);
8030 
8031   /* Branch.  */
8032   fputs ("\tb.ne\t", asm_out_file);
8033   assemble_name_raw (asm_out_file, loop_lab);
8034   fputc ('\n', asm_out_file);
8035 
8036   return "";
8037 }
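
/* For the non-stack-clash case, the code above emits a loop of the form
   below (x9 and x10 stand in for whatever REG1 and REG2 the caller chose):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0  */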
8038 
8039 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8040    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8041    of GUARD_SIZE.  When a probe is emitted it is done at most
8042    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8043    at most MIN_PROBE_THRESHOLD.  By the end of this function
8044    BASE = BASE - ADJUSTMENT.  */
8045 
8046 const char *
8047 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8048 				      rtx min_probe_threshold, rtx guard_size)
8049 {
8050   /* This function is not allowed to use any instruction generation function
8051      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
8052      so instead emit the code you want using output_asm_insn.  */
8053   gcc_assert (flag_stack_clash_protection);
8054   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8055   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8056 
8057   /* The minimum required allocation before the residual requires probing.  */
8058   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8059 
8060   /* Clamp the value down to the nearest value that can be used with a cmp.  */
8061   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8062   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8063 
8064   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8065   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8066 
8067   static int labelno = 0;
8068   char loop_start_lab[32];
8069   char loop_end_lab[32];
8070   rtx xops[2];
8071 
8072   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8073   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8074 
8075   /* Emit loop start label.  */
8076   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8077 
8078   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
8079   xops[0] = adjustment;
8080   xops[1] = probe_offset_value_rtx;
8081   output_asm_insn ("cmp\t%0, %1", xops);
8082 
8083   /* Branch to end if not enough adjustment to probe.  */
8084   fputs ("\tb.lt\t", asm_out_file);
8085   assemble_name_raw (asm_out_file, loop_end_lab);
8086   fputc ('\n', asm_out_file);
8087 
8088   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
8089   xops[0] = base;
8090   xops[1] = probe_offset_value_rtx;
8091   output_asm_insn ("sub\t%0, %0, %1", xops);
8092 
8093   /* Probe at BASE.  */
8094   xops[1] = const0_rtx;
8095   output_asm_insn ("str\txzr, [%0, %1]", xops);
8096 
8097   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
8098   xops[0] = adjustment;
8099   xops[1] = probe_offset_value_rtx;
8100   output_asm_insn ("sub\t%0, %0, %1", xops);
8101 
8102   /* Branch to start if still more bytes to allocate.  */
8103   fputs ("\tb\t", asm_out_file);
8104   assemble_name_raw (asm_out_file, loop_start_lab);
8105   fputc ('\n', asm_out_file);
8106 
8107   /* Not enough left to need a probe; leave the loop.  */
8108   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8109 
8110   /* BASE = BASE - ADJUSTMENT.  */
8111   xops[0] = base;
8112   xops[1] = adjustment;
8113   output_asm_insn ("sub\t%0, %0, %1", xops);
8114   return "";
8115 }
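
/* Shape of the sequence emitted above, with GUARD standing for the
   clamped RESIDUAL_PROBE_GUARD value and BASE/ADJUSTMENT for the
   registers passed in:

	.SVLPSPL0:
	cmp	ADJUSTMENT, GUARD
	b.lt	.SVLPEND0
	sub	BASE, BASE, GUARD
	str	xzr, [BASE, 0]
	sub	ADJUSTMENT, ADJUSTMENT, GUARD
	b	.SVLPSPL0
	.SVLPEND0:
	sub	BASE, BASE, ADJUSTMENT  */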
8116 
8117 /* Determine whether a frame chain needs to be generated.  */
8118 static bool
8119 aarch64_needs_frame_chain (void)
8120 {
8121   /* Force a frame chain for EH returns so the return address is at FP+8.  */
8122   if (frame_pointer_needed || crtl->calls_eh_return)
8123     return true;
8124 
8125   /* A leaf function cannot have calls or write LR.  */
8126   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8127 
8128   /* Don't use a frame chain in leaf functions if leaf frame pointers
8129      are disabled.  */
8130   if (flag_omit_leaf_frame_pointer && is_leaf)
8131     return false;
8132 
8133   return aarch64_use_frame_pointer;
8134 }
8135 
8136 /* Return true if the current function should save registers above
8137    the locals area, rather than below it.  */
8138 
8139 static bool
8140 aarch64_save_regs_above_locals_p ()
8141 {
8142   /* When using stack smash protection, make sure that the canary slot
8143      comes between the locals and the saved registers.  Otherwise,
8144      it would be possible for a carefully sized smash attack to change
8145      the saved registers (particularly LR and FP) without reaching the
8146      canary.  */
8147   return crtl->stack_protect_guard;
8148 }
8149 
8150 /* Mark the registers that need to be saved by the callee and calculate
8151    the size of the callee-saved registers area and frame record (both FP
8152    and LR may be omitted).  */
8153 static void
8154 aarch64_layout_frame (void)
8155 {
8156   int regno, last_fp_reg = INVALID_REGNUM;
8157   machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8158   poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8159   bool frame_related_fp_reg_p = false;
8160   aarch64_frame &frame = cfun->machine->frame;
8161   poly_int64 top_of_locals = -1;
8162 
8163   frame.emit_frame_chain = aarch64_needs_frame_chain ();
8164 
8165   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
8166      the mid-end is doing.  */
8167   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8168 
8169 #define SLOT_NOT_REQUIRED (-2)
8170 #define SLOT_REQUIRED     (-1)
8171 
8172   frame.wb_push_candidate1 = INVALID_REGNUM;
8173   frame.wb_push_candidate2 = INVALID_REGNUM;
8174   frame.spare_pred_reg = INVALID_REGNUM;
8175 
8176   /* First mark all the registers that really need to be saved...  */
8177   for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8178     frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8179 
8180   /* ... that includes the eh data registers (if needed)...  */
8181   if (crtl->calls_eh_return)
8182     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8183       frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8184 
8185   /* ... and any callee saved register that dataflow says is live.  */
8186   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8187     if (df_regs_ever_live_p (regno)
8188 	&& !fixed_regs[regno]
8189 	&& (regno == R30_REGNUM
8190 	    || !crtl->abi->clobbers_full_reg_p (regno)))
8191       frame.reg_offset[regno] = SLOT_REQUIRED;
8192 
8193   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8194     if (df_regs_ever_live_p (regno)
8195 	&& !fixed_regs[regno]
8196 	&& !crtl->abi->clobbers_full_reg_p (regno))
8197       {
8198 	frame.reg_offset[regno] = SLOT_REQUIRED;
8199 	last_fp_reg = regno;
8200 	if (aarch64_emit_cfi_for_reg_p (regno))
8201 	  frame_related_fp_reg_p = true;
8202       }
8203 
8204   /* Big-endian SVE frames need a spare predicate register in order
8205      to save Z8-Z15.  Decide which register they should use.  Prefer
8206      an unused argument register if possible, so that we don't force P4
8207      to be saved unnecessarily.  */
8208   if (frame_related_fp_reg_p
8209       && crtl->abi->id () == ARM_PCS_SVE
8210       && BYTES_BIG_ENDIAN)
8211     {
8212       bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8213       bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8214       for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8215 	if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8216 	  break;
8217       gcc_assert (regno <= P7_REGNUM);
8218       frame.spare_pred_reg = regno;
8219       df_set_regs_ever_live (regno, true);
8220     }
8221 
8222   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8223     if (df_regs_ever_live_p (regno)
8224 	&& !fixed_regs[regno]
8225 	&& !crtl->abi->clobbers_full_reg_p (regno))
8226       frame.reg_offset[regno] = SLOT_REQUIRED;
8227 
8228   bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
8229 
8230   poly_int64 offset = crtl->outgoing_args_size;
8231   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8232   if (regs_at_top_p)
8233     {
8234       offset += get_frame_size ();
8235       offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8236       top_of_locals = offset;
8237     }
8238   frame.bytes_below_saved_regs = offset;
8239   frame.sve_save_and_probe = INVALID_REGNUM;
8240 
8241   /* Now assign stack slots for the registers.  Start with the predicate
8242      registers, since predicate LDR and STR have a relatively small
8243      offset range.  These saves happen below the hard frame pointer.  */
8244   for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8245     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8246       {
8247 	if (frame.sve_save_and_probe == INVALID_REGNUM)
8248 	  frame.sve_save_and_probe = regno;
8249 	frame.reg_offset[regno] = offset;
8250 	offset += BYTES_PER_SVE_PRED;
8251       }
8252 
8253   poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
8254   if (maybe_ne (saved_prs_size, 0))
8255     {
8256       /* If we have any vector registers to save above the predicate registers,
8257 	 the offset of the vector register save slots needs to be a multiple
8258 	 of the vector size.  This lets us use the immediate forms of LDR/STR
8259 	 (or LD1/ST1 for big-endian).
8260 
8261 	 A vector register is 8 times the size of a predicate register,
8262 	 and we need to save a maximum of 12 predicate registers, so the
8263 	 first vector register will be at either #1, MUL VL or #2, MUL VL.
8264 
8265 	 If we don't have any vector registers to save, and we know how
8266 	 big the predicate save area is, we can just round it up to the
8267 	 next 16-byte boundary.  */
8268       if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8269 	offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8270       else
8271 	{
8272 	  if (known_le (saved_prs_size, vector_save_size))
8273 	    offset = frame.bytes_below_saved_regs + vector_save_size;
8274 	  else if (known_le (saved_prs_size, vector_save_size * 2))
8275 	    offset = frame.bytes_below_saved_regs + vector_save_size * 2;
8276 	  else
8277 	    gcc_unreachable ();
8278 	}
8279     }
8280 
8281   /* If we need to save any SVE vector registers, add them next.  */
8282   if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8283     for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8284       if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8285 	{
8286 	  if (frame.sve_save_and_probe == INVALID_REGNUM)
8287 	    frame.sve_save_and_probe = regno;
8288 	  frame.reg_offset[regno] = offset;
8289 	  offset += vector_save_size;
8290 	}
8291 
8292   /* OFFSET is now the offset of the hard frame pointer from the bottom
8293      of the callee save area.  */
8294   auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
8295   bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
8296   gcc_assert (!saves_below_hard_fp_p
8297 	      || (frame.sve_save_and_probe != INVALID_REGNUM
8298 		  && known_eq (frame.reg_offset[frame.sve_save_and_probe],
8299 			       frame.bytes_below_saved_regs)));
8300 
8301   frame.bytes_below_hard_fp = offset;
8302   frame.hard_fp_save_and_probe = INVALID_REGNUM;
8303 
8304   auto allocate_gpr_slot = [&](unsigned int regno)
8305     {
8306       if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
8307 	frame.hard_fp_save_and_probe = regno;
8308       frame.reg_offset[regno] = offset;
8309       if (frame.wb_push_candidate1 == INVALID_REGNUM)
8310 	frame.wb_push_candidate1 = regno;
8311       else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8312 	frame.wb_push_candidate2 = regno;
8313       offset += UNITS_PER_WORD;
8314     };
8315 
8316   if (frame.emit_frame_chain)
8317     {
8318       /* FP and LR are placed in the linkage record.  */
8319       allocate_gpr_slot (R29_REGNUM);
8320       allocate_gpr_slot (R30_REGNUM);
8321     }
8322   else if (flag_stack_clash_protection
8323 	   && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
8324     /* Put the LR save slot first, since it makes a good choice of probe
8325        for stack clash purposes.  The idea is that the link register usually
8326        has to be saved before a call anyway, and so we lose little by
8327        stopping it from being individually shrink-wrapped.  */
8328     allocate_gpr_slot (R30_REGNUM);
8329 
8330   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8331     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8332       allocate_gpr_slot (regno);
8333 
8334   poly_int64 max_int_offset = offset;
8335   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8336   bool has_align_gap = maybe_ne (offset, max_int_offset);
8337 
8338   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8339     if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8340       {
8341 	if (frame.hard_fp_save_and_probe == INVALID_REGNUM)
8342 	  frame.hard_fp_save_and_probe = regno;
8343 	/* If there is an alignment gap between integer and fp callee-saves,
8344 	   allocate the last fp register to it if possible.  */
8345 	if (regno == last_fp_reg
8346 	    && has_align_gap
8347 	    && known_eq (vector_save_size, 8)
8348 	    && multiple_p (offset, 16))
8349 	  {
8350 	    frame.reg_offset[regno] = max_int_offset;
8351 	    break;
8352 	  }
8353 
8354 	frame.reg_offset[regno] = offset;
8355 	if (frame.wb_push_candidate1 == INVALID_REGNUM)
8356 	  frame.wb_push_candidate1 = regno;
8357 	else if (frame.wb_push_candidate2 == INVALID_REGNUM
8358 		 && frame.wb_push_candidate1 >= V0_REGNUM)
8359 	  frame.wb_push_candidate2 = regno;
8360 	offset += vector_save_size;
8361       }
8362 
8363   offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8364 
8365   auto saved_regs_size = offset - frame.bytes_below_saved_regs;
8366   gcc_assert (known_eq (saved_regs_size, below_hard_fp_saved_regs_size)
8367 	      || (frame.hard_fp_save_and_probe != INVALID_REGNUM
8368 		  && known_eq (frame.reg_offset[frame.hard_fp_save_and_probe],
8369 			       frame.bytes_below_hard_fp)));
8370 
8371   /* With stack-clash, a register must be saved in non-leaf functions.
8372      The saving of the bottommost register counts as an implicit probe,
8373      which allows us to maintain the invariant described in the comment
8374      at expand_prologue.  */
8375   gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
8376 
8377   if (!regs_at_top_p)
8378     {
8379       offset += get_frame_size ();
8380       offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8381       top_of_locals = offset;
8382     }
8383   offset += frame.saved_varargs_size;
8384   gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
8385   frame.frame_size = offset;
8386 
8387   frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
8388   gcc_assert (known_ge (top_of_locals, 0));
8389   frame.bytes_above_locals = frame.frame_size - top_of_locals;
8390 
8391   frame.initial_adjust = 0;
8392   frame.final_adjust = 0;
8393   frame.callee_adjust = 0;
8394   frame.sve_callee_adjust = 0;
8395 
8396   frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8397   frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8398 
8399   /* The shadow call stack is only used for functions that push the LR
8400      onto the stack and that do not specify the "no_sanitize" attribute
8401      with the argument "shadow-call-stack".  */
8402   frame.is_scs_enabled
8403     = (!crtl->calls_eh_return
8404        && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8405        && known_ge (frame.reg_offset[LR_REGNUM], 0));
8406 
8407   /* When shadow call stack is enabled, the scs_pop in the epilogue will
8408      restore x30, and we don't need to pop x30 again in the traditional
8409      way.  Pop candidates record the registers that need to be popped
8410      eventually.  */
8411   if (frame.is_scs_enabled)
8412     {
8413       if (frame.wb_pop_candidate2 == R30_REGNUM)
8414 	frame.wb_pop_candidate2 = INVALID_REGNUM;
8415       else if (frame.wb_pop_candidate1 == R30_REGNUM)
8416 	frame.wb_pop_candidate1 = INVALID_REGNUM;
8417     }
8418 
8419   /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8420      256 to ensure that the offset meets the requirements of emit_move_insn.
8421      Similarly, if candidate1 is INVALID_REGNUM, we need to set
8422      max_push_offset to 0, because no registers are popped at this time,
8423      so callee_adjust cannot be adjusted.  */
8424   HOST_WIDE_INT max_push_offset = 0;
8425   if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8426     {
8427       if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8428 	max_push_offset = 512;
8429       else
8430 	max_push_offset = 256;
8431     }
8432 
8433   HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
8434   HOST_WIDE_INT const_saved_regs_size;
8435   if (known_eq (saved_regs_size, 0))
8436     frame.initial_adjust = frame.frame_size;
8437   else if (frame.frame_size.is_constant (&const_size)
8438 	   && const_size < max_push_offset
8439 	   && known_eq (frame.bytes_above_hard_fp, const_size))
8440     {
8441       /* Simple, small frame with no data below the saved registers.
8442 
8443 	 stp reg1, reg2, [sp, -frame_size]!
8444 	 stp reg3, reg4, [sp, 16]  */
8445       frame.callee_adjust = const_size;
8446     }
8447   else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
8448 	   && saved_regs_size.is_constant (&const_saved_regs_size)
8449 	   && const_below_saved_regs + const_saved_regs_size < 512
8450 	   /* We could handle this case even with data below the saved
8451 	      registers, provided that that data left us with valid offsets
8452 	      for all predicate and vector save slots.  It's such a rare
8453 	      case that it hardly seems worth the effort though.  */
8454 	   && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8455 	   && !(cfun->calls_alloca
8456 		&& frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8457 		&& const_above_fp < max_push_offset))
8458     {
8459       /* Frame with small area below the saved registers:
8460 
8461 	 sub sp, sp, frame_size
8462 	 stp reg1, reg2, [sp, bytes_below_saved_regs]
8463 	 stp reg3, reg4, [sp, bytes_below_saved_regs + 16]  */
8464       frame.initial_adjust = frame.frame_size;
8465     }
8466   else if (saves_below_hard_fp_p
8467 	   && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8468     {
8469       /* Frame in which all saves are SVE saves:
8470 
8471 	 sub sp, sp, frame_size - bytes_below_saved_regs
8472 	 save SVE registers relative to SP
8473 	 sub sp, sp, bytes_below_saved_regs  */
8474       frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8475       frame.final_adjust = frame.bytes_below_saved_regs;
8476     }
8477   else if (frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8478 	   && const_above_fp < max_push_offset)
8479     {
8480       /* Frame with large area below the saved registers, or with SVE saves,
8481 	 but with a small area above:
8482 
8483 	 stp reg1, reg2, [sp, -hard_fp_offset]!
8484 	 stp reg3, reg4, [sp, 16]
8485 	 [sub sp, sp, below_hard_fp_saved_regs_size]
8486 	 [save SVE registers relative to SP]
8487 	 sub sp, sp, bytes_below_saved_regs  */
8488       frame.callee_adjust = const_above_fp;
8489       frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8490       frame.final_adjust = frame.bytes_below_saved_regs;
8491     }
8492   else
8493     {
8494       /* General case:
8495 
8496 	 sub sp, sp, hard_fp_offset
8497 	 stp x29, x30, [sp, 0]
8498 	 add x29, sp, 0
8499 	 stp reg3, reg4, [sp, 16]
8500 	 [sub sp, sp, below_hard_fp_saved_regs_size]
8501 	 [save SVE registers relative to SP]
8502 	 sub sp, sp, bytes_below_saved_regs  */
8503       frame.initial_adjust = frame.bytes_above_hard_fp;
8504       frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8505       frame.final_adjust = frame.bytes_below_saved_regs;
8506     }
8507 
8508   /* The frame is allocated in pieces, with each non-final piece
8509      including a register save at offset 0 that acts as a probe for
8510      the following piece.  In addition, the save of the bottommost register
8511      acts as a probe for callees and allocas.  Roll back any probes that
8512      aren't needed.
8513 
8514      A probe isn't needed if it is associated with the final allocation
8515      (including callees and allocas) that happens before the epilogue is
8516      executed.  */
8517   if (crtl->is_leaf
8518       && !cfun->calls_alloca
8519       && known_eq (frame.final_adjust, 0))
8520     {
8521       if (maybe_ne (frame.sve_callee_adjust, 0))
8522 	frame.sve_save_and_probe = INVALID_REGNUM;
8523       else
8524 	frame.hard_fp_save_and_probe = INVALID_REGNUM;
8525     }
8526 
8527   /* Make sure the individual adjustments add up to the full frame size.  */
8528   gcc_assert (known_eq (frame.initial_adjust
8529 			+ frame.callee_adjust
8530 			+ frame.sve_callee_adjust
8531 			+ frame.final_adjust, frame.frame_size));
8532 
8533   if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8534     {
8535       /* We've decided not to associate any register saves with the initial
8536 	 stack allocation.  */
8537       frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8538       frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8539     }
8540 
8541   frame.laid_out = true;
8542 }
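
/* Worked example (a sketch, not taken from any particular test case):
   a function with a frame chain that saves only x29/x30, has 16 bytes of
   locals and no outgoing arguments or SVE state ends up with
   bytes_below_saved_regs == 0, saved_regs_size == 16, frame_size == 32
   and bytes_above_hard_fp == 32, so it takes the "simple, small frame"
   path above with callee_adjust == 32:

	stp	x29, x30, [sp, -32]!
	mov	x29, sp  */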
8543 
8544 /* Return true if the register REGNO is saved on entry to
8545    the current function.  */
8546 
8547 static bool
8548 aarch64_register_saved_on_entry (int regno)
8549 {
8550   return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8551 }
8552 
8553 /* Return the next register, from REGNO up to LIMIT, that the callee
8554    needs to save.  */
8555 
8556 static unsigned
8557 aarch64_next_callee_save (unsigned regno, unsigned limit)
8558 {
8559   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8560     regno ++;
8561   return regno;
8562 }
8563 
8564 /* Push the register number REGNO of mode MODE to the stack with write-back
8565    adjusting the stack by ADJUSTMENT.  */
8566 
8567 static void
8568 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8569 			   HOST_WIDE_INT adjustment)
8570  {
8571   rtx base_rtx = stack_pointer_rtx;
8572   rtx insn, reg, mem;
8573 
8574   reg = gen_rtx_REG (mode, regno);
8575   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8576 			    plus_constant (Pmode, base_rtx, -adjustment));
8577   mem = gen_frame_mem (mode, mem);
8578 
8579   insn = emit_move_insn (mem, reg);
8580   RTX_FRAME_RELATED_P (insn) = 1;
8581 }
8582 
8583 /* Generate and return an instruction to store the pair of registers
8584    REG and REG2 of mode MODE to location BASE with write-back adjusting
8585    the stack location BASE by ADJUSTMENT.  */
8586 
8587 static rtx
8588 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8589 			  HOST_WIDE_INT adjustment)
8590 {
8591   switch (mode)
8592     {
8593     case E_DImode:
8594       return gen_storewb_pairdi_di (base, base, reg, reg2,
8595 				    GEN_INT (-adjustment),
8596 				    GEN_INT (UNITS_PER_WORD - adjustment));
8597     case E_DFmode:
8598       return gen_storewb_pairdf_di (base, base, reg, reg2,
8599 				    GEN_INT (-adjustment),
8600 				    GEN_INT (UNITS_PER_WORD - adjustment));
8601     case E_TFmode:
8602       return gen_storewb_pairtf_di (base, base, reg, reg2,
8603 				    GEN_INT (-adjustment),
8604 				    GEN_INT (UNITS_PER_VREG - adjustment));
8605     case E_V16QImode:
8606       return gen_storewb_pairv16qi_di (base, base, reg, reg2,
8607 				       GEN_INT (-adjustment),
8608 				       GEN_INT (UNITS_PER_VREG - adjustment));
8609     default:
8610       gcc_unreachable ();
8611     }
8612 }
8613 
8614 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8615    stack pointer by ADJUSTMENT.  */
8616 
8617 static void
8618 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8619 {
8620   rtx_insn *insn;
8621   machine_mode mode = aarch64_reg_save_mode (regno1);
8622 
8623   if (regno2 == INVALID_REGNUM)
8624     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8625 
8626   rtx reg1 = gen_rtx_REG (mode, regno1);
8627   rtx reg2 = gen_rtx_REG (mode, regno2);
8628 
8629   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8630 					      reg2, adjustment));
8631   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8632   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8633   RTX_FRAME_RELATED_P (insn) = 1;
8634 }
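
/* For example, aarch64_push_regs (R29_REGNUM, R30_REGNUM, 32) saves the
   frame record with a single write-back store pair:

	stp	x29, x30, [sp, -32]!  */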
8635 
8636 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8637    adjusting it by ADJUSTMENT afterwards.  */
8638 
8639 static rtx
8640 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8641 			 HOST_WIDE_INT adjustment)
8642 {
8643   switch (mode)
8644     {
8645     case E_DImode:
8646       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8647 				   GEN_INT (UNITS_PER_WORD));
8648     case E_DFmode:
8649       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8650 				   GEN_INT (UNITS_PER_WORD));
8651     case E_TFmode:
8652       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8653 				   GEN_INT (UNITS_PER_VREG));
8654     case E_V16QImode:
8655       return gen_loadwb_pairv16qi_di (base, base, reg, reg2,
8656 				      GEN_INT (adjustment),
8657 				      GEN_INT (UNITS_PER_VREG));
8658     default:
8659       gcc_unreachable ();
8660     }
8661 }
8662 
8663 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8664    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8665    into CFI_OPS.  */
8666 
8667 static void
8668 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8669 		  rtx *cfi_ops)
8670 {
8671   machine_mode mode = aarch64_reg_save_mode (regno1);
8672   rtx reg1 = gen_rtx_REG (mode, regno1);
8673 
8674   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8675 
8676   if (regno2 == INVALID_REGNUM)
8677     {
8678       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8679       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8680       emit_move_insn (reg1, gen_frame_mem (mode, mem));
8681     }
8682   else
8683     {
8684       rtx reg2 = gen_rtx_REG (mode, regno2);
8685       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8686       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8687 					  reg2, adjustment));
8688     }
8689 }
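
/* The matching epilogue call, aarch64_pop_regs (R29_REGNUM, R30_REGNUM,
   32, &cfi_ops), restores the pair with a post-indexed load and queues
   REG_CFA_RESTORE notes for both registers:

	ldp	x29, x30, [sp], 32  */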
8690 
8691 /* Generate and return a store pair instruction of mode MODE to store
8692    register REG1 to MEM1 and register REG2 to MEM2.  */
8693 
8694 static rtx
8695 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8696 			rtx reg2)
8697 {
8698   switch (mode)
8699     {
8700     case E_DImode:
8701       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8702 
8703     case E_DFmode:
8704       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8705 
8706     case E_TFmode:
8707       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8708 
8709     case E_V4SImode:
8710       return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8711 
8712     case E_V16QImode:
8713       return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8714 
8715     default:
8716       gcc_unreachable ();
8717     }
8718 }
8719 
8720 /* Generate and return a load pair instruction of mode MODE to load register
8721    REG1 from MEM1 and register REG2 from MEM2.  */
8722 
8723 static rtx
8724 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8725 		       rtx mem2)
8726 {
8727   switch (mode)
8728     {
8729     case E_DImode:
8730       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8731 
8732     case E_DFmode:
8733       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8734 
8735     case E_TFmode:
8736       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8737 
8738     case E_V4SImode:
8739       return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8740 
8741     case E_V16QImode:
8742       return gen_load_pairv16qiv16qi (reg1, mem1, reg2, mem2);
8743 
8744     default:
8745       gcc_unreachable ();
8746     }
8747 }
8748 
8749 /* Return TRUE if return address signing should be enabled for the current
8750    function, otherwise return FALSE.  */
8751 
8752 bool
8753 aarch64_return_address_signing_enabled (void)
8754 {
8755   /* This function should only be called after the frame has been laid out.  */
8756   gcc_assert (cfun->machine->frame.laid_out);
8757 
8758   /* Turn return address signing off in any function that uses
8759      __builtin_eh_return.  The address passed to __builtin_eh_return
8760      is not signed so either it has to be signed (with original sp)
8761      or the code path that uses it has to avoid authenticating it.
8762      Currently eh return introduces a return to anywhere gadget, no
8763      matter what we do here since it uses ret with user provided
8764      address. An ideal fix for that is to use indirect branch which
8765      can be protected with BTI j (to some extent).  */
8766   if (crtl->calls_eh_return)
8767     return false;
8768 
8769   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
8770      if its LR is pushed onto stack.  */
8771   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
8772 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
8773 	      && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8774 }
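
/* For reference, -mbranch-protection=pac-ret selects
   AARCH64_FUNCTION_NON_LEAF, while pac-ret+leaf selects
   AARCH64_FUNCTION_ALL and so also signs leaf functions that never
   save LR.  */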
8775 
8776 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
8777 bool
8778 aarch64_bti_enabled (void)
8779 {
8780   return (aarch64_enable_bti == 1);
8781 }
8782 
8783 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8784    register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8785    the range [1, 16] * GET_MODE_SIZE (MODE).  Prepare for this by:
8786 
8787      (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8788 	 or LD1D address
8789 
8790      (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8791 	 if the variable isn't already nonnull
8792 
8793    (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8794    Handle this case using a temporary base register that is suitable for
8795    all offsets in that range.  Use ANCHOR_REG as this base register if it
8796    is nonnull, otherwise create a new register and store it in ANCHOR_REG.  */
8797 
8798 static inline void
8799 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8800 				     rtx &anchor_reg, poly_int64 &offset,
8801 				     rtx &ptrue)
8802 {
8803   if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8804     {
8805       /* This is the maximum valid offset of the anchor from the base.
8806 	 Lower values would be valid too.  */
8807       poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8808       if (!anchor_reg)
8809 	{
8810 	  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8811 	  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8812 				    gen_int_mode (anchor_offset, Pmode)));
8813 	}
8814       base_rtx = anchor_reg;
8815       offset -= anchor_offset;
8816     }
8817   if (!ptrue)
8818     {
8819       int pred_reg = cfun->machine->frame.spare_pred_reg;
8820       emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8821 		      CONSTM1_RTX (VNx16BImode));
8822       ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8823     }
8824 }
8825 
8826 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8827    is saved at BASE + OFFSET.  */
8828 
8829 static void
8830 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8831 			    rtx base, poly_int64 offset)
8832 {
8833   rtx mem = gen_frame_mem (GET_MODE (reg),
8834 			   plus_constant (Pmode, base, offset));
8835   add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8836 }
8837 
8838 /* Emit code to save the callee-saved registers from register number START
8839    to LIMIT to the stack.  The stack pointer is currently BYTES_BELOW_SP
8840    bytes above the bottom of the static frame.  Skip any write-back
8841    candidates if SKIP_WB is true.  HARD_FP_VALID_P is true if the hard
8842    frame pointer has been set up.  */
8843 
8844 static void
8845 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8846 			   unsigned start, unsigned limit, bool skip_wb,
8847 			   bool hard_fp_valid_p)
8848 {
8849   aarch64_frame &frame = cfun->machine->frame;
8850   rtx_insn *insn;
8851   unsigned regno;
8852   unsigned regno2;
8853   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8854 
8855   for (regno = aarch64_next_callee_save (start, limit);
8856        regno <= limit;
8857        regno = aarch64_next_callee_save (regno + 1, limit))
8858     {
8859       rtx reg, mem;
8860       poly_int64 offset;
8861       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8862 
8863       if (skip_wb
8864 	  && (regno == frame.wb_push_candidate1
8865 	      || regno == frame.wb_push_candidate2))
8866 	continue;
8867 
8868       if (cfun->machine->reg_is_wrapped_separately[regno])
8869 	continue;
8870 
8871       machine_mode mode = aarch64_reg_save_mode (regno);
8872       reg = gen_rtx_REG (mode, regno);
8873       offset = frame.reg_offset[regno] - bytes_below_sp;
8874       rtx base_rtx = stack_pointer_rtx;
8875       poly_int64 sp_offset = offset;
8876 
8877       HOST_WIDE_INT const_offset;
8878       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8879 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8880 					     offset, ptrue);
8881       else if (GP_REGNUM_P (regno)
8882 	       && (!offset.is_constant (&const_offset) || const_offset >= 512))
8883 	{
8884 	  poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8885 	  if (hard_fp_valid_p)
8886 	    base_rtx = hard_frame_pointer_rtx;
8887 	  else
8888 	    {
8889 	      if (!anchor_reg)
8890 		{
8891 		  anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8892 		  emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8893 					    gen_int_mode (fp_offset, Pmode)));
8894 		}
8895 	      base_rtx = anchor_reg;
8896 	    }
8897 	  offset -= fp_offset;
8898 	}
8899       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8900       bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
8901 
8902       if (!aarch64_sve_mode_p (mode)
8903 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8904 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
8905 	  && known_eq (GET_MODE_SIZE (mode),
8906 		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
8907 	{
8908 	  rtx reg2 = gen_rtx_REG (mode, regno2);
8909 	  rtx mem2;
8910 
8911 	  offset += GET_MODE_SIZE (mode);
8912 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8913 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
8914 						    reg2));
8915 
8916 	  /* The first part of a frame-related parallel insn is
8917 	     always assumed to be relevant to the frame
8918 	     calculations; subsequent parts are only
8919 	     frame-related if explicitly marked.  */
8920 	  if (aarch64_emit_cfi_for_reg_p (regno2))
8921 	    {
8922 	      if (need_cfa_note_p)
8923 		aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
8924 					    sp_offset + GET_MODE_SIZE (mode));
8925 	      else
8926 		RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8927 	    }
8928 
8929 	  regno = regno2;
8930 	}
8931       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8932 	{
8933 	  insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
8934 	  need_cfa_note_p = true;
8935 	}
8936       else if (aarch64_sve_mode_p (mode))
8937 	insn = emit_insn (gen_rtx_SET (mem, reg));
8938       else
8939 	insn = emit_move_insn (mem, reg);
8940 
8941       RTX_FRAME_RELATED_P (insn) = frame_related_p;
8942       if (frame_related_p && need_cfa_note_p)
8943 	aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
8944     }
8945 }
8946 
8947 /* Emit code to restore the callee registers from register number START
8948    up to and including LIMIT.  The stack pointer is currently BYTES_BELOW_SP
8949    bytes above the bottom of the static frame.  Skip any write-back
8950    candidates if SKIP_WB is true.  Write the appropriate REG_CFA_RESTORE
8951    notes into CFI_OPS.  */
8952 
8953 static void
8954 aarch64_restore_callee_saves (poly_int64 bytes_below_sp, unsigned start,
8955 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
8956 {
8957   aarch64_frame &frame = cfun->machine->frame;
8958   unsigned regno;
8959   unsigned regno2;
8960   poly_int64 offset;
8961   rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8962 
8963   for (regno = aarch64_next_callee_save (start, limit);
8964        regno <= limit;
8965        regno = aarch64_next_callee_save (regno + 1, limit))
8966     {
8967       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8968       if (cfun->machine->reg_is_wrapped_separately[regno])
8969 	continue;
8970 
8971       rtx reg, mem;
8972 
8973       if (skip_wb
8974 	  && (regno == frame.wb_pop_candidate1
8975 	      || regno == frame.wb_pop_candidate2))
8976 	continue;
8977 
8978       machine_mode mode = aarch64_reg_save_mode (regno);
8979       reg = gen_rtx_REG (mode, regno);
8980       offset = frame.reg_offset[regno] - bytes_below_sp;
8981       rtx base_rtx = stack_pointer_rtx;
8982       if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8983 	aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8984 					     offset, ptrue);
8985       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8986 
8987       if (!aarch64_sve_mode_p (mode)
8988 	  && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
8989 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
8990 	  && known_eq (GET_MODE_SIZE (mode),
8991 		       frame.reg_offset[regno2] - frame.reg_offset[regno]))
8992 	{
8993 	  rtx reg2 = gen_rtx_REG (mode, regno2);
8994 	  rtx mem2;
8995 
8996 	  offset += GET_MODE_SIZE (mode);
8997 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8998 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8999 
9000 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9001 	  regno = regno2;
9002 	}
9003       else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9004 	emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9005       else if (aarch64_sve_mode_p (mode))
9006 	emit_insn (gen_rtx_SET (reg, mem));
9007       else
9008 	emit_move_insn (reg, mem);
9009       if (frame_related_p)
9010 	*cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9011     }
9012 }
9013 
9014 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9015    of MODE.  */
9016 
9017 static inline bool
9018 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9019 {
9020   HOST_WIDE_INT multiple;
9021   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9022 	  && IN_RANGE (multiple, -8, 7));
9023 }
9024 
9025 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9026    of MODE.  */
9027 
9028 static inline bool
9029 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9030 {
9031   HOST_WIDE_INT multiple;
9032   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9033 	  && IN_RANGE (multiple, -32, 31));
9034 }
9035 
9036 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9037    of MODE.  */
9038 
9039 static inline bool
9040 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9041 {
9042   HOST_WIDE_INT multiple;
9043   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9044 	  && IN_RANGE (multiple, 0, 63));
9045 }
9046 
9047 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9048    of MODE.  */
9049 
9050 bool
9051 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9052 {
9053   HOST_WIDE_INT multiple;
9054   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9055 	  && IN_RANGE (multiple, -64, 63));
9056 }
9057 
9058 /* Return true if OFFSET is a signed 9-bit value.  */
9059 
9060 bool
9061 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9062 				       poly_int64 offset)
9063 {
9064   HOST_WIDE_INT const_offset;
9065   return (offset.is_constant (&const_offset)
9066 	  && IN_RANGE (const_offset, -256, 255));
9067 }
9068 
9069 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9070    of MODE.  */
9071 
9072 static inline bool
9073 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9074 {
9075   HOST_WIDE_INT multiple;
9076   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9077 	  && IN_RANGE (multiple, -256, 255));
9078 }
9079 
9080 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9081    of MODE.  */
9082 
9083 static inline bool
9084 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9085 {
9086   HOST_WIDE_INT multiple;
9087   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9088 	  && IN_RANGE (multiple, 0, 4095));
9089 }
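
/* As a concrete illustration of the predicates above: for an 8-byte mode such
   as DImode, aarch64_offset_7bit_signed_scaled_p accepts multiples of 8 in
   [-512, 504], which corresponds to the LDP/STP immediate range, while
   offset_12bit_unsigned_scaled_p accepts multiples of 8 in [0, 32760],
   matching the unsigned-offset form of LDR/STR.  */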
9090 
9091 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
9092 
9093 static sbitmap
9094 aarch64_get_separate_components (void)
9095 {
9096   aarch64_frame &frame = cfun->machine->frame;
9097   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9098   bitmap_clear (components);
9099 
9100   /* The registers we need saved to the frame.  */
9101   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9102     if (aarch64_register_saved_on_entry (regno))
9103       {
9104 	/* Punt on saves and restores that use ST1D and LD1D.  We could
9105 	   try to be smarter, but it would involve making sure that the
9106 	   spare predicate register itself is safe to use at the save
9107 	   and restore points.  Also, when a frame pointer is being used,
9108 	   the slots are often out of reach of ST1D and LD1D anyway.  */
9109 	machine_mode mode = aarch64_reg_save_mode (regno);
9110 	if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9111 	  continue;
9112 
9113 	poly_int64 offset = frame.reg_offset[regno];
9114 
9115 	/* Get the offset relative to the register we'll use.  */
9116 	if (frame_pointer_needed)
9117 	  offset -= frame.bytes_below_hard_fp;
9118 
9119 	/* Check that we can access the stack slot of the register with one
9120 	   direct load with no adjustments needed.  */
9121 	if (aarch64_sve_mode_p (mode)
9122 	    ? offset_9bit_signed_scaled_p (mode, offset)
9123 	    : offset_12bit_unsigned_scaled_p (mode, offset))
9124 	  bitmap_set_bit (components, regno);
9125       }
9126 
9127   /* Don't mess with the hard frame pointer.  */
9128   if (frame_pointer_needed)
9129     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9130 
9131   /* If the spare predicate register used by big-endian SVE code
9132      is call-preserved, it must be saved in the main prologue
9133      before any saves that use it.  */
9134   if (frame.spare_pred_reg != INVALID_REGNUM)
9135     bitmap_clear_bit (components, frame.spare_pred_reg);
9136 
9137   unsigned reg1 = frame.wb_push_candidate1;
9138   unsigned reg2 = frame.wb_push_candidate2;
9139   /* If registers have been chosen to be stored/restored with
9140      writeback, don't interfere with them to avoid having to output explicit
9141      stack adjustment instructions.  */
9142   if (reg2 != INVALID_REGNUM)
9143     bitmap_clear_bit (components, reg2);
9144   if (reg1 != INVALID_REGNUM)
9145     bitmap_clear_bit (components, reg1);
9146 
9147   bitmap_clear_bit (components, LR_REGNUM);
9148   bitmap_clear_bit (components, SP_REGNUM);
9149   if (flag_stack_clash_protection)
9150     {
9151       if (frame.sve_save_and_probe != INVALID_REGNUM)
9152 	bitmap_clear_bit (components, frame.sve_save_and_probe);
9153       if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
9154 	bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
9155     }
9156 
9157   return components;
9158 }
9159 
9160 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
9161 
9162 static sbitmap
9163 aarch64_components_for_bb (basic_block bb)
9164 {
9165   bitmap in = DF_LIVE_IN (bb);
9166   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9167   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9168 
9169   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9170   bitmap_clear (components);
9171 
9172   /* Clobbered registers don't generate values in any meaningful sense,
9173      since nothing after the clobber can rely on their value.  And we can't
9174      say that partially-clobbered registers are unconditionally killed,
9175      because whether they're killed or not depends on the mode of the
9176      value they're holding.  Thus partially call-clobbered registers
9177      appear in neither the kill set nor the gen set.
9178 
9179      Check manually for any calls that clobber more of a register than the
9180      current function can.  */
9181   function_abi_aggregator callee_abis;
9182   rtx_insn *insn;
9183   FOR_BB_INSNS (bb, insn)
9184     if (CALL_P (insn))
9185       callee_abis.note_callee_abi (insn_callee_abi (insn));
9186   HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9187 
9188   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
9189   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9190     if (!fixed_regs[regno]
9191 	&& !crtl->abi->clobbers_full_reg_p (regno)
9192 	&& (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9193 	    || bitmap_bit_p (in, regno)
9194 	    || bitmap_bit_p (gen, regno)
9195 	    || bitmap_bit_p (kill, regno)))
9196       {
9197 	bitmap_set_bit (components, regno);
9198 
9199 	/* If there is a callee-save at an adjacent offset, add it too
9200 	   to increase the use of LDP/STP.  */
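	/* For example, if x24 sits at offset 32 and x25 at offset 40, handling
	   either register also marks the other, so the pair can later be
	   saved and restored with a single STP/LDP at offset 32.  */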
9201 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9202 	unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9203 
9204 	if (regno2 <= LAST_SAVED_REGNUM)
9205 	  {
9206 	    poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9207 	    if (regno < regno2
9208 		? known_eq (offset + 8, offset2)
9209 		: multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9210 	      bitmap_set_bit (components, regno2);
9211 	  }
9212       }
9213 
9214   return components;
9215 }
9216 
9217 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9218    Nothing to do for aarch64.  */
9219 
9220 static void
9221 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9222 {
9223 }
9224 
9225 /* Return the next set bit in BMP from START onwards.  Return the total number
9226    of bits in BMP if no set bit is found at or after START.  */
9227 
9228 static unsigned int
9229 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9230 {
9231   unsigned int nbits = SBITMAP_SIZE (bmp);
9232   if (start == nbits)
9233     return start;
9234 
9235   gcc_assert (start < nbits);
9236   for (unsigned int i = start; i < nbits; i++)
9237     if (bitmap_bit_p (bmp, i))
9238       return i;
9239 
9240   return nbits;
9241 }
9242 
9243 /* Do the work for aarch64_emit_prologue_components and
9244    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
9245    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9246    for these components or the epilogue sequence.  That is, it determines
9247    whether we should emit stores or loads and what kind of CFA notes to attach
9248    to the insns.  Otherwise the logic for the two sequences is very
9249    similar.  */
9250 
9251 static void
9252 aarch64_process_components (sbitmap components, bool prologue_p)
9253 {
9254   aarch64_frame &frame = cfun->machine->frame;
9255   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9256 			     ? HARD_FRAME_POINTER_REGNUM
9257 			     : STACK_POINTER_REGNUM);
9258 
9259   unsigned last_regno = SBITMAP_SIZE (components);
9260   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9261   rtx_insn *insn = NULL;
9262 
9263   while (regno != last_regno)
9264     {
9265       bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9266       machine_mode mode = aarch64_reg_save_mode (regno);
9267 
9268       rtx reg = gen_rtx_REG (mode, regno);
9269       poly_int64 offset = frame.reg_offset[regno];
9270       if (frame_pointer_needed)
9271 	offset -= frame.bytes_below_hard_fp;
9272 
9273       rtx addr = plus_constant (Pmode, ptr_reg, offset);
9274       rtx mem = gen_frame_mem (mode, addr);
9275 
9276       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9277       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9278       /* No more registers to handle after REGNO.
9279 	 Emit a single save/restore and exit.  */
9280       if (regno2 == last_regno)
9281 	{
9282 	  insn = emit_insn (set);
9283 	  if (frame_related_p)
9284 	    {
9285 	      RTX_FRAME_RELATED_P (insn) = 1;
9286 	      if (prologue_p)
9287 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9288 	      else
9289 		add_reg_note (insn, REG_CFA_RESTORE, reg);
9290 	    }
9291 	  break;
9292 	}
9293 
9294       poly_int64 offset2 = frame.reg_offset[regno2];
9295       /* The next register is not of the same class or its offset is not
9296 	 mergeable with the current one into a pair.  */
9297       if (aarch64_sve_mode_p (mode)
9298 	  || !satisfies_constraint_Ump (mem)
9299 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9300 	  || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9301 	  || maybe_ne ((offset2 - frame.reg_offset[regno]),
9302 		       GET_MODE_SIZE (mode)))
9303 	{
9304 	  insn = emit_insn (set);
9305 	  if (frame_related_p)
9306 	    {
9307 	      RTX_FRAME_RELATED_P (insn) = 1;
9308 	      if (prologue_p)
9309 		add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9310 	      else
9311 		add_reg_note (insn, REG_CFA_RESTORE, reg);
9312 	    }
9313 
9314 	  regno = regno2;
9315 	  continue;
9316 	}
9317 
9318       bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9319 
9320       /* REGNO2 can be saved/restored in a pair with REGNO.  */
9321       rtx reg2 = gen_rtx_REG (mode, regno2);
9322       if (frame_pointer_needed)
9323 	offset2 -= frame.bytes_below_hard_fp;
9324       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9325       rtx mem2 = gen_frame_mem (mode, addr2);
9326       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9327 			     : gen_rtx_SET (reg2, mem2);
9328 
9329       if (prologue_p)
9330 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9331       else
9332 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9333 
9334       if (frame_related_p || frame_related2_p)
9335 	{
9336 	  RTX_FRAME_RELATED_P (insn) = 1;
9337 	  if (prologue_p)
9338 	    {
9339 	      if (frame_related_p)
9340 		add_reg_note (insn, REG_CFA_OFFSET, set);
9341 	      if (frame_related2_p)
9342 		add_reg_note (insn, REG_CFA_OFFSET, set2);
9343 	    }
9344 	  else
9345 	    {
9346 	      if (frame_related_p)
9347 		add_reg_note (insn, REG_CFA_RESTORE, reg);
9348 	      if (frame_related2_p)
9349 		add_reg_note (insn, REG_CFA_RESTORE, reg2);
9350 	    }
9351 	}
9352 
9353       regno = aarch64_get_next_set_bit (components, regno2 + 1);
9354     }
9355 }
9356 
9357 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
9358 
9359 static void
9360 aarch64_emit_prologue_components (sbitmap components)
9361 {
9362   aarch64_process_components (components, true);
9363 }
9364 
9365 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
9366 
9367 static void
9368 aarch64_emit_epilogue_components (sbitmap components)
9369 {
9370   aarch64_process_components (components, false);
9371 }
9372 
9373 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
9374 
9375 static void
9376 aarch64_set_handled_components (sbitmap components)
9377 {
9378   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9379     if (bitmap_bit_p (components, regno))
9380       cfun->machine->reg_is_wrapped_separately[regno] = true;
9381 }
9382 
9383 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
9384    determine the probe offset for alloca.  */
9385 
9386 static HOST_WIDE_INT
9387 aarch64_stack_clash_protection_alloca_probe_range (void)
9388 {
9389   return STACK_CLASH_CALLER_GUARD;
9390 }
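
/* STACK_CLASH_CALLER_GUARD is the 1KB ABI defined buffer referred to in the
   surrounding comments: the portion of the guard that outgoing arguments may
   use without further probing.  */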
9391 
9392 
9393 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9394    registers.  If POLY_SIZE is not large enough to require a probe this function
9395    will only adjust the stack.  When allocating the stack space
9396    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9397    FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9398    the saved registers.  If we are then we ensure that any allocation
9399    larger than the ABI defined buffer needs a probe so that the
9400    invariant of having a 1KB buffer is maintained.
9401 
9402    We emit barriers after each stack adjustment to prevent optimizations from
9403    breaking the invariant that we never drop the stack more than a page.  This
9404    invariant is needed to make it easier to correctly handle asynchronous
9405    events: if we were to allow the stack to be dropped by more than a page
9406    and only probe afterwards, and a signal arrived somewhere in between,
9407    the signal handler would not know the state of the stack and could make no
9408    assumptions about which pages have been probed.  */
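
/* A worked example with the default configuration: guard_size is 64KB and
   STACK_CLASH_CALLER_GUARD is 1KB, so a non-final allocation is only probed
   once it reaches 64KB - 1KB = 63KB, matching the "up to 63KB without
   probing" figure in the frame layout comments below, while a final
   allocation (the area below the saved registers) is probed once it reaches
   1KB plus one 16-byte stack-alignment quantum.  */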
9409 
9410 static void
9411 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9412 					poly_int64 poly_size,
9413 					bool frame_related_p,
9414 					bool final_adjustment_p)
9415 {
9416   aarch64_frame &frame = cfun->machine->frame;
9417   HOST_WIDE_INT guard_size
9418     = 1 << param_stack_clash_protection_guard_size;
9419   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9420   HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9421   gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9422   HOST_WIDE_INT min_probe_threshold
9423     = (final_adjustment_p
9424        ? guard_used_by_caller + byte_sp_alignment
9425        : guard_size - guard_used_by_caller);
9426   poly_int64 frame_size = frame.frame_size;
9427 
9428   /* We should always have a positive probe threshold.  */
9429   gcc_assert (min_probe_threshold > 0);
9430 
9431   if (flag_stack_clash_protection && !final_adjustment_p)
9432     {
9433       poly_int64 initial_adjust = frame.initial_adjust;
9434       poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9435       poly_int64 final_adjust = frame.final_adjust;
9436 
9437       if (known_eq (frame_size, 0))
9438 	{
9439 	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9440 	}
9441       else if (known_lt (initial_adjust + sve_callee_adjust,
9442 			 guard_size - guard_used_by_caller)
9443 	       && known_lt (final_adjust, guard_used_by_caller))
9444 	{
9445 	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9446 	}
9447     }
9448 
9449   /* If SIZE is not large enough to require probing, just adjust the stack and
9450      exit.  */
9451   if (known_lt (poly_size, min_probe_threshold)
9452       || !flag_stack_clash_protection)
9453     {
9454       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9455       return;
9456     }
9457 
9458   HOST_WIDE_INT size;
9459   /* Handle the SVE non-constant case first.  */
9460   if (!poly_size.is_constant (&size))
9461     {
9462       if (dump_file)
9463 	{
9464 	  fprintf (dump_file, "Stack clash SVE prologue: ");
9465 	  print_dec (poly_size, dump_file);
9466 	  fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9467 	}
9468 
9469       /* First calculate the amount of bytes we're actually spilling.  */
9470       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9471 			  poly_size, temp1, temp2, false, true);
9472 
9473       rtx_insn *insn = get_last_insn ();
9474 
9475       if (frame_related_p)
9476 	{
9477 	  /* This is done to provide unwinding information for the stack
9478 	     adjustments we're about to do.  However, to prevent the optimizers
9479 	     from removing the R11 move and leaving the CFA note (which would be
9480 	     very wrong), we tie the old and new stack pointers together.
9481 	     The tie will expand to nothing but the optimizers will not touch
9482 	     the instruction.  */
9483 	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9484 	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9485 	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9486 
9487 	  /* We want the CFA independent of the stack pointer for the
9488 	     duration of the loop.  */
9489 	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9490 	  RTX_FRAME_RELATED_P (insn) = 1;
9491 	}
9492 
9493       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9494       rtx guard_const = gen_int_mode (guard_size, Pmode);
9495 
9496       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9497 						   stack_pointer_rtx, temp1,
9498 						   probe_const, guard_const));
9499 
9500       /* Now reset the CFA register if needed.  */
9501       if (frame_related_p)
9502 	{
9503 	  add_reg_note (insn, REG_CFA_DEF_CFA,
9504 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9505 				      gen_int_mode (poly_size, Pmode)));
9506 	  RTX_FRAME_RELATED_P (insn) = 1;
9507 	}
9508 
9509       return;
9510     }
9511 
9512   if (dump_file)
9513     fprintf (dump_file,
9514 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9515 	     " bytes, probing will be required.\n", size);
9516 
9517   /* Round size to the nearest multiple of guard_size, and calculate the
9518      residual as the difference between the original size and the rounded
9519      size.  */
9520   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9521   HOST_WIDE_INT residual = size - rounded_size;
9522 
9523   /* We can handle a small number of allocations/probes inline.  Otherwise
9524      punt to a loop.  */
9525   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9526     {
9527       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9528 	{
9529 	  aarch64_sub_sp (NULL, temp2, guard_size, true);
9530 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9531 					   guard_used_by_caller));
9532 	  emit_insn (gen_blockage ());
9533 	}
9534       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9535     }
9536   else
9537     {
9538       /* Compute the ending address.  */
9539       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9540 			  temp1, NULL, false, true);
9541       rtx_insn *insn = get_last_insn ();
9542 
9543       /* For the initial allocation, we don't have a frame pointer
9544 	 set up, so we always need CFI notes.  If we're doing the
9545 	 final allocation, then we may have a frame pointer, in which
9546 	 case it is the CFA, otherwise we need CFI notes.
9547 
9548 	 We can determine which allocation we are doing by looking at
9549 	 the value of FRAME_RELATED_P since the final allocations are not
9550 	 frame related.  */
9551       if (frame_related_p)
9552 	{
9553 	  /* We want the CFA independent of the stack pointer for the
9554 	     duration of the loop.  */
9555 	  add_reg_note (insn, REG_CFA_DEF_CFA,
9556 			plus_constant (Pmode, temp1, rounded_size));
9557 	  RTX_FRAME_RELATED_P (insn) = 1;
9558 	}
9559 
9560       /* This allocates and probes the stack.  Note that this re-uses some of
9561 	 the existing Ada stack protection code.  However we are guaranteed not
9562 	 to enter the non-loop or residual branches of that code.
9563 
9564 	 The non-loop part won't be entered because if our allocation amount
9565 	 doesn't require a loop, the case above would handle it.
9566 
9567 	 The residual amount won't be entered because TEMP1 is a multiple of
9568 	 the allocation size.  The residual will always be 0.  As such, the only
9569 	 part we are actually using from that code is the loop setup.  The
9570 	 actual probing is done in aarch64_output_probe_stack_range.  */
9571       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9572 					       stack_pointer_rtx, temp1));
9573 
9574       /* Now reset the CFA register if needed.  */
9575       if (frame_related_p)
9576 	{
9577 	  add_reg_note (insn, REG_CFA_DEF_CFA,
9578 			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9579 	  RTX_FRAME_RELATED_P (insn) = 1;
9580 	}
9581 
9582       emit_insn (gen_blockage ());
9583       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9584     }
9585 
9586   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
9587      be probed.  This maintains the requirement that each page is probed at
9588      least once.  For initial probing we probe only if the allocation is
9589      more than GUARD_SIZE - buffer, and below the saved registers we probe
9590      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
9591      GUARD_SIZE.  This works because any allocation that is large enough to
9592      trigger a probe here will get at least one, and if it is not large
9593      enough for this code to emit anything for it, the page would have been
9594      probed by the saving of FP/LR either by this function or any callees.  If
9595      we don't have any callees then we won't have more stack adjustments and so
9596      are still safe.  */
9597   if (residual)
9598     {
9599       gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9600 
9601       /* If we're doing final adjustments, and we've done any full page
9602 	 allocations then any residual needs to be probed.  */
9603       if (final_adjustment_p && rounded_size != 0)
9604 	min_probe_threshold = 0;
9605 
9606       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9607       if (residual >= min_probe_threshold)
9608 	{
9609 	  if (dump_file)
9610 	    fprintf (dump_file,
9611 		     "Stack clash AArch64 prologue residuals: "
9612 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9613 		     "\n", residual);
9614 
9615 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9616 					   guard_used_by_caller));
9617 	  emit_insn (gen_blockage ());
9618 	}
9619     }
9620 }
9621 
9622 /* Return 1 if the register is used by the epilogue.  We need to say the
9623    return register is used, but only after epilogue generation is complete.
9624    Note that in the case of sibcalls, the values "used by the epilogue" are
9625    considered live at the start of the called function.
9626 
9627    For SIMD functions we need to return 1 for FP registers that are saved and
9628    restored by a function but are not zero in call_used_regs.  If we do not do
9629    this, optimizations may remove the restore of the register.  */
9630 
9631 int
9632 aarch64_epilogue_uses (int regno)
9633 {
9634   if (epilogue_completed)
9635     {
9636       if (regno == LR_REGNUM)
9637 	return 1;
9638     }
9639   return 0;
9640 }
9641 
9642 /* AArch64 stack frames generated by this compiler look like:
9643 
9644 	+-------------------------------+
9645 	|                               |
9646 	|  incoming stack arguments     |
9647 	|                               |
9648 	+-------------------------------+
9649 	|                               | <-- incoming stack pointer (aligned)
9650 	|  callee-allocated save area   |
9651 	|  for register varargs         |
9652 	|                               |
9653 	+-------------------------------+
9654 	|  local variables (1)          | <-- frame_pointer_rtx
9655 	|                               |
9656 	+-------------------------------+
9657 	|  padding (1)                  |
9658 	+-------------------------------+
9659 	|  callee-saved registers       |
9660 	+-------------------------------+
9661 	|  LR'                          |
9662 	+-------------------------------+
9663 	|  FP'                          |
9664 	+-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9665 	|  SVE vector registers         |
9666 	+-------------------------------+
9667 	|  SVE predicate registers      |
9668 	+-------------------------------+
9669 	|  local variables (2)          |
9670 	+-------------------------------+
9671 	|  padding (2)                  |
9672 	+-------------------------------+
9673 	|  dynamic allocation           |
9674 	+-------------------------------+
9675 	|  padding                      |
9676 	+-------------------------------+
9677 	|  outgoing stack arguments     | <-- arg_pointer
9678         |                               |
9679 	+-------------------------------+
9680 	|                               | <-- stack_pointer_rtx (aligned)
9681 
9682    The regions marked (1) and (2) are mutually exclusive.  (2) is used
9683    when aarch64_save_regs_above_locals_p is true.
9684 
9685    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9686    but leave frame_pointer_rtx and hard_frame_pointer_rtx
9687    unchanged.
9688 
9689    By default for stack-clash we assume the guard is at least 64KB, but this
9690    value is configurable to either 4KB or 64KB.  We also force the guard size to
9691    be the same as the probing interval and both values are kept in sync.
9692 
9693    With those assumptions the callee can allocate up to 63KB (or 3KB depending
9694    on the guard size) of stack space without probing.
9695 
9696    When probing is needed, we emit a probe at the start of the prologue
9697    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9698 
9699    We can also use register saves as probes.  These are stored in
9700    sve_save_and_probe and hard_fp_save_and_probe.
9701 
9702    For outgoing arguments we probe if the size is larger than 1KB, such that
9703    the ABI specified buffer is maintained for the next callee.
9704 
9705    The following registers are reserved during frame layout and should not be
9706    used for any other purpose:
9707 
9708    - r11: Used by stack clash protection when SVE is enabled, and also
9709 	  as an anchor register when saving and restoring registers
9710    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9711    - r14 and r15: Used for speculation tracking.
9712    - r16(IP0), r17(IP1): Used by indirect tailcalls.
9713    - r30(LR), r29(FP): Used by standard frame layout.
9714 
9715    These registers must be avoided in frame layout related code unless the
9716    explicit intention is to interact with one of the features listed above.  */
9717 
9718 /* Generate the prologue instructions for entry into a function.
9719    Establish the stack frame by decreasing the stack pointer with a
9720    properly calculated size and, if necessary, create a frame record
9721    filled with the values of LR and previous frame pointer.  The
9722    current FP is also set up if it is in use.  */
9723 
9724 void
9725 aarch64_expand_prologue (void)
9726 {
9727   aarch64_frame &frame = cfun->machine->frame;
9728   poly_int64 frame_size = frame.frame_size;
9729   poly_int64 initial_adjust = frame.initial_adjust;
9730   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9731   poly_int64 final_adjust = frame.final_adjust;
9732   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9733   unsigned reg1 = frame.wb_push_candidate1;
9734   unsigned reg2 = frame.wb_push_candidate2;
9735   bool emit_frame_chain = frame.emit_frame_chain;
9736   rtx_insn *insn;
9737 
9738   if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9739     {
9740       /* Fold the SVE allocation into the initial allocation.
9741 	 We don't do this in aarch64_layout_arg to avoid pessimizing
9742 	 the epilogue code.  */
9743       initial_adjust += sve_callee_adjust;
9744       sve_callee_adjust = 0;
9745     }
9746 
9747   /* Sign return address for functions.  */
9748   if (aarch64_return_address_signing_enabled ())
9749     {
9750       switch (aarch64_ra_sign_key)
9751 	{
9752 	  case AARCH64_KEY_A:
9753 	    insn = emit_insn (gen_paciasp ());
9754 	    break;
9755 	  case AARCH64_KEY_B:
9756 	    insn = emit_insn (gen_pacibsp ());
9757 	    break;
9758 	  default:
9759 	    gcc_unreachable ();
9760 	}
9761       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9762       RTX_FRAME_RELATED_P (insn) = 1;
9763     }
9764 
9765   /* Push return address to shadow call stack.  */
9766   if (frame.is_scs_enabled)
9767     emit_insn (gen_scs_push ());
9768 
9769   if (flag_stack_usage_info)
9770     current_function_static_stack_size = constant_lower_bound (frame_size);
9771 
9772   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9773     {
9774       if (crtl->is_leaf && !cfun->calls_alloca)
9775 	{
9776 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
9777 	      && maybe_gt (frame_size, get_stack_check_protect ()))
9778 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
9779 					    (frame_size
9780 					     - get_stack_check_protect ()));
9781 	}
9782       else if (maybe_gt (frame_size, 0))
9783 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9784     }
9785 
9786   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9787   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9788 
9789   /* In theory we should never have both an initial adjustment
9790      and a callee save adjustment.  Verify that is the case since the
9791      code below does not handle it for -fstack-clash-protection.  */
9792   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9793 
9794   /* Will only probe if the initial adjustment is larger than the guard
9795      less the amount of the guard reserved for use by the caller's
9796      outgoing args.  */
9797   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9798 					  true, false);
9799 
9800   if (callee_adjust != 0)
9801     aarch64_push_regs (reg1, reg2, callee_adjust);
9802 
9803   /* The offset of the current SP from the bottom of the static frame.  */
9804   poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9805 
9806   if (emit_frame_chain)
9807     {
9808       /* The offset of the frame chain record (if any) from the current SP.  */
9809       poly_int64 chain_offset = (initial_adjust + callee_adjust
9810 				 - frame.bytes_above_hard_fp);
9811       gcc_assert (known_ge (chain_offset, 0));
9812 
9813       if (callee_adjust == 0)
9814 	{
9815 	  reg1 = R29_REGNUM;
9816 	  reg2 = R30_REGNUM;
9817 	  aarch64_save_callee_saves (bytes_below_sp, reg1, reg2,
9818 				     false, false);
9819 	}
9820       else
9821 	gcc_assert (known_eq (chain_offset, 0));
9822       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9823 			  stack_pointer_rtx, chain_offset,
9824 			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
9825       if (frame_pointer_needed && !frame_size.is_constant ())
9826 	{
9827 	  /* Variable-sized frames need to describe the save slot
9828 	     address using DW_CFA_expression rather than DW_CFA_offset.
9829 	     This means that, without taking further action, the
9830 	     locations of the registers that we've already saved would
9831 	     remain based on the stack pointer even after we redefine
9832 	     the CFA based on the frame pointer.  We therefore need new
9833 	     DW_CFA_expressions to re-express the save slots with addresses
9834 	     based on the frame pointer.  */
9835 	  rtx_insn *insn = get_last_insn ();
9836 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
9837 
9838 	  /* Add an explicit CFA definition if this was previously
9839 	     implicit.  */
9840 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9841 	    {
9842 	      rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9843 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
9844 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
9845 	    }
9846 
9847 	  /* Change the save slot expressions for the registers that
9848 	     we've already saved.  */
9849 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9850 				      hard_frame_pointer_rtx, UNITS_PER_WORD);
9851 	  aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9852 				      hard_frame_pointer_rtx, 0);
9853 	}
9854       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
9855     }
9856 
9857   aarch64_save_callee_saves (bytes_below_sp, R0_REGNUM, R30_REGNUM,
9858 			     callee_adjust != 0 || emit_frame_chain,
9859 			     emit_frame_chain);
9860   if (maybe_ne (sve_callee_adjust, 0))
9861     {
9862       gcc_assert (!flag_stack_clash_protection
9863 		  || known_eq (initial_adjust, 0));
9864       aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9865 					      sve_callee_adjust,
9866 					      !frame_pointer_needed, false);
9867       bytes_below_sp -= sve_callee_adjust;
9868     }
9869   aarch64_save_callee_saves (bytes_below_sp, P0_REGNUM, P15_REGNUM,
9870 			     false, emit_frame_chain);
9871   aarch64_save_callee_saves (bytes_below_sp, V0_REGNUM, V31_REGNUM,
9872 			     callee_adjust != 0 || emit_frame_chain,
9873 			     emit_frame_chain);
9874 
9875   /* We may need to probe the final adjustment if it is larger than the guard
9876      that is assumed by the callee.  */
9877   gcc_assert (known_eq (bytes_below_sp, final_adjust));
9878   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9879 					  !frame_pointer_needed, true);
9880   if (emit_frame_chain && maybe_ne (final_adjust, 0))
9881     emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
9882 }
9883 
9884 /* Return TRUE if we can use a simple_return insn.
9885 
9886    This function checks whether the callee-saved stack is empty, which
9887    means no restore actions are needed.  The pro_and_epilogue pass will use
9888    this to check whether the shrink-wrapping optimization is feasible.  */
9889 
9890 bool
9891 aarch64_use_return_insn_p (void)
9892 {
9893   if (!reload_completed)
9894     return false;
9895 
9896   if (crtl->profile)
9897     return false;
9898 
9899   return known_eq (cfun->machine->frame.frame_size, 0);
9900 }
9901 
9902 /* Generate the epilogue instructions for returning from a function.
9903    This is almost exactly the reverse of the prolog sequence, except
9904    that we need to insert barriers to avoid scheduling loads that read
9905    from a deallocated stack, and we optimize the unwind records by
9906    emitting them all together if possible.  */
9907 void
9908 aarch64_expand_epilogue (bool for_sibcall)
9909 {
9910   aarch64_frame &frame = cfun->machine->frame;
9911   poly_int64 initial_adjust = frame.initial_adjust;
9912   HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9913   poly_int64 final_adjust = frame.final_adjust;
9914   poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9915   poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9916   unsigned reg1 = frame.wb_pop_candidate1;
9917   unsigned reg2 = frame.wb_pop_candidate2;
9918   unsigned int last_gpr = (frame.is_scs_enabled
9919 			   ? R29_REGNUM : R30_REGNUM);
9920   rtx cfi_ops = NULL;
9921   rtx_insn *insn;
9922   /* A stack clash protection prologue may not have left EP0_REGNUM or
9923      EP1_REGNUM in a usable state.  The same is true for allocations
9924      with an SVE component, since we then need both temporary registers
9925      for each allocation.  For stack clash we are in a usable state if
9926      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
9927   HOST_WIDE_INT guard_size
9928     = 1 << param_stack_clash_protection_guard_size;
9929   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9930 
9931   /* We can re-use the registers when:
9932 
9933      (a) the deallocation amount is the same as the corresponding
9934 	 allocation amount (which is false if we combine the initial
9935 	 and SVE callee save allocations in the prologue); and
9936 
9937      (b) the allocation amount doesn't need a probe (which is false
9938 	 if the amount is guard_size - guard_used_by_caller or greater).
9939 
9940      In such situations the register should remain live with the correct
9941      value.  */
9942   bool can_inherit_p = (initial_adjust.is_constant ()
9943 			&& final_adjust.is_constant ()
9944 			&& (!flag_stack_clash_protection
9945 			    || (known_lt (initial_adjust,
9946 					  guard_size - guard_used_by_caller)
9947 				&& known_eq (sve_callee_adjust, 0))));
9948 
9949   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
9950   bool need_barrier_p
9951     = maybe_ne (get_frame_size ()
9952 		+ frame.saved_varargs_size, 0);
9953 
9954   /* Emit a barrier to prevent loads from a deallocated stack.  */
9955   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9956       || cfun->calls_alloca
9957       || crtl->calls_eh_return)
9958     {
9959       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9960       need_barrier_p = false;
9961     }
9962 
9963   /* Restore the stack pointer from the frame pointer if it may not
9964      be the same as the stack pointer.  */
9965   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9966   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9967   if (frame_pointer_needed
9968       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9969     /* If writeback is used when restoring callee-saves, the CFA
9970        is restored on the instruction doing the writeback.  */
9971     aarch64_add_offset (Pmode, stack_pointer_rtx,
9972 			hard_frame_pointer_rtx,
9973 			-bytes_below_hard_fp + final_adjust,
9974 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
9975   else
9976      /* The case where we need to re-use the register here is very rare, so
9977 	avoid the complicated condition and just always emit a move if the
9978 	immediate doesn't fit.  */
9979      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
9980 
9981   /* Restore the vector registers before the predicate registers,
9982      so that we can use P4 as a temporary for big-endian SVE frames.  */
9983   aarch64_restore_callee_saves (final_adjust, V0_REGNUM, V31_REGNUM,
9984 				callee_adjust != 0, &cfi_ops);
9985   aarch64_restore_callee_saves (final_adjust, P0_REGNUM, P15_REGNUM,
9986 				false, &cfi_ops);
9987   if (maybe_ne (sve_callee_adjust, 0))
9988     aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
9989 
9990   /* When shadow call stack is enabled, the scs_pop in the epilogue will
9991      restore x30, we don't need to restore x30 again in the traditional
9992      way.  */
9993   aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9994 				R0_REGNUM, last_gpr,
9995 				callee_adjust != 0, &cfi_ops);
9996 
9997   if (need_barrier_p)
9998     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
9999 
10000   if (callee_adjust != 0)
10001     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10002 
10003   /* If we have no register restore information, the CFA must have been
10004      defined in terms of the stack pointer since the end of the prologue.  */
10005   gcc_assert (cfi_ops || !frame_pointer_needed);
10006 
10007   if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10008     {
10009       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
10010       insn = get_last_insn ();
10011       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10012       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10013       RTX_FRAME_RELATED_P (insn) = 1;
10014       cfi_ops = NULL;
10015     }
10016 
10017   /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10018      restrict the emit_move optimization to leaf functions.  */
10019   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10020 		  (!can_inherit_p || !crtl->is_leaf
10021 		   || df_regs_ever_live_p (EP0_REGNUM)));
10022 
10023   if (cfi_ops)
10024     {
10025       /* Emit delayed restores and reset the CFA to be SP.  */
10026       insn = get_last_insn ();
10027       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10028       REG_NOTES (insn) = cfi_ops;
10029       RTX_FRAME_RELATED_P (insn) = 1;
10030     }
10031 
10032   /* Pop return address from shadow call stack.  */
10033   if (frame.is_scs_enabled)
10034     {
10035       machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10036       rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10037 
10038       insn = emit_insn (gen_scs_pop ());
10039       add_reg_note (insn, REG_CFA_RESTORE, reg);
10040       RTX_FRAME_RELATED_P (insn) = 1;
10041     }
10042 
10043   /* We prefer to emit the combined return/authenticate instruction RETAA,
10044      however there are two cases in which we must instead emit an explicit
10045      authentication instruction.
10046 
10047 	1) Sibcalls don't return in a normal way, so if we're about to call one
10048 	   we must authenticate.
10049 
10050 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10051 	   generating code for !TARGET_ARMV8_3 we can't use it and must
10052 	   explicitly authenticate.
10053     */
10054   if (aarch64_return_address_signing_enabled ()
10055       && (for_sibcall || !TARGET_ARMV8_3))
10056     {
10057       switch (aarch64_ra_sign_key)
10058 	{
10059 	  case AARCH64_KEY_A:
10060 	    insn = emit_insn (gen_autiasp ());
10061 	    break;
10062 	  case AARCH64_KEY_B:
10063 	    insn = emit_insn (gen_autibsp ());
10064 	    break;
10065 	  default:
10066 	    gcc_unreachable ();
10067 	}
10068       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10069       RTX_FRAME_RELATED_P (insn) = 1;
10070     }
10071 
10072   /* Stack adjustment for exception handler.  */
10073   if (crtl->calls_eh_return && !for_sibcall)
10074     {
10075       /* We need to unwind the stack by the offset computed by
10076 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
10077 	 to be SP; letting the CFA move during this adjustment
10078 	 is just as correct as retaining the CFA from the body
10079 	 of the function.  Therefore, do nothing special.  */
10080       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10081     }
10082 
10083   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10084   if (!for_sibcall)
10085     emit_jump_insn (ret_rtx);
10086 }
10087 
10088 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
10089    normally or return to a previous frame after unwinding.
10090 
10091    An EH return uses a single shared return sequence.  The epilogue is
10092    exactly like a normal epilogue except that it has an extra input
10093    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10094    that must be applied after the frame has been destroyed.  An extra label
10095    is inserted before the epilogue which initializes this register to zero,
10096    and this is the entry point for a normal return.
10097 
10098    An actual EH return updates the return address, initializes the stack
10099    adjustment and jumps directly into the epilogue (bypassing the zeroing
10100    of the adjustment).  Since the return address is typically saved on the
10101    stack when a function makes a call, the saved LR must be updated outside
10102    the epilogue.
10103 
10104    This poses problems as the store is generated well before the epilogue,
10105    so the offset of LR is not known yet.  Also optimizations will remove the
10106    store as it appears dead, even after the epilogue is generated (as the
10107    base or offset for loading LR is different in many cases).
10108 
10109    To avoid these problems this implementation forces the frame pointer
10110    in eh_return functions so that the location of LR is fixed and known early.
10111    It also marks the store volatile, so no optimization is permitted to
10112    remove the store.  */
10113 rtx
10114 aarch64_eh_return_handler_rtx (void)
10115 {
10116   rtx tmp = gen_frame_mem (Pmode,
10117     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10118 
10119   /* Mark the store volatile, so no optimization is permitted to remove it.  */
10120   MEM_VOLATILE_P (tmp) = true;
10121   return tmp;
10122 }
10123 
10124 /* Output code to add DELTA to the first argument, and then jump
10125    to FUNCTION.  Used for C++ multiple inheritance.  */
10126 static void
10127 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10128 			 HOST_WIDE_INT delta,
10129 			 HOST_WIDE_INT vcall_offset,
10130 			 tree function)
10131 {
10132   /* The this pointer is always in x0.  Note that this differs from
10133      Arm where the this pointer may be bumped to r1 if r0 is required
10134      to return a pointer to an aggregate.  On AArch64 a result value
10135      pointer will be in x8.  */
10136   int this_regno = R0_REGNUM;
10137   rtx this_rtx, temp0, temp1, addr, funexp;
10138   rtx_insn *insn;
10139   const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10140 
10141   if (aarch64_bti_enabled ())
10142     emit_insn (gen_bti_c());
10143 
10144   reload_completed = 1;
10145   emit_note (NOTE_INSN_PROLOGUE_END);
10146 
10147   this_rtx = gen_rtx_REG (Pmode, this_regno);
10148   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10149   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10150 
10151   if (vcall_offset == 0)
10152     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10153   else
10154     {
10155       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10156 
10157       addr = this_rtx;
10158       if (delta != 0)
10159 	{
10160 	  if (delta >= -256 && delta < 256)
10161 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10162 				       plus_constant (Pmode, this_rtx, delta));
10163 	  else
10164 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10165 				temp1, temp0, false);
10166 	}
10167 
10168       if (Pmode == ptr_mode)
10169 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10170       else
10171 	aarch64_emit_move (temp0,
10172 			   gen_rtx_ZERO_EXTEND (Pmode,
10173 						gen_rtx_MEM (ptr_mode, addr)));
10174 
10175       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10176 	  addr = plus_constant (Pmode, temp0, vcall_offset);
10177       else
10178 	{
10179 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10180 					  Pmode);
10181 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10182 	}
10183 
10184       if (Pmode == ptr_mode)
10185 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10186       else
10187 	aarch64_emit_move (temp1,
10188 			   gen_rtx_SIGN_EXTEND (Pmode,
10189 						gen_rtx_MEM (ptr_mode, addr)));
10190 
10191       emit_insn (gen_add2_insn (this_rtx, temp1));
10192     }
10193 
10194   /* Generate a tail call to the target function.  */
10195   if (!TREE_USED (function))
10196     {
10197       assemble_external (function);
10198       TREE_USED (function) = 1;
10199     }
10200   funexp = XEXP (DECL_RTL (function), 0);
10201   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10202   rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10203   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10204   SIBLING_CALL_P (insn) = 1;
10205 
10206   insn = get_insns ();
10207   shorten_branches (insn);
10208 
10209   assemble_start_function (thunk, fnname);
10210   final_start_function (insn, file, 1);
10211   final (insn, file, 1);
10212   final_end_function ();
10213   assemble_end_function (thunk, fnname);
10214 
10215   /* Stop pretending to be a post-reload pass.  */
10216   reload_completed = 0;
10217 }
10218 
10219 static bool
10220 aarch64_tls_referenced_p (rtx x)
10221 {
10222   if (!TARGET_HAVE_TLS)
10223     return false;
10224   subrtx_iterator::array_type array;
10225   FOR_EACH_SUBRTX (iter, array, x, ALL)
10226     {
10227       const_rtx x = *iter;
10228       if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10229 	return true;
10230       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10231 	 TLS offsets, not real symbol references.  */
10232       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10233 	iter.skip_subrtxes ();
10234     }
10235   return false;
10236 }
10237 
10238 
10239 /* Return true if val can be encoded as a 12-bit unsigned immediate with
10240    a left shift of 0 or 12 bits.  */
10241 bool
10242 aarch64_uimm12_shift (HOST_WIDE_INT val)
10243 {
10244   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
10245 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
10246 	  );
10247 }
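
/* For example, 0xabc and 0xabc000 satisfy aarch64_uimm12_shift, whereas
   0xabc001 and 0x1abc000 do not, since their set bits do not fit entirely
   within bits [11:0] or bits [23:12].  */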
10248 
10249 /* Return the largest value no greater than VAL that can be encoded as a
10250    12-bit unsigned immediate with a left shift of 0 or 12.  */
10251 static HOST_WIDE_INT
10252 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
10253 {
10254   /* Check to see if the value fits in 24 bits, as that is the maximum we can
10255      handle correctly.  */
10256   gcc_assert ((val & 0xffffff) == val);
10257 
10258   if (((val & 0xfff) << 0) == val)
10259     return val;
10260 
10261   return val & (0xfff << 12);
10262 }
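
/* For example, aarch64_clamp_to_uimm12_shift (0x123456) returns 0x123000,
   the closest encodable value that does not exceed the input, while values
   that already fit in 12 bits (such as 0x456) are returned unchanged.  */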
10263 
10264 /* Return true if val is an immediate that can be loaded into a
10265    register by a MOVZ instruction.  */
10266 static bool
10267 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
10268 {
10269   if (GET_MODE_SIZE (mode) > 4)
10270     {
10271       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
10272 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
10273 	return 1;
10274     }
10275   else
10276     {
10277       /* Ignore sign extension.  */
10278       val &= (HOST_WIDE_INT) 0xffffffff;
10279     }
10280   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
10281 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
10282 }
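
/* For example, 0x12340000 is a MOVZ immediate (a single 16-bit chunk at bit
   position 16), while 0x12340001 spans two 16-bit chunks and is rejected.  */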
10283 
10284 /* Test whether:
10285 
10286      X = (X & AND_VAL) | IOR_VAL;
10287 
10288    can be implemented using:
10289 
10290      MOVK X, #(IOR_VAL >> shift), LSL #shift
10291 
10292    Return the shift if so, otherwise return -1.  */
10293 int
10294 aarch64_movk_shift (const wide_int_ref &and_val,
10295 		    const wide_int_ref &ior_val)
10296 {
10297   unsigned int precision = and_val.get_precision ();
10298   unsigned HOST_WIDE_INT mask = 0xffff;
10299   for (unsigned int shift = 0; shift < precision; shift += 16)
10300     {
10301       if (and_val == ~mask && (ior_val & mask) == ior_val)
10302 	return shift;
10303       mask <<= 16;
10304     }
10305   return -1;
10306 }
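
/* For example, with AND_VAL == 0xffffffff0000ffff and IOR_VAL == 0x12340000
   the function returns 16, corresponding to MOVK X, #0x1234, LSL #16.  */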
10307 
10308 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
10309    64-bit (DImode) integer.  */
10310 
10311 static unsigned HOST_WIDE_INT
10312 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
10313 {
10314   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
10315   while (size < 64)
10316     {
10317       val &= (HOST_WIDE_INT_1U << size) - 1;
10318       val |= val << size;
10319       size *= 2;
10320     }
10321   return val;
10322 }
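
/* For example, replicating the HImode value 0x00ff yields
   0x00ff00ff00ff00ff.  */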
10323 
10324 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
10325 
10326 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
10327   {
10328     0x0000000100000001ull,
10329     0x0001000100010001ull,
10330     0x0101010101010101ull,
10331     0x1111111111111111ull,
10332     0x5555555555555555ull,
10333   };
10334 
10335 
10336 /* Return true if val is a valid bitmask immediate.  */
10337 
10338 bool
10339 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
10340 {
10341   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
10342   int bits;
10343 
10344   /* Check for a single sequence of one bits and return quickly if so.
10345      The special cases of all ones and all zeroes return false.  */
10346   val = aarch64_replicate_bitmask_imm (val_in, mode);
10347   tmp = val + (val & -val);
10348 
10349   if (tmp == (tmp & -tmp))
10350     return (val + 1) > 1;
10351 
10352   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
10353   if (mode == SImode)
10354     val = (val << 32) | (val & 0xffffffff);
10355 
10356   /* Invert if the immediate doesn't start with a zero bit - this means we
10357      only need to search for sequences of one bits.  */
10358   if (val & 1)
10359     val = ~val;
10360 
10361   /* Find the first set bit and set tmp to val with the first sequence of one
10362      bits removed.  Return success if there is a single sequence of ones.  */
10363   first_one = val & -val;
10364   tmp = val & (val + first_one);
10365 
10366   if (tmp == 0)
10367     return true;
10368 
10369   /* Find the next set bit and compute the difference in bit position.  */
10370   next_one = tmp & -tmp;
10371   bits = clz_hwi (first_one) - clz_hwi (next_one);
10372   mask = val ^ tmp;
10373 
10374   /* Check the bit position difference is a power of 2, and that the first
10375      sequence of one bits fits within 'bits' bits.  */
10376   if ((mask >> bits) != 0 || bits != (bits & -bits))
10377     return false;
10378 
10379   /* Check the sequence of one bits is repeated 64/bits times.  */
10380   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
10381 }
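
/* For example, 0x0ff00ff00ff00ff0 (a run of eight ones repeated in every
   16-bit chunk) is a valid bitmask immediate, whereas 0x0ff00ff00ff00ff1
   contains more than one run of ones per chunk and is rejected.  */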
10382 
10383 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
10384    Assumed precondition: VAL_IN is not zero.  */
10385 
10386 unsigned HOST_WIDE_INT
10387 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
10388 {
10389   int lowest_bit_set = ctz_hwi (val_in);
10390   int highest_bit_set = floor_log2 (val_in);
10391   gcc_assert (val_in != 0);
10392 
10393   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
10394 	  (HOST_WIDE_INT_1U << lowest_bit_set));
10395 }
10396 
10397 /* Create a constant equal to VAL_IN with all bits outside the range from its
10398    lowest set bit to its highest set bit set to 1.  */
10399 
10400 unsigned HOST_WIDE_INT
10401 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
10402 {
10403   return val_in | ~aarch64_and_split_imm1 (val_in);
10404 }
10405 
10406 /* Return true if VAL_IN is not a valid move or bitmask immediate but can be split into two 'and' bitmask immediates.  */
10407 
10408 bool
10409 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
10410 {
10411   scalar_int_mode int_mode;
10412   if (!is_a <scalar_int_mode> (mode, &int_mode))
10413     return false;
10414 
10415   if (aarch64_bitmask_imm (val_in, int_mode))
10416     return false;
10417 
10418   if (aarch64_move_imm (val_in, int_mode))
10419     return false;
10420 
10421   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
10422 
10423   return aarch64_bitmask_imm (imm2, int_mode);
10424 }
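
/* For example, 0xff00ff is not a valid bitmask or move immediate, but
   aarch64_and_bitmask_imm accepts it for DImode because x & 0xff00ff can be
   implemented as two ANDs with the bitmask immediates 0xffffff and
   0xffffffffffff00ff.  */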
10425 
10426 /* Return true if val is an immediate that can be loaded into a
10427    register in a single instruction.  */
10428 bool
10429 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
10430 {
10431   scalar_int_mode int_mode;
10432   if (!is_a <scalar_int_mode> (mode, &int_mode))
10433     return false;
10434 
10435   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
10436     return 1;
10437   return aarch64_bitmask_imm (val, int_mode);
10438 }
10439 
10440 static bool
10441 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10442 {
10443   if (GET_CODE (x) == HIGH)
10444     return true;
10445 
10446   /* There's no way to calculate VL-based values using relocations.  */
10447   subrtx_iterator::array_type array;
10448   FOR_EACH_SUBRTX (iter, array, x, ALL)
10449     if (GET_CODE (*iter) == CONST_POLY_INT)
10450       return true;
10451 
10452   poly_int64 offset;
10453   rtx base = strip_offset_and_salt (x, &offset);
10454   if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10455     {
10456       /* We checked for POLY_INT_CST offsets above.  */
10457       if (aarch64_classify_symbol (base, offset.to_constant ())
10458 	  != SYMBOL_FORCE_TO_MEM)
10459 	return true;
10460       else
10461 	/* Avoid generating a 64-bit relocation in ILP32; leave it
10462 	   to aarch64_expand_mov_immediate to handle properly.  */
10463 	return mode != ptr_mode;
10464     }
10465 
10466   return aarch64_tls_referenced_p (x);
10467 }
10468 
10469 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10470    The expansion for a table switch is quite expensive due to the number
10471    of instructions, the table lookup and the hard-to-predict indirect jump.
10472    When optimizing for speed at -O3 or higher, use the per-core tuning if
10473    set; otherwise use tables for >= 11 cases as a trade-off between size and
10474    performance.  When optimizing for size, use 8 for the smallest code size.  */
10475 
10476 static unsigned int
10477 aarch64_case_values_threshold (void)
10478 {
10479   /* Use the specified limit for the number of cases before using jump
10480      tables at higher optimization levels.  */
10481   if (optimize > 2
10482       && selected_cpu->tune->max_case_values != 0)
10483     return selected_cpu->tune->max_case_values;
10484   else
10485     return optimize_size ? 8 : 11;
10486 }
10487 
10488 /* Return true if register REGNO is a valid index register.
10489    STRICT_P is true if REG_OK_STRICT is in effect.  */
10490 
10491 bool
10492 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10493 {
10494   if (!HARD_REGISTER_NUM_P (regno))
10495     {
10496       if (!strict_p)
10497 	return true;
10498 
10499       if (!reg_renumber)
10500 	return false;
10501 
10502       regno = reg_renumber[regno];
10503     }
10504   return GP_REGNUM_P (regno);
10505 }
10506 
10507 /* Return true if register REGNO is a valid base register.
10508    STRICT_P is true if REG_OK_STRICT is in effect.  */
10509 
10510 bool
10511 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10512 {
10513   if (!HARD_REGISTER_NUM_P (regno))
10514     {
10515       if (!strict_p)
10516 	return true;
10517 
10518       if (!reg_renumber)
10519 	return false;
10520 
10521       regno = reg_renumber[regno];
10522     }
10523 
10524   /* The fake registers will be eliminated to either the stack or
10525      hard frame pointer, both of which are usually valid base registers.
10526      Reload deals with the cases where the eliminated form isn't valid.  */
10527   return (GP_REGNUM_P (regno)
10528 	  || regno == SP_REGNUM
10529 	  || regno == FRAME_POINTER_REGNUM
10530 	  || regno == ARG_POINTER_REGNUM);
10531 }
10532 
10533 /* Return true if X is a valid base register.
10534    STRICT_P is true if REG_OK_STRICT is in effect.  */
10535 
10536 static bool
10537 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10538 {
10539   if (!strict_p
10540       && SUBREG_P (x)
10541       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10542     x = SUBREG_REG (x);
10543 
10544   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10545 }
10546 
10547 /* Return true if address offset is a valid index.  If it is, fill in INFO
10548    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
10549 
10550 static bool
10551 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10552 			machine_mode mode, bool strict_p)
10553 {
10554   enum aarch64_address_type type;
10555   rtx index;
10556   int shift;
10557 
10558   /* (reg:P) */
10559   if ((REG_P (x) || SUBREG_P (x))
10560       && GET_MODE (x) == Pmode)
10561     {
10562       type = ADDRESS_REG_REG;
10563       index = x;
10564       shift = 0;
10565     }
10566   /* (sign_extend:DI (reg:SI)) */
10567   else if ((GET_CODE (x) == SIGN_EXTEND
10568 	    || GET_CODE (x) == ZERO_EXTEND)
10569 	   && GET_MODE (x) == DImode
10570 	   && GET_MODE (XEXP (x, 0)) == SImode)
10571     {
10572       type = (GET_CODE (x) == SIGN_EXTEND)
10573 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10574       index = XEXP (x, 0);
10575       shift = 0;
10576     }
10577   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10578   else if (GET_CODE (x) == MULT
10579 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10580 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10581 	   && GET_MODE (XEXP (x, 0)) == DImode
10582 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10583 	   && CONST_INT_P (XEXP (x, 1)))
10584     {
10585       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10586 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10587       index = XEXP (XEXP (x, 0), 0);
10588       shift = exact_log2 (INTVAL (XEXP (x, 1)));
10589     }
10590   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10591   else if (GET_CODE (x) == ASHIFT
10592 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10593 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10594 	   && GET_MODE (XEXP (x, 0)) == DImode
10595 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10596 	   && CONST_INT_P (XEXP (x, 1)))
10597     {
10598       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10599 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10600       index = XEXP (XEXP (x, 0), 0);
10601       shift = INTVAL (XEXP (x, 1));
10602     }
10603   /* (and:DI (mult:DI (reg:DI) (const_int scale))
10604      (const_int 0xffffffff<<shift)) */
10605   else if (GET_CODE (x) == AND
10606 	   && GET_MODE (x) == DImode
10607 	   && GET_CODE (XEXP (x, 0)) == MULT
10608 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10609 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10610 	   && CONST_INT_P (XEXP (x, 1)))
10611     {
10612       type = ADDRESS_REG_UXTW;
10613       index = XEXP (XEXP (x, 0), 0);
10614       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10615       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10616 	shift = -1;
10617     }
10618   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10619      (const_int 0xffffffff<<shift)) */
10620   else if (GET_CODE (x) == AND
10621 	   && GET_MODE (x) == DImode
10622 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
10623 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10624 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10625 	   && CONST_INT_P (XEXP (x, 1)))
10626     {
10627       type = ADDRESS_REG_UXTW;
10628       index = XEXP (XEXP (x, 0), 0);
10629       shift = INTVAL (XEXP (XEXP (x, 0), 1));
10630       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10631 	shift = -1;
10632     }
10633   /* (mult:P (reg:P) (const_int scale)) */
10634   else if (GET_CODE (x) == MULT
10635 	   && GET_MODE (x) == Pmode
10636 	   && GET_MODE (XEXP (x, 0)) == Pmode
10637 	   && CONST_INT_P (XEXP (x, 1)))
10638     {
10639       type = ADDRESS_REG_REG;
10640       index = XEXP (x, 0);
10641       shift = exact_log2 (INTVAL (XEXP (x, 1)));
10642     }
10643   /* (ashift:P (reg:P) (const_int shift)) */
10644   else if (GET_CODE (x) == ASHIFT
10645 	   && GET_MODE (x) == Pmode
10646 	   && GET_MODE (XEXP (x, 0)) == Pmode
10647 	   && CONST_INT_P (XEXP (x, 1)))
10648     {
10649       type = ADDRESS_REG_REG;
10650       index = XEXP (x, 0);
10651       shift = INTVAL (XEXP (x, 1));
10652     }
10653   else
10654     return false;
10655 
10656   if (!strict_p
10657       && SUBREG_P (index)
10658       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10659     index = SUBREG_REG (index);
10660 
10661   if (aarch64_sve_data_mode_p (mode))
10662     {
10663       if (type != ADDRESS_REG_REG
10664 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10665 	return false;
10666     }
10667   else
10668     {
10669       if (shift != 0
10670 	  && !(IN_RANGE (shift, 1, 3)
10671 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10672 	return false;
10673     }
10674 
10675   if (REG_P (index)
10676       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10677     {
10678       info->type = type;
10679       info->offset = index;
10680       info->shift = shift;
10681       return true;
10682     }
10683 
10684   return false;
10685 }
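
/* Informally, the index forms accepted above correspond to addressing modes
   such as [Xn, Xm], [Xn, Xm, LSL #s], [Xn, Wm, SXTW #s] and [Xn, Wm, UXTW #s],
   with the shift amount limited to match the access size (or the element
   size for SVE data modes).  */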
10686 
10687 /* Return true if MODE is one of the modes for which we
10688    support LDP/STP operations.  */
10689 
10690 static bool
10691 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10692 {
10693   return mode == SImode || mode == DImode
10694 	 || mode == SFmode || mode == DFmode
10695 	 || (aarch64_vector_mode_supported_p (mode)
10696 	     && (known_eq (GET_MODE_SIZE (mode), 8)
10697 		 || (known_eq (GET_MODE_SIZE (mode), 16)
10698 		    && (aarch64_tune_params.extra_tuning_flags
10699 			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10700 }
10701 
10702 /* Return true if REGNO is a virtual pointer register, or an eliminable
10703    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
10704    include stack_pointer or hard_frame_pointer.  */
10705 static bool
10706 virt_or_elim_regno_p (unsigned regno)
10707 {
10708   return ((regno >= FIRST_VIRTUAL_REGISTER
10709 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10710 	  || regno == FRAME_POINTER_REGNUM
10711 	  || regno == ARG_POINTER_REGNUM);
10712 }
10713 
10714 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10715    If it is, fill in INFO appropriately.  STRICT_P is true if
10716    REG_OK_STRICT is in effect.  */
10717 
10718 bool
10719 aarch64_classify_address (struct aarch64_address_info *info,
10720 			  rtx x, machine_mode mode, bool strict_p,
10721 			  aarch64_addr_query_type type)
10722 {
10723   enum rtx_code code = GET_CODE (x);
10724   rtx op0, op1;
10725   poly_int64 offset;
10726 
10727   HOST_WIDE_INT const_size;
10728 
10729   /* Whether a vector mode is partial doesn't affect address legitimacy.
10730      Partial vectors like VNx8QImode allow the same indexed addressing
10731      mode and MUL VL addressing mode as full vectors like VNx16QImode;
10732      in both cases, MUL VL counts multiples of GET_MODE_SIZE.  */
10733   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10734   vec_flags &= ~VEC_PARTIAL;
10735 
10736   /* On BE, we use load/store pair for all large int mode load/stores.
10737      TI/TFmode may also use a load/store pair.  */
10738   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10739   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10740 			    || type == ADDR_QUERY_LDP_STP_N
10741 			    || mode == TImode
10742 			    || mode == TFmode
10743 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
10744   /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the actual
10745      size of the memory being loaded/stored, and the mode used for the
10746      address calculation is half of that.  */
10747   if (type == ADDR_QUERY_LDP_STP_N)
10748     {
10749       if (known_eq (GET_MODE_SIZE (mode), 16))
10750 	mode = DFmode;
10751       else if (known_eq (GET_MODE_SIZE (mode), 8))
10752 	mode = SFmode;
10753       else
10754 	return false;
10755     }
10756 
10757   bool allow_reg_index_p = (!load_store_pair_p
10758 			    && ((vec_flags == 0
10759 				 && known_lt (GET_MODE_SIZE (mode), 16))
10760 				|| vec_flags == VEC_ADVSIMD
10761 				|| vec_flags & VEC_SVE_DATA));
10762 
10763   /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10764      The latter is not valid for SVE predicates, and that's rejected through
10765      allow_reg_index_p above.  */
10766   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10767       && (code != REG && code != PLUS))
10768     return false;
10769 
10770   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10771      REG addressing.  */
10772   if (advsimd_struct_p
10773       && !BYTES_BIG_ENDIAN
10774       && (code != POST_INC && code != REG))
10775     return false;
10776 
10777   gcc_checking_assert (GET_MODE (x) == VOIDmode
10778 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
10779 
10780   switch (code)
10781     {
10782     case REG:
10783     case SUBREG:
10784       info->type = ADDRESS_REG_IMM;
10785       info->base = x;
10786       info->offset = const0_rtx;
10787       info->const_offset = 0;
10788       return aarch64_base_register_rtx_p (x, strict_p);
10789 
10790     case PLUS:
10791       op0 = XEXP (x, 0);
10792       op1 = XEXP (x, 1);
10793 
10794       if (! strict_p
10795 	  && REG_P (op0)
10796 	  && virt_or_elim_regno_p (REGNO (op0))
10797 	  && poly_int_rtx_p (op1, &offset))
10798 	{
10799 	  info->type = ADDRESS_REG_IMM;
10800 	  info->base = op0;
10801 	  info->offset = op1;
10802 	  info->const_offset = offset;
10803 
10804 	  return true;
10805 	}
10806 
10807       if (maybe_ne (GET_MODE_SIZE (mode), 0)
10808 	  && aarch64_base_register_rtx_p (op0, strict_p)
10809 	  && poly_int_rtx_p (op1, &offset))
10810 	{
10811 	  info->type = ADDRESS_REG_IMM;
10812 	  info->base = op0;
10813 	  info->offset = op1;
10814 	  info->const_offset = offset;
10815 
10816 	  /* TImode and TFmode values are allowed in both pairs of X
10817 	     registers and individual Q registers.  The available
10818 	     address modes are:
10819 	     X,X: 7-bit signed scaled offset
10820 	     Q:   9-bit signed offset
10821 	     We conservatively require an offset representable in either mode.
10822 	     When performing the check for pairs of X registers i.e.  LDP/STP
10823 	     pass down DImode since that is the natural size of the LDP/STP
10824 	     instruction memory accesses.  */
10825 	  if (mode == TImode || mode == TFmode)
10826 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10827 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10828 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
10829 
10830 	  if (mode == V8DImode)
10831 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10832 	            && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10833 
10834 	  /* A 7-bit offset check because OImode will emit an ldp/stp
10835 	     instruction (only big endian will get here).
10836 	     For ldp/stp instructions, the offset is scaled for the size of a
10837 	     single element of the pair.  */
10838 	  if (aarch64_advsimd_partial_struct_mode_p (mode)
10839 	      && known_eq (GET_MODE_SIZE (mode), 16))
10840 	    return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10841 	  if (aarch64_advsimd_full_struct_mode_p (mode)
10842 	      && known_eq (GET_MODE_SIZE (mode), 32))
10843 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10844 
10845 	  /* Three 9/12-bit offset checks because CImode will emit three
10846 	     ldr/str instructions (only big endian will get here).  */
10847 	  if (aarch64_advsimd_partial_struct_mode_p (mode)
10848 	      && known_eq (GET_MODE_SIZE (mode), 24))
10849 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10850 		    && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10851 							       offset + 16)
10852 			|| offset_12bit_unsigned_scaled_p (DImode,
10853 							   offset + 16)));
10854 	  if (aarch64_advsimd_full_struct_mode_p (mode)
10855 	      && known_eq (GET_MODE_SIZE (mode), 48))
10856 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10857 		    && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10858 							       offset + 32)
10859 			|| offset_12bit_unsigned_scaled_p (TImode,
10860 							   offset + 32)));
10861 
10862 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
10863 	     instructions (only big endian will get here).  */
10864 	  if (aarch64_advsimd_partial_struct_mode_p (mode)
10865 	      && known_eq (GET_MODE_SIZE (mode), 32))
10866 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10867 		    && aarch64_offset_7bit_signed_scaled_p (DImode,
10868 							    offset + 16));
10869 	  if (aarch64_advsimd_full_struct_mode_p (mode)
10870 	      && known_eq (GET_MODE_SIZE (mode), 64))
10871 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10872 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
10873 							    offset + 32));
10874 
10875 	  /* Make "m" use the LD1 offset range for SVE data modes, so
10876 	     that pre-RTL optimizers like ivopts will work to that
10877 	     instead of the wider LDR/STR range.  */
10878 	  if (vec_flags == VEC_SVE_DATA)
10879 	    return (type == ADDR_QUERY_M
10880 		    ? offset_4bit_signed_scaled_p (mode, offset)
10881 		    : offset_9bit_signed_scaled_p (mode, offset));
10882 
10883 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10884 	    {
10885 	      poly_int64 end_offset = (offset
10886 				       + GET_MODE_SIZE (mode)
10887 				       - BYTES_PER_SVE_VECTOR);
10888 	      return (type == ADDR_QUERY_M
10889 		      ? offset_4bit_signed_scaled_p (mode, offset)
10890 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10891 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10892 							 end_offset)));
10893 	    }
10894 
10895 	  if (vec_flags == VEC_SVE_PRED)
10896 	    return offset_9bit_signed_scaled_p (mode, offset);
10897 
10898 	  if (load_store_pair_p)
10899 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
10900 		     || known_eq (GET_MODE_SIZE (mode), 8)
10901 		     || known_eq (GET_MODE_SIZE (mode), 16))
10902 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10903 	  else
10904 	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10905 		    || offset_12bit_unsigned_scaled_p (mode, offset));
10906 	}
10907 
10908       if (allow_reg_index_p)
10909 	{
10910 	  /* Look for base + (scaled/extended) index register.  */
10911 	  if (aarch64_base_register_rtx_p (op0, strict_p)
10912 	      && aarch64_classify_index (info, op1, mode, strict_p))
10913 	    {
10914 	      info->base = op0;
10915 	      return true;
10916 	    }
10917 	  if (aarch64_base_register_rtx_p (op1, strict_p)
10918 	      && aarch64_classify_index (info, op0, mode, strict_p))
10919 	    {
10920 	      info->base = op1;
10921 	      return true;
10922 	    }
10923 	}
10924 
10925       return false;
10926 
10927     case POST_INC:
10928     case POST_DEC:
10929     case PRE_INC:
10930     case PRE_DEC:
10931       info->type = ADDRESS_REG_WB;
10932       info->base = XEXP (x, 0);
10933       info->offset = NULL_RTX;
10934       return aarch64_base_register_rtx_p (info->base, strict_p);
10935 
10936     case POST_MODIFY:
10937     case PRE_MODIFY:
10938       info->type = ADDRESS_REG_WB;
10939       info->base = XEXP (x, 0);
10940       if (GET_CODE (XEXP (x, 1)) == PLUS
10941 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10942 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10943 	  && aarch64_base_register_rtx_p (info->base, strict_p))
10944 	{
10945 	  info->offset = XEXP (XEXP (x, 1), 1);
10946 	  info->const_offset = offset;
10947 
10948 	  /* TImode and TFmode values are allowed in both pairs of X
10949 	     registers and individual Q registers.  The available
10950 	     address modes are:
10951 	     X,X: 7-bit signed scaled offset
10952 	     Q:   9-bit signed offset
10953 	     We conservatively require an offset representable in either mode.
10954 	   */
10955 	  if (mode == TImode || mode == TFmode)
10956 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10957 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10958 
10959 	  if (load_store_pair_p)
10960 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
10961 		     || known_eq (GET_MODE_SIZE (mode), 8)
10962 		     || known_eq (GET_MODE_SIZE (mode), 16))
10963 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10964 	  else
10965 	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10966 	}
10967       return false;
10968 
10969     case CONST:
10970     case SYMBOL_REF:
10971     case LABEL_REF:
10972       /* load literal: pc-relative constant pool entry.  Only supported
10973          for SI mode or larger.  */
10974       info->type = ADDRESS_SYMBOLIC;
10975 
10976       if (!load_store_pair_p
10977 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
10978 	  && const_size >= 4)
10979 	{
10980 	  poly_int64 offset;
10981 	  rtx sym = strip_offset_and_salt (x, &offset);
10982 	  return ((LABEL_REF_P (sym)
10983 		   || (SYMBOL_REF_P (sym)
10984 		       && CONSTANT_POOL_ADDRESS_P (sym)
10985 		       && aarch64_pcrelative_literal_loads)));
10986 	}
10987       return false;
10988 
10989     case LO_SUM:
10990       info->type = ADDRESS_LO_SUM;
10991       info->base = XEXP (x, 0);
10992       info->offset = XEXP (x, 1);
10993       if (allow_reg_index_p
10994 	  && aarch64_base_register_rtx_p (info->base, strict_p))
10995 	{
10996 	  poly_int64 offset;
10997 	  HOST_WIDE_INT const_offset;
10998 	  rtx sym = strip_offset_and_salt (info->offset, &offset);
10999 	  if (SYMBOL_REF_P (sym)
11000 	      && offset.is_constant (&const_offset)
11001 	      && (aarch64_classify_symbol (sym, const_offset)
11002 		  == SYMBOL_SMALL_ABSOLUTE))
11003 	    {
11004 	      /* The symbol and offset must be aligned to the access size.  */
11005 	      unsigned int align;
11006 
11007 	      if (CONSTANT_POOL_ADDRESS_P (sym))
11008 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11009 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11010 		{
11011 		  tree exp = SYMBOL_REF_DECL (sym);
11012 		  align = TYPE_ALIGN (TREE_TYPE (exp));
11013 		  align = aarch64_constant_alignment (exp, align);
11014 		}
11015 	      else if (SYMBOL_REF_DECL (sym))
11016 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11017 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11018 		       && SYMBOL_REF_BLOCK (sym) != NULL)
11019 		align = SYMBOL_REF_BLOCK (sym)->alignment;
11020 	      else
11021 		align = BITS_PER_UNIT;
11022 
11023 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
11024 	      if (known_eq (ref_size, 0))
11025 		ref_size = GET_MODE_SIZE (DImode);
11026 
11027 	      return (multiple_p (const_offset, ref_size)
11028 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
11029 	    }
11030 	}
11031       return false;
11032 
11033     default:
11034       return false;
11035     }
11036 }
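
/* A few illustrative classifications for a DImode access (informal):
     (reg:DI x0)                            -> ADDRESS_REG_IMM, i.e. [x0]
     (plus:DI (reg:DI x0) (const_int 16))   -> ADDRESS_REG_IMM, i.e. [x0, 16]
     (post_inc:DI (reg:DI x0))              -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref ...)) -> ADDRESS_LO_SUM, used for the
        low 12 bits of an ADRP pair, subject to the symbol and alignment
        checks above.  */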
11037 
11038 /* Return true if the address X is valid for a PRFM instruction.
11039    STRICT_P is true if we should do strict checking with
11040    aarch64_classify_address.  */
11041 
11042 bool
11043 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11044 {
11045   struct aarch64_address_info addr;
11046 
11047   /* PRFM accepts the same addresses as DImode...  */
11048   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11049   if (!res)
11050     return false;
11051 
11052   /* ... except writeback forms.  */
11053   return addr.type != ADDRESS_REG_WB;
11054 }
11055 
11056 bool
11057 aarch64_symbolic_address_p (rtx x)
11058 {
11059   poly_int64 offset;
11060   x = strip_offset_and_salt (x, &offset);
11061   return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11062 }
11063 
11064 /* Classify the base of symbolic expression X.  */
11065 
11066 enum aarch64_symbol_type
11067 aarch64_classify_symbolic_expression (rtx x)
11068 {
11069   rtx offset;
11070 
11071   split_const (x, &x, &offset);
11072   return aarch64_classify_symbol (x, INTVAL (offset));
11073 }
11074 
11075 
11076 /* Return TRUE if X is a legitimate address for accessing memory in
11077    mode MODE.  */
11078 static bool
11079 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11080 {
11081   struct aarch64_address_info addr;
11082 
11083   return aarch64_classify_address (&addr, x, mode, strict_p);
11084 }
11085 
11086 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11087    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
11088 bool
11089 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11090 			      aarch64_addr_query_type type)
11091 {
11092   struct aarch64_address_info addr;
11093 
11094   return aarch64_classify_address (&addr, x, mode, strict_p, type);
11095 }
11096 
11097 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
11098 
11099 static bool
11100 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11101 					 poly_int64 orig_offset,
11102 					 machine_mode mode)
11103 {
11104   HOST_WIDE_INT size;
11105   if (GET_MODE_SIZE (mode).is_constant (&size))
11106     {
11107       HOST_WIDE_INT const_offset, second_offset;
11108 
11109       /* A general SVE offset is A * VQ + B.  Remove the A component from
11110 	 coefficient 0 in order to get the constant B.  */
11111       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11112 
11113       /* Split an out-of-range address displacement into a base and
11114 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
11115 	 range otherwise to increase opportunities for sharing the base
11116 	 address of different sizes.  Unaligned accesses use the signed
11117 	 9-bit range; TImode/TFmode use the intersection of the signed
11118 	 scaled 7-bit and signed 9-bit offset ranges.  */
11119       if (mode == TImode || mode == TFmode)
11120 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11121       else if ((const_offset & (size - 1)) != 0)
11122 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11123       else
11124 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11125 
11126       if (second_offset == 0 || known_eq (orig_offset, second_offset))
11127 	return false;
11128 
11129       /* Split the offset into second_offset and the rest.  */
11130       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11131       *offset2 = gen_int_mode (second_offset, Pmode);
11132       return true;
11133     }
11134   else
11135     {
11136       /* Get the mode we should use as the basis of the range.  For structure
11137 	 modes this is the mode of one vector.  */
11138       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11139       machine_mode step_mode
11140 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11141 
11142       /* Get the "mul vl" multiplier we'd like to use.  */
11143       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11144       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11145       if (vec_flags & VEC_SVE_DATA)
11146 	/* LDR supports a 9-bit range, but the move patterns for
11147 	   structure modes require all vectors to be in range of the
11148 	   same base.  The simplest way of accommodating that while still
11149 	   promoting reuse of anchor points between different modes is
11150 	   to use an 8-bit range unconditionally.  */
11151 	vnum = ((vnum + 128) & 255) - 128;
11152       else
11153 	/* Predicates are only handled singly, so we might as well use
11154 	   the full range.  */
11155 	vnum = ((vnum + 256) & 511) - 256;
11156       if (vnum == 0)
11157 	return false;
11158 
11159       /* Convert the "mul vl" multiplier into a byte offset.  */
11160       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11161       if (known_eq (second_offset, orig_offset))
11162 	return false;
11163 
11164       /* Split the offset into second_offset and the rest.  */
11165       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11166       *offset2 = gen_int_mode (second_offset, Pmode);
11167       return true;
11168     }
11169 }
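
/* Rough example of the constant-size path above: for a DImode access at
   base + 0x10008, the offset is aligned and the mode size is >= 4, so
   second_offset = 0x10008 & 0x3ffc = 8; the address is rewritten as
   (base + 0x10000) + 8, allowing the anchor base + 0x10000 to be shared
   with neighbouring accesses.  */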
11170 
11171 /* Return the binary representation of floating point constant VALUE in INTVAL.
11172    If the value cannot be converted, return false without setting INTVAL.
11173    The conversion is done in the mode of VALUE.  */
11174 bool
11175 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11176 {
11177 
11178   /* We make a general exception for 0.  */
11179   if (aarch64_float_const_zero_rtx_p (value))
11180     {
11181       *intval = 0;
11182       return true;
11183     }
11184 
11185   scalar_float_mode mode;
11186   if (!CONST_DOUBLE_P (value)
11187       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11188       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11189       /* Only support up to DF mode.  */
11190       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11191     return false;
11192 
11193   unsigned HOST_WIDE_INT ival = 0;
11194 
11195   long res[2];
11196   real_to_target (res,
11197 		  CONST_DOUBLE_REAL_VALUE (value),
11198 		  REAL_MODE_FORMAT (mode));
11199 
11200   if (mode == DFmode)
11201     {
11202       int order = BYTES_BIG_ENDIAN ? 1 : 0;
11203       ival = zext_hwi (res[order], 32);
11204       ival |= (zext_hwi (res[1 - order], 32) << 32);
11205     }
11206   else
11207     ival = zext_hwi (res[0], 32);
11208 
11209   *intval = ival;
11210   return true;
11211 }
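
/* For example (illustrative): DFmode 1.0 yields 0x3ff0000000000000 and
   SFmode 1.0 yields 0x3f800000, i.e. the single-precision image in the
   low 32 bits of *INTVAL.  */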
11212 
11213 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11214    single MOV(+MOVK) followed by an FMOV.  */
11215 bool
11216 aarch64_float_const_rtx_p (rtx x)
11217 {
11218   machine_mode mode = GET_MODE (x);
11219   if (mode == VOIDmode)
11220     return false;
11221 
11222   /* Determine whether it's cheaper to write float constants as
11223      mov/movk pairs over ldr/adrp pairs.  */
11224   unsigned HOST_WIDE_INT ival;
11225 
11226   if (CONST_DOUBLE_P (x)
11227       && SCALAR_FLOAT_MODE_P (mode)
11228       && aarch64_reinterpret_float_as_int (x, &ival))
11229     {
11230       scalar_int_mode imode = (mode == HFmode
11231 			       ? SImode
11232 			       : int_mode_for_mode (mode).require ());
11233       int num_instr = aarch64_internal_mov_immediate
11234 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
11235       return num_instr < 3;
11236     }
11237 
11238   return false;
11239 }
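
/* For instance, DFmode 1.0 has the bit pattern 0x3ff0000000000000, which a
   single MOVZ (0x3ff0 << 48) can materialize, so the constant is considered
   cheaper to build with MOV+FMOV than to load from the constant pool.  */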
11240 
11241 /* Return TRUE if rtx X is the immediate constant 0.0.  */
11242 bool
11243 aarch64_float_const_zero_rtx_p (rtx x)
11244 {
11245   if (GET_MODE (x) == VOIDmode)
11246     return false;
11247 
11248   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11249     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11250   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11251 }
11252 
11253 /* Return TRUE if rtx X is an immediate constant that fits in a single
11254    MOVI immediate operation.  */
11255 bool
11256 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11257 {
11258   if (!TARGET_SIMD)
11259      return false;
11260 
11261   machine_mode vmode;
11262   scalar_int_mode imode;
11263   unsigned HOST_WIDE_INT ival;
11264 
11265   if (CONST_DOUBLE_P (x)
11266       && SCALAR_FLOAT_MODE_P (mode))
11267     {
11268       if (!aarch64_reinterpret_float_as_int (x, &ival))
11269 	return false;
11270 
11271       /* We make a general exception for 0.  */
11272       if (aarch64_float_const_zero_rtx_p (x))
11273 	return true;
11274 
11275       imode = int_mode_for_mode (mode).require ();
11276     }
11277   else if (CONST_INT_P (x)
11278 	   && is_a <scalar_int_mode> (mode, &imode))
11279     ival = INTVAL (x);
11280   else
11281     return false;
11282 
11283   /* Use a 64-bit mode for everything except DI/DF mode, where we use
11284      a 128-bit vector mode.  */
11285   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11286 
11287   vmode = aarch64_simd_container_mode (imode, width);
11288   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11289 
11290   return aarch64_simd_valid_immediate (v_op, NULL);
11291 }
11292 
11293 
11294 /* Return the fixed registers used for condition codes.  */
11295 
11296 static bool
11297 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11298 {
11299   *p1 = CC_REGNUM;
11300   *p2 = INVALID_REGNUM;
11301   return true;
11302 }
11303 
11304 /* This function is used by the call expanders of the machine description.
11305    RESULT is the register in which the result is returned.  It's NULL for
11306    "call" and "sibcall".
11307    MEM is the location of the function call.
11308    CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11309    SIBCALL indicates whether this function call is a normal call or a sibling
11310    call.  A different pattern is generated accordingly.  */
11311 
11312 void
11313 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11314 {
11315   rtx call, callee, tmp;
11316   rtvec vec;
11317   machine_mode mode;
11318 
11319   gcc_assert (MEM_P (mem));
11320   callee = XEXP (mem, 0);
11321   mode = GET_MODE (callee);
11322   gcc_assert (mode == Pmode);
11323 
11324   /* Decide if we should generate indirect calls by loading the
11325      address of the callee into a register before performing
11326      the branch-and-link.  */
11327   if (SYMBOL_REF_P (callee)
11328       ? (aarch64_is_long_call_p (callee)
11329 	 || aarch64_is_noplt_call_p (callee))
11330       : !REG_P (callee))
11331     XEXP (mem, 0) = force_reg (mode, callee);
11332 
11333   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11334 
11335   if (result != NULL_RTX)
11336     call = gen_rtx_SET (result, call);
11337 
11338   if (sibcall)
11339     tmp = ret_rtx;
11340   else
11341     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11342 
11343   gcc_assert (CONST_INT_P (callee_abi));
11344   callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11345 			       UNSPEC_CALLEE_ABI);
11346 
11347   vec = gen_rtvec (3, call, callee_abi, tmp);
11348   call = gen_rtx_PARALLEL (VOIDmode, vec);
11349 
11350   aarch64_emit_call_insn (call);
11351 }
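
/* The pattern built above is roughly of the form (details elided):
     (parallel [(call (mem ...) (const_int 0))        ; or (set (reg) (call ...))
                (unspec [(const_int abi)] UNSPEC_CALLEE_ABI)
                (clobber (reg:DI LR_REGNUM))])        ; or (return) for sibcalls
   which aarch64_emit_call_insn then emits, adding IP0/IP1 clobbers to
   CALL_INSN_FUNCTION_USAGE.  */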
11352 
11353 /* Emit call insn with PAT and do aarch64-specific handling.  */
11354 
11355 void
11356 aarch64_emit_call_insn (rtx pat)
11357 {
11358   rtx insn = emit_call_insn (pat);
11359 
11360   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11361   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11362   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11363 }
11364 
11365 machine_mode
11366 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11367 {
11368   machine_mode mode_x = GET_MODE (x);
11369   rtx_code code_x = GET_CODE (x);
11370 
11371   /* All floating point compares return CCFP if it is an equality
11372      comparison, and CCFPE otherwise.  */
11373   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11374     {
11375       switch (code)
11376 	{
11377 	case EQ:
11378 	case NE:
11379 	case UNORDERED:
11380 	case ORDERED:
11381 	case UNLT:
11382 	case UNLE:
11383 	case UNGT:
11384 	case UNGE:
11385 	case UNEQ:
11386 	  return CCFPmode;
11387 
11388 	case LT:
11389 	case LE:
11390 	case GT:
11391 	case GE:
11392 	case LTGT:
11393 	  return CCFPEmode;
11394 
11395 	default:
11396 	  gcc_unreachable ();
11397 	}
11398     }
11399 
11400   /* Equality comparisons of short modes against zero can be performed
11401      using the TST instruction with the appropriate bitmask.  */
11402   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11403       && (code == EQ || code == NE)
11404       && (mode_x == HImode || mode_x == QImode))
11405     return CC_NZmode;
11406 
11407   /* Similarly, comparisons of zero_extends from shorter modes can
11408      be performed using an ANDS with an immediate mask.  */
11409   if (y == const0_rtx && code_x == ZERO_EXTEND
11410       && (mode_x == SImode || mode_x == DImode)
11411       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11412       && (code == EQ || code == NE))
11413     return CC_NZmode;
11414 
11415   if ((mode_x == SImode || mode_x == DImode)
11416       && y == const0_rtx
11417       && (code == EQ || code == NE || code == LT || code == GE)
11418       && (code_x == PLUS || code_x == MINUS || code_x == AND
11419 	  || code_x == NEG
11420 	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11421 	      && CONST_INT_P (XEXP (x, 2)))))
11422     return CC_NZmode;
11423 
11424   /* A compare with a shifted operand.  Because of canonicalization,
11425      the comparison will have to be swapped when we emit the assembly
11426      code.  */
11427   if ((mode_x == SImode || mode_x == DImode)
11428       && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11429       && (code_x == ASHIFT || code_x == ASHIFTRT
11430 	  || code_x == LSHIFTRT
11431 	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11432     return CC_SWPmode;
11433 
11434   /* Similarly for a negated operand, but we can only do this for
11435      equalities.  */
11436   if ((mode_x == SImode || mode_x == DImode)
11437       && (REG_P (y) || SUBREG_P (y))
11438       && (code == EQ || code == NE)
11439       && code_x == NEG)
11440     return CC_Zmode;
11441 
11442   /* A test for unsigned overflow from an addition.  */
11443   if ((mode_x == DImode || mode_x == TImode)
11444       && (code == LTU || code == GEU)
11445       && code_x == PLUS
11446       && rtx_equal_p (XEXP (x, 0), y))
11447     return CC_Cmode;
11448 
11449   /* A test for unsigned overflow from an add with carry.  */
11450   if ((mode_x == DImode || mode_x == TImode)
11451       && (code == LTU || code == GEU)
11452       && code_x == PLUS
11453       && CONST_SCALAR_INT_P (y)
11454       && (rtx_mode_t (y, mode_x)
11455 	  == (wi::shwi (1, mode_x)
11456 	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11457     return CC_ADCmode;
11458 
11459   /* A test for signed overflow.  */
11460   if ((mode_x == DImode || mode_x == TImode)
11461       && code == NE
11462       && code_x == PLUS
11463       && GET_CODE (y) == SIGN_EXTEND)
11464     return CC_Vmode;
11465 
11466   /* For everything else, return CCmode.  */
11467   return CCmode;
11468 }
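
/* A couple of illustrative cases: (compare (plus:DI x y) (const_int 0))
   under EQ/NE/LT/GE selects CC_NZmode, so the comparison can be folded into
   an ADDS; a compare whose first operand is a shift or extend selects
   CC_SWPmode, since the operands must be swapped when the CMP is output.  */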
11469 
11470 static int
11471 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11472 
11473 int
11474 aarch64_get_condition_code (rtx x)
11475 {
11476   machine_mode mode = GET_MODE (XEXP (x, 0));
11477   enum rtx_code comp_code = GET_CODE (x);
11478 
11479   if (GET_MODE_CLASS (mode) != MODE_CC)
11480     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11481   return aarch64_get_condition_code_1 (mode, comp_code);
11482 }
11483 
11484 static int
11485 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11486 {
11487   switch (mode)
11488     {
11489     case E_CCFPmode:
11490     case E_CCFPEmode:
11491       switch (comp_code)
11492 	{
11493 	case GE: return AARCH64_GE;
11494 	case GT: return AARCH64_GT;
11495 	case LE: return AARCH64_LS;
11496 	case LT: return AARCH64_MI;
11497 	case NE: return AARCH64_NE;
11498 	case EQ: return AARCH64_EQ;
11499 	case ORDERED: return AARCH64_VC;
11500 	case UNORDERED: return AARCH64_VS;
11501 	case UNLT: return AARCH64_LT;
11502 	case UNLE: return AARCH64_LE;
11503 	case UNGT: return AARCH64_HI;
11504 	case UNGE: return AARCH64_PL;
11505 	default: return -1;
11506 	}
11507       break;
11508 
11509     case E_CCmode:
11510       switch (comp_code)
11511 	{
11512 	case NE: return AARCH64_NE;
11513 	case EQ: return AARCH64_EQ;
11514 	case GE: return AARCH64_GE;
11515 	case GT: return AARCH64_GT;
11516 	case LE: return AARCH64_LE;
11517 	case LT: return AARCH64_LT;
11518 	case GEU: return AARCH64_CS;
11519 	case GTU: return AARCH64_HI;
11520 	case LEU: return AARCH64_LS;
11521 	case LTU: return AARCH64_CC;
11522 	default: return -1;
11523 	}
11524       break;
11525 
11526     case E_CC_SWPmode:
11527       switch (comp_code)
11528 	{
11529 	case NE: return AARCH64_NE;
11530 	case EQ: return AARCH64_EQ;
11531 	case GE: return AARCH64_LE;
11532 	case GT: return AARCH64_LT;
11533 	case LE: return AARCH64_GE;
11534 	case LT: return AARCH64_GT;
11535 	case GEU: return AARCH64_LS;
11536 	case GTU: return AARCH64_CC;
11537 	case LEU: return AARCH64_CS;
11538 	case LTU: return AARCH64_HI;
11539 	default: return -1;
11540 	}
11541       break;
11542 
11543     case E_CC_NZCmode:
11544       switch (comp_code)
11545 	{
11546 	case NE: return AARCH64_NE; /* = any */
11547 	case EQ: return AARCH64_EQ; /* = none */
11548 	case GE: return AARCH64_PL; /* = nfrst */
11549 	case LT: return AARCH64_MI; /* = first */
11550 	case GEU: return AARCH64_CS; /* = nlast */
11551 	case GTU: return AARCH64_HI; /* = pmore */
11552 	case LEU: return AARCH64_LS; /* = plast */
11553 	case LTU: return AARCH64_CC; /* = last */
11554 	default: return -1;
11555 	}
11556       break;
11557 
11558     case E_CC_NZmode:
11559       switch (comp_code)
11560 	{
11561 	case NE: return AARCH64_NE;
11562 	case EQ: return AARCH64_EQ;
11563 	case GE: return AARCH64_PL;
11564 	case LT: return AARCH64_MI;
11565 	default: return -1;
11566 	}
11567       break;
11568 
11569     case E_CC_Zmode:
11570       switch (comp_code)
11571 	{
11572 	case NE: return AARCH64_NE;
11573 	case EQ: return AARCH64_EQ;
11574 	default: return -1;
11575 	}
11576       break;
11577 
11578     case E_CC_Cmode:
11579       switch (comp_code)
11580 	{
11581 	case LTU: return AARCH64_CS;
11582 	case GEU: return AARCH64_CC;
11583 	default: return -1;
11584 	}
11585       break;
11586 
11587     case E_CC_ADCmode:
11588       switch (comp_code)
11589 	{
11590 	case GEU: return AARCH64_CS;
11591 	case LTU: return AARCH64_CC;
11592 	default: return -1;
11593 	}
11594       break;
11595 
11596     case E_CC_Vmode:
11597       switch (comp_code)
11598 	{
11599 	case NE: return AARCH64_VS;
11600 	case EQ: return AARCH64_VC;
11601 	default: return -1;
11602 	}
11603       break;
11604 
11605     default:
11606       return -1;
11607     }
11608 
11609   return -1;
11610 }
11611 
11612 bool
11613 aarch64_const_vec_all_same_in_range_p (rtx x,
11614 				       HOST_WIDE_INT minval,
11615 				       HOST_WIDE_INT maxval)
11616 {
11617   rtx elt;
11618   return (const_vec_duplicate_p (x, &elt)
11619 	  && CONST_INT_P (elt)
11620 	  && IN_RANGE (INTVAL (elt), minval, maxval));
11621 }
11622 
11623 bool
11624 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11625 {
11626   return aarch64_const_vec_all_same_in_range_p (x, val, val);
11627 }
11628 
11629 /* Return true if VEC is a constant in which every element is in the range
11630    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
11631 
11632 static bool
11633 aarch64_const_vec_all_in_range_p (rtx vec,
11634 				  HOST_WIDE_INT minval,
11635 				  HOST_WIDE_INT maxval)
11636 {
11637   if (!CONST_VECTOR_P (vec)
11638       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11639     return false;
11640 
11641   int nunits;
11642   if (!CONST_VECTOR_STEPPED_P (vec))
11643     nunits = const_vector_encoded_nelts (vec);
11644   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11645     return false;
11646 
11647   for (int i = 0; i < nunits; i++)
11648     {
11649       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11650       if (!CONST_INT_P (vec_elem)
11651 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11652 	return false;
11653     }
11654   return true;
11655 }
11656 
11657 /* N Z C V.  */
11658 #define AARCH64_CC_V 1
11659 #define AARCH64_CC_C (1 << 1)
11660 #define AARCH64_CC_Z (1 << 2)
11661 #define AARCH64_CC_N (1 << 3)
11662 
11663 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
11664 static const int aarch64_nzcv_codes[] =
11665 {
11666   0,		/* EQ, Z == 1.  */
11667   AARCH64_CC_Z,	/* NE, Z == 0.  */
11668   0,		/* CS, C == 1.  */
11669   AARCH64_CC_C,	/* CC, C == 0.  */
11670   0,		/* MI, N == 1.  */
11671   AARCH64_CC_N, /* PL, N == 0.  */
11672   0,		/* VS, V == 1.  */
11673   AARCH64_CC_V, /* VC, V == 0.  */
11674   0,		/* HI, C == 1 && Z == 0.  */
11675   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
11676   AARCH64_CC_V,	/* GE, N == V.  */
11677   0,		/* LT, N != V.  */
11678   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
11679   0,		/* LE, !(Z == 0 && N == V).  */
11680   0,		/* AL, Any.  */
11681   0		/* NV, Any.  */
11682 };
11683 
11684 /* Print floating-point vector immediate operand X to F, negating it
11685    first if NEGATE is true.  Return true on success, false if it isn't
11686    a constant we can handle.  */
11687 
11688 static bool
11689 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11690 {
11691   rtx elt;
11692 
11693   if (!const_vec_duplicate_p (x, &elt))
11694     return false;
11695 
11696   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11697   if (negate)
11698     r = real_value_negate (&r);
11699 
11700   /* Handle the SVE single-bit immediates specially, since they have a
11701      fixed form in the assembly syntax.  */
11702   if (real_equal (&r, &dconst0))
11703     asm_fprintf (f, "0.0");
11704   else if (real_equal (&r, &dconst2))
11705     asm_fprintf (f, "2.0");
11706   else if (real_equal (&r, &dconst1))
11707     asm_fprintf (f, "1.0");
11708   else if (real_equal (&r, &dconsthalf))
11709     asm_fprintf (f, "0.5");
11710   else
11711     {
11712       const int buf_size = 20;
11713       char float_buf[buf_size] = {'\0'};
11714       real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11715 				1, GET_MODE (elt));
11716       asm_fprintf (f, "%s", float_buf);
11717     }
11718 
11719   return true;
11720 }
11721 
11722 /* Return the equivalent letter for size.  */
11723 static char
11724 sizetochar (int size)
11725 {
11726   switch (size)
11727     {
11728     case 64: return 'd';
11729     case 32: return 's';
11730     case 16: return 'h';
11731     case 8 : return 'b';
11732     default: gcc_unreachable ();
11733     }
11734 }
11735 
11736 /* Print operand X to file F in a target specific manner according to CODE.
11737    The acceptable formatting commands given by CODE are:
11738      'c':		An integer or symbol address without a preceding #
11739 			sign.
11740      'C':		Take the duplicated element in a vector constant
11741 			and print it in hex.
11742      'D':		Take the duplicated element in a vector constant
11743 			and print it as an unsigned integer, in decimal.
11744      'e':		Print the sign/zero-extend size as a character 8->b,
11745 			16->h, 32->w.  Can also be used for masks:
11746 			0xff->b, 0xffff->h, 0xffffffff->w.
11747      'I':		If the operand is a duplicated vector constant,
11748 			replace it with the duplicated scalar.  If the
11749 			operand is then a floating-point constant, replace
11750 			it with the integer bit representation.  Print the
11751 			transformed constant as a signed decimal number.
11752      'p':		Prints N such that 2^N == X (X must be power of 2 and
11753 			const int).
11754      'P':		Print the number of non-zero bits in X (a const_int).
11755      'H':		Print the higher numbered register of a pair (TImode)
11756 			of regs.
11757      'm':		Print a condition (eq, ne, etc).
11758      'M':		Same as 'm', but invert condition.
11759      'N':		Take the duplicated element in a vector constant
11760 			and print the negative of it in decimal.
11761      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
11762      'S/T/U/V':		Print a FP/SIMD register name for a register list.
11763 			The register printed is the FP/SIMD register name
11764 			of X + 0/1/2/3 for S/T/U/V.
11765      'R':		Print a scalar Integer/FP/SIMD register name + 1.
11766      'X':		Print bottom 16 bits of integer constant in hex.
11767      'w/x':		Print a general register name or the zero register
11768 			(32-bit or 64-bit).
11769      '0':		Print a normal operand; if it's a general register,
11770 			then we assume DImode.
11771      'k':		Print NZCV for conditional compare instructions.
11772      'A':		Output address constant representing the first
11773 			argument of X, specifying a relocation offset
11774 			if appropriate.
11775      'L':		Output constant address specified by X
11776 			with a relocation offset if appropriate.
11777      'G':		Prints address of X, specifying a PC relative
11778 			relocation mode if appropriate.
11779      'y':		Output address of LDP or STP - this is used for
11780 			some LDP/STPs which don't use a PARALLEL in their
11781 			pattern (so the mode needs to be adjusted).
11782      'z':		Output address of a typical LDP or STP.  */
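
/* For example (illustrative), an output template such as
   "add\t%w0, %w1, %w2" prints its operands as 32-bit general registers,
   while "%d1" would print operand 1 as a scalar D register and "%m2" would
   print the condition of a comparison operand.  */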
11783 
11784 static void
11785 aarch64_print_operand (FILE *f, rtx x, int code)
11786 {
11787   rtx elt;
11788   switch (code)
11789     {
11790     case 'c':
11791       if (CONST_INT_P (x))
11792 	fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11793       else
11794 	{
11795 	  poly_int64 offset;
11796 	  rtx base = strip_offset_and_salt (x, &offset);
11797 	  if (SYMBOL_REF_P (base))
11798 	    output_addr_const (f, x);
11799 	  else
11800 	    output_operand_lossage ("unsupported operand for code '%c'", code);
11801 	}
11802       break;
11803 
11804     case 'e':
11805       {
11806 	x = unwrap_const_vec_duplicate (x);
11807 	if (!CONST_INT_P (x))
11808 	  {
11809 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11810 	    return;
11811 	  }
11812 
11813 	HOST_WIDE_INT val = INTVAL (x);
11814 	if ((val & ~7) == 8 || val == 0xff)
11815 	  fputc ('b', f);
11816 	else if ((val & ~7) == 16 || val == 0xffff)
11817 	  fputc ('h', f);
11818 	else if ((val & ~7) == 32 || val == 0xffffffff)
11819 	  fputc ('w', f);
11820 	else
11821 	  {
11822 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11823 	    return;
11824 	  }
11825       }
11826       break;
11827 
11828     case 'p':
11829       {
11830 	int n;
11831 
11832 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11833 	  {
11834 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11835 	    return;
11836 	  }
11837 
11838 	asm_fprintf (f, "%d", n);
11839       }
11840       break;
11841 
11842     case 'P':
11843       if (!CONST_INT_P (x))
11844 	{
11845 	  output_operand_lossage ("invalid operand for '%%%c'", code);
11846 	  return;
11847 	}
11848 
11849       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11850       break;
11851 
11852     case 'H':
11853       if (x == const0_rtx)
11854 	{
11855 	  asm_fprintf (f, "xzr");
11856 	  break;
11857 	}
11858 
11859       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11860 	{
11861 	  output_operand_lossage ("invalid operand for '%%%c'", code);
11862 	  return;
11863 	}
11864 
11865       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11866       break;
11867 
11868     case 'I':
11869       {
11870 	x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11871 	if (CONST_INT_P (x))
11872 	  asm_fprintf (f, "%wd", INTVAL (x));
11873 	else
11874 	  {
11875 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11876 	    return;
11877 	  }
11878 	break;
11879       }
11880 
11881     case 'M':
11882     case 'm':
11883       {
11884         int cond_code;
11885 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
11886 	if (x == const_true_rtx)
11887 	  {
11888 	    if (code == 'M')
11889 	      fputs ("nv", f);
11890 	    return;
11891 	  }
11892 
11893         if (!COMPARISON_P (x))
11894 	  {
11895 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11896 	    return;
11897 	  }
11898 
11899         cond_code = aarch64_get_condition_code (x);
11900         gcc_assert (cond_code >= 0);
11901 	if (code == 'M')
11902 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11903 	if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11904 	  fputs (aarch64_sve_condition_codes[cond_code], f);
11905 	else
11906 	  fputs (aarch64_condition_codes[cond_code], f);
11907       }
11908       break;
11909 
11910     case 'N':
11911       if (!const_vec_duplicate_p (x, &elt))
11912 	{
11913 	  output_operand_lossage ("invalid vector constant");
11914 	  return;
11915 	}
11916 
11917       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
11918 	asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
11919       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11920 	       && aarch64_print_vector_float_operand (f, x, true))
11921 	;
11922       else
11923 	{
11924 	  output_operand_lossage ("invalid vector constant");
11925 	  return;
11926 	}
11927       break;
11928 
11929     case 'b':
11930     case 'h':
11931     case 's':
11932     case 'd':
11933     case 'q':
11934       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11935 	{
11936 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11937 	  return;
11938 	}
11939       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
11940       break;
11941 
11942     case 'S':
11943     case 'T':
11944     case 'U':
11945     case 'V':
11946       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
11947 	{
11948 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
11949 	  return;
11950 	}
11951       asm_fprintf (f, "%c%d",
11952 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
11953 		   REGNO (x) - V0_REGNUM + (code - 'S'));
11954       break;
11955 
11956     case 'R':
11957       if (REG_P (x) && FP_REGNUM_P (REGNO (x))
11958 	  && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
11959 	asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
11960       else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
11961 	asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
11962       else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
11963 	asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
11964       else
11965 	output_operand_lossage ("incompatible register operand for '%%%c'",
11966 				code);
11967       break;
11968 
11969     case 'X':
11970       if (!CONST_INT_P (x))
11971 	{
11972 	  output_operand_lossage ("invalid operand for '%%%c'", code);
11973 	  return;
11974 	}
11975       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
11976       break;
11977 
11978     case 'C':
11979       {
11980 	/* Print a replicated constant in hex.  */
11981 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11982 	  {
11983 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11984 	    return;
11985 	  }
11986 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
11987 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
11988       }
11989       break;
11990 
11991     case 'D':
11992       {
11993 	/* Print a replicated constant in decimal, treating it as
11994 	   unsigned.  */
11995 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
11996 	  {
11997 	    output_operand_lossage ("invalid operand for '%%%c'", code);
11998 	    return;
11999 	  }
12000 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12001 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12002       }
12003       break;
12004 
12005     case 'w':
12006     case 'x':
12007       if (x == const0_rtx
12008 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
12009 	{
12010 	  asm_fprintf (f, "%czr", code);
12011 	  break;
12012 	}
12013 
12014       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12015 	{
12016 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12017 	  break;
12018 	}
12019 
12020       if (REG_P (x) && REGNO (x) == SP_REGNUM)
12021 	{
12022 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12023 	  break;
12024 	}
12025 
12026       /* Fall through */
12027 
12028     case 0:
12029       if (x == NULL)
12030 	{
12031 	  output_operand_lossage ("missing operand");
12032 	  return;
12033 	}
12034 
12035       switch (GET_CODE (x))
12036 	{
12037 	case REG:
12038 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
12039 	    {
12040 	      if (REG_NREGS (x) == 1)
12041 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12042 	      else
12043 		{
12044 		  char suffix
12045 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12046 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
12047 			       REGNO (x) - V0_REGNUM, suffix,
12048 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
12049 		}
12050 	    }
12051 	  else
12052 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12053 	  break;
12054 
12055 	case MEM:
12056 	  output_address (GET_MODE (x), XEXP (x, 0));
12057 	  break;
12058 
12059 	case LABEL_REF:
12060 	case SYMBOL_REF:
12061 	  output_addr_const (asm_out_file, x);
12062 	  break;
12063 
12064 	case CONST_INT:
12065 	  asm_fprintf (f, "%wd", INTVAL (x));
12066 	  break;
12067 
12068 	case CONST:
12069 	  if (!VECTOR_MODE_P (GET_MODE (x)))
12070 	    {
12071 	      output_addr_const (asm_out_file, x);
12072 	      break;
12073 	    }
12074 	  /* fall through */
12075 
12076 	case CONST_VECTOR:
12077 	  if (!const_vec_duplicate_p (x, &elt))
12078 	    {
12079 	      output_operand_lossage ("invalid vector constant");
12080 	      return;
12081 	    }
12082 
12083 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12084 	    asm_fprintf (f, "%wd", INTVAL (elt));
12085 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12086 		   && aarch64_print_vector_float_operand (f, x, false))
12087 	    ;
12088 	  else
12089 	    {
12090 	      output_operand_lossage ("invalid vector constant");
12091 	      return;
12092 	    }
12093 	  break;
12094 
12095 	case CONST_DOUBLE:
12096 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12097 	     be getting CONST_DOUBLEs holding integers.  */
12098 	  gcc_assert (GET_MODE (x) != VOIDmode);
12099 	  if (aarch64_float_const_zero_rtx_p (x))
12100 	    {
12101 	      fputc ('0', f);
12102 	      break;
12103 	    }
12104 	  else if (aarch64_float_const_representable_p (x))
12105 	    {
12106 #define buf_size 20
12107 	      char float_buf[buf_size] = {'\0'};
12108 	      real_to_decimal_for_mode (float_buf,
12109 					CONST_DOUBLE_REAL_VALUE (x),
12110 					buf_size, buf_size,
12111 					1, GET_MODE (x));
12112 	      asm_fprintf (asm_out_file, "%s", float_buf);
12113 	      break;
12114 #undef buf_size
12115 	    }
12116 	  output_operand_lossage ("invalid constant");
12117 	  return;
12118 	default:
12119 	  output_operand_lossage ("invalid operand");
12120 	  return;
12121 	}
12122       break;
12123 
12124     case 'A':
12125       if (GET_CODE (x) == HIGH)
12126 	x = XEXP (x, 0);
12127 
12128       switch (aarch64_classify_symbolic_expression (x))
12129 	{
12130 	case SYMBOL_SMALL_GOT_4G:
12131 	  asm_fprintf (asm_out_file, ":got:");
12132 	  break;
12133 
12134 	case SYMBOL_SMALL_TLSGD:
12135 	  asm_fprintf (asm_out_file, ":tlsgd:");
12136 	  break;
12137 
12138 	case SYMBOL_SMALL_TLSDESC:
12139 	  asm_fprintf (asm_out_file, ":tlsdesc:");
12140 	  break;
12141 
12142 	case SYMBOL_SMALL_TLSIE:
12143 	  asm_fprintf (asm_out_file, ":gottprel:");
12144 	  break;
12145 
12146 	case SYMBOL_TLSLE24:
12147 	  asm_fprintf (asm_out_file, ":tprel:");
12148 	  break;
12149 
12150 	case SYMBOL_TINY_GOT:
12151 	  gcc_unreachable ();
12152 	  break;
12153 
12154 	default:
12155 	  break;
12156 	}
12157       output_addr_const (asm_out_file, x);
12158       break;
12159 
12160     case 'L':
12161       switch (aarch64_classify_symbolic_expression (x))
12162 	{
12163 	case SYMBOL_SMALL_GOT_4G:
12164 	  asm_fprintf (asm_out_file, ":got_lo12:");
12165 	  break;
12166 
12167 	case SYMBOL_SMALL_TLSGD:
12168 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12169 	  break;
12170 
12171 	case SYMBOL_SMALL_TLSDESC:
12172 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12173 	  break;
12174 
12175 	case SYMBOL_SMALL_TLSIE:
12176 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
12177 	  break;
12178 
12179 	case SYMBOL_TLSLE12:
12180 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
12181 	  break;
12182 
12183 	case SYMBOL_TLSLE24:
12184 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12185 	  break;
12186 
12187 	case SYMBOL_TINY_GOT:
12188 	  asm_fprintf (asm_out_file, ":got:");
12189 	  break;
12190 
12191 	case SYMBOL_TINY_TLSIE:
12192 	  asm_fprintf (asm_out_file, ":gottprel:");
12193 	  break;
12194 
12195 	default:
12196 	  break;
12197 	}
12198       output_addr_const (asm_out_file, x);
12199       break;
12200 
12201     case 'G':
12202       switch (aarch64_classify_symbolic_expression (x))
12203 	{
12204 	case SYMBOL_TLSLE24:
12205 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
12206 	  break;
12207 	default:
12208 	  break;
12209 	}
12210       output_addr_const (asm_out_file, x);
12211       break;
12212 
12213     case 'k':
12214       {
12215 	HOST_WIDE_INT cond_code;
12216 
12217 	if (!CONST_INT_P (x))
12218 	  {
12219 	    output_operand_lossage ("invalid operand for '%%%c'", code);
12220 	    return;
12221 	  }
12222 
12223 	cond_code = INTVAL (x);
12224 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12225 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12226       }
12227       break;
12228 
12229     case 'y':
12230     case 'z':
12231       {
12232 	machine_mode mode = GET_MODE (x);
12233 
12234 	if (!MEM_P (x)
12235 	    || (code == 'y'
12236 		&& maybe_ne (GET_MODE_SIZE (mode), 8)
12237 		&& maybe_ne (GET_MODE_SIZE (mode), 16)))
12238 	  {
12239 	    output_operand_lossage ("invalid operand for '%%%c'", code);
12240 	    return;
12241 	  }
12242 
12243 	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12244 					    code == 'y'
12245 					    ? ADDR_QUERY_LDP_STP_N
12246 					    : ADDR_QUERY_LDP_STP))
12247 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
12248       }
12249       break;
12250 
12251     default:
12252       output_operand_lossage ("invalid operand prefix '%%%c'", code);
12253       return;
12254     }
12255 }
12256 
12257 /* Print address 'x' of a memory access with mode 'mode'.
12258    'op' is the context required by aarch64_classify_address.  It can either be
12259    MEM for a normal memory access or PARALLEL for LDP/STP.  */
12260 static bool
12261 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12262 				aarch64_addr_query_type type)
12263 {
12264   struct aarch64_address_info addr;
12265   unsigned int size, vec_flags;
12266 
12267   /* Check all addresses are Pmode - including ILP32.  */
12268   if (GET_MODE (x) != Pmode
12269       && (!CONST_INT_P (x)
12270 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12271     {
12272       output_operand_lossage ("invalid address mode");
12273       return false;
12274     }
12275 
12276   if (aarch64_classify_address (&addr, x, mode, true, type))
12277     switch (addr.type)
12278       {
12279       case ADDRESS_REG_IMM:
12280 	if (known_eq (addr.const_offset, 0))
12281 	  {
12282 	    asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12283 	    return true;
12284 	  }
12285 
12286 	vec_flags = aarch64_classify_vector_mode (mode);
12287 	if (vec_flags & VEC_ANY_SVE)
12288 	  {
12289 	    HOST_WIDE_INT vnum
12290 	      = exact_div (addr.const_offset,
12291 			   aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12292 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
12293 			 reg_names[REGNO (addr.base)], vnum);
12294 	    return true;
12295 	  }
12296 
12297 	asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12298 		     INTVAL (addr.offset));
12299 	return true;
12300 
12301       case ADDRESS_REG_REG:
12302 	if (addr.shift == 0)
12303 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12304 		       reg_names [REGNO (addr.offset)]);
12305 	else
12306 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12307 		       reg_names [REGNO (addr.offset)], addr.shift);
12308 	return true;
12309 
12310       case ADDRESS_REG_UXTW:
12311 	if (addr.shift == 0)
12312 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12313 		       REGNO (addr.offset) - R0_REGNUM);
12314 	else
12315 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12316 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
12317 	return true;
12318 
12319       case ADDRESS_REG_SXTW:
12320 	if (addr.shift == 0)
12321 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12322 		       REGNO (addr.offset) - R0_REGNUM);
12323 	else
12324 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12325 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
12326 	return true;
12327 
12328       case ADDRESS_REG_WB:
12329 	/* Writeback is only supported for fixed-width modes.  */
12330 	size = GET_MODE_SIZE (mode).to_constant ();
12331 	switch (GET_CODE (x))
12332 	  {
12333 	  case PRE_INC:
12334 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12335 	    return true;
12336 	  case POST_INC:
12337 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12338 	    return true;
12339 	  case PRE_DEC:
12340 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12341 	    return true;
12342 	  case POST_DEC:
12343 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12344 	    return true;
12345 	  case PRE_MODIFY:
12346 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12347 			 INTVAL (addr.offset));
12348 	    return true;
12349 	  case POST_MODIFY:
12350 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12351 			 INTVAL (addr.offset));
12352 	    return true;
12353 	  default:
12354 	    break;
12355 	  }
12356 	break;
12357 
12358       case ADDRESS_LO_SUM:
12359 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12360 	output_addr_const (f, addr.offset);
12361 	asm_fprintf (f, "]");
12362 	return true;
12363 
12364       case ADDRESS_SYMBOLIC:
12365 	output_addr_const (f, x);
12366 	return true;
12367       }
12368 
12369   return false;
12370 }
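
/* Informal examples of the strings produced above, with arbitrary register
   numbers; these follow directly from the asm_fprintf formats and are only
   a reading aid:
     ADDRESS_REG_IMM    "[x0]", "[x0, 16]", "[x0, #2, mul vl]" (SVE)
     ADDRESS_REG_REG    "[x0, x1]", "[x0, x1, lsl 3]"
     ADDRESS_REG_UXTW   "[x0, w1, uxtw]", "[x0, w1, uxtw 2]"
     ADDRESS_REG_SXTW   "[x0, w1, sxtw]", "[x0, w1, sxtw 2]"
     ADDRESS_REG_WB     "[x0, 16]!", "[x0], 16", "[x0, -16]!", ...
     ADDRESS_LO_SUM     "[x0, #:lo12:sym]"  */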
12371 
12372 /* Print address 'x' of a memory access with mode 'mode'.  */
12373 static void
12374 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12375 {
12376   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12377     output_addr_const (f, x);
12378 }
12379 
12380 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA.  */
12381 
12382 static bool
12383 aarch64_output_addr_const_extra (FILE *file, rtx x)
12384 {
12385   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12386     {
12387       output_addr_const (file, XVECEXP (x, 0, 0));
12388       return true;
12389     }
12390   return false;
12391 }
12392 
12393 bool
12394 aarch64_label_mentioned_p (rtx x)
12395 {
12396   const char *fmt;
12397   int i;
12398 
12399   if (LABEL_REF_P (x))
12400     return true;
12401 
12402   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12403      referencing instruction, but they are constant offsets, not
12404      symbols.  */
12405   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12406     return false;
12407 
12408   fmt = GET_RTX_FORMAT (GET_CODE (x));
12409   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12410     {
12411       if (fmt[i] == 'E')
12412 	{
12413 	  int j;
12414 
12415 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12416 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12417 	      return 1;
12418 	}
12419       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12420 	return 1;
12421     }
12422 
12423   return 0;
12424 }
12425 
12426 /* Implement REGNO_REG_CLASS.  */
12427 
12428 enum reg_class
12429 aarch64_regno_regclass (unsigned regno)
12430 {
12431   if (STUB_REGNUM_P (regno))
12432     return STUB_REGS;
12433 
12434   if (GP_REGNUM_P (regno))
12435     return GENERAL_REGS;
12436 
12437   if (regno == SP_REGNUM)
12438     return STACK_REG;
12439 
12440   if (regno == FRAME_POINTER_REGNUM
12441       || regno == ARG_POINTER_REGNUM)
12442     return POINTER_REGS;
12443 
12444   if (FP_REGNUM_P (regno))
12445     return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12446 	    : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12447 
12448   if (PR_REGNUM_P (regno))
12449     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12450 
12451   if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12452     return FFR_REGS;
12453 
12454   return NO_REGS;
12455 }
12456 
12457 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12458    If OFFSET is out of range, return an offset of an anchor point
12459    that is in range.  Return 0 otherwise.  */
12460 
12461 static HOST_WIDE_INT
12462 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12463 		       machine_mode mode)
12464 {
12465   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
12466   if (size > 16)
12467     return (offset + 0x400) & ~0x7f0;
12468 
12469   /* For offsets that aren't a multiple of the access size, the limit is
12470      -256...255.  */
12471   if (offset & (size - 1))
12472     {
12473       /* BLKmode typically uses LDP of X-registers.  */
12474       if (mode == BLKmode)
12475 	return (offset + 512) & ~0x3ff;
12476       return (offset + 0x100) & ~0x1ff;
12477     }
12478 
12479   /* Small negative offsets are supported.  */
12480   if (IN_RANGE (offset, -256, 0))
12481     return 0;
12482 
12483   if (mode == TImode || mode == TFmode)
12484     return (offset + 0x100) & ~0x1ff;
12485 
12486   /* Use a 12-bit offset scaled by the access size.  */
12487   return offset & (~0xfff * size);
12488 }
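
/* Two worked examples of the clauses above, purely for illustration:
   with MODE == SImode (SIZE == 4) and OFFSET == 0x4008, the final clause
   gives 0x4008 & ~0x3fff == 0x4000, leaving a residual offset of 8 that
   fits the scaled unsigned 12-bit LDR/STR range; with OFFSET == 0x105
   (not a multiple of 4), the unaligned clause gives
   (0x105 + 0x100) & ~0x1ff == 0x200, leaving a residual of -251, which
   fits the signed 9-bit LDUR/STUR range.  */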
12489 
12490 static rtx
12491 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
12492 {
12493   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12494      where mask is selected by alignment and size of the offset.
12495      We try to pick as large a range for the offset as possible to
12496      maximize the chance of a CSE.  However, for aligned addresses
12497      we limit the range to 4k so that structures with different sized
12498      elements are likely to use the same base.  We need to be careful
12499      not to split a CONST for some forms of address expression, otherwise
12500      it will generate sub-optimal code.  */
12501 
12502   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12503     {
12504       rtx base = XEXP (x, 0);
12505       rtx offset_rtx = XEXP (x, 1);
12506       HOST_WIDE_INT offset = INTVAL (offset_rtx);
12507 
12508       if (GET_CODE (base) == PLUS)
12509 	{
12510 	  rtx op0 = XEXP (base, 0);
12511 	  rtx op1 = XEXP (base, 1);
12512 
12513 	  /* Force any scaling into a temp for CSE.  */
12514 	  op0 = force_reg (Pmode, op0);
12515 	  op1 = force_reg (Pmode, op1);
12516 
12517 	  /* Let the pointer register be in op0.  */
12518 	  if (REG_POINTER (op1))
12519 	    std::swap (op0, op1);
12520 
12521 	  /* If the pointer is virtual or frame related, then we know that
12522 	     virtual register instantiation or register elimination is going
12523 	     to apply a second constant.  We want the two constants folded
12524 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
12525 	  if (virt_or_elim_regno_p (REGNO (op0)))
12526 	    {
12527 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12528 				   NULL_RTX, true, OPTAB_DIRECT);
12529 	      return gen_rtx_PLUS (Pmode, base, op1);
12530 	    }
12531 
12532 	  /* Otherwise, in order to encourage CSE (and thence loop strength
12533 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
12534 	  base = expand_binop (Pmode, add_optab, op0, op1,
12535 			       NULL_RTX, true, OPTAB_DIRECT);
12536 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12537 	}
12538 
12539       HOST_WIDE_INT size;
12540       if (GET_MODE_SIZE (mode).is_constant (&size))
12541 	{
12542 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12543 							     mode);
12544 	  if (base_offset != 0)
12545 	    {
12546 	      base = plus_constant (Pmode, base, base_offset);
12547 	      base = force_operand (base, NULL_RTX);
12548 	      return plus_constant (Pmode, base, offset - base_offset);
12549 	    }
12550 	}
12551     }
12552 
12553   return x;
12554 }
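
/* Continuing the SImode example above: for BASE + 0x4008 the anchor is
   0x4000, so the code forces BASE + 0x4000 into a temporary register and
   returns (plus temp 8); nearby accesses whose offsets fall in the same
   16k window can then CSE the shared anchor computation.  */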
12555 
12556 static reg_class_t
12557 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12558 			  reg_class_t rclass,
12559 			  machine_mode mode,
12560 			  secondary_reload_info *sri)
12561 {
12562   /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12563      LDR and STR.  See the comment at the head of aarch64-sve.md for
12564      more details about the big-endian handling.  */
12565   if (reg_class_subset_p (rclass, FP_REGS)
12566       && !((REG_P (x) && HARD_REGISTER_P (x))
12567 	   || aarch64_simd_valid_immediate (x, NULL))
12568       && mode != VNx16QImode)
12569     {
12570       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12571       if ((vec_flags & VEC_SVE_DATA)
12572 	  && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12573 	{
12574 	  sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12575 	  return NO_REGS;
12576 	}
12577     }
12578 
12579   /* If we have to disable direct literal pool loads and stores because the
12580      function is too big, then we need a scratch register.  */
12581   if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12582       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12583 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
12584       && !aarch64_pcrelative_literal_loads)
12585     {
12586       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12587       return NO_REGS;
12588     }
12589 
12590   /* Without the TARGET_SIMD instructions we cannot move a Q register
12591      to a Q register directly.  We need a scratch.  */
12592   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
12593       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
12594       && reg_class_subset_p (rclass, FP_REGS))
12595     {
12596       sri->icode = code_for_aarch64_reload_mov (mode);
12597       return NO_REGS;
12598     }
12599 
12600   /* A TFmode or TImode memory access should be handled via FP_REGS
12601      because AArch64 has richer addressing modes for LDR/STR instructions
12602      than for LDP/STP instructions.  */
12603   if (TARGET_FLOAT && rclass == GENERAL_REGS
12604       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12605     return FP_REGS;
12606 
12607   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
12608       return GENERAL_REGS;
12609 
12610   return NO_REGS;
12611 }
12612 
12613 static bool
12614 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12615 {
12616   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12617 
12618   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12619      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
12620   if (frame_pointer_needed)
12621     return to == HARD_FRAME_POINTER_REGNUM;
12622   return true;
12623 }
12624 
12625 poly_int64
12626 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12627 {
12628   aarch64_frame &frame = cfun->machine->frame;
12629 
12630   if (to == HARD_FRAME_POINTER_REGNUM)
12631     {
12632       if (from == ARG_POINTER_REGNUM)
12633 	return frame.bytes_above_hard_fp;
12634 
12635       if (from == FRAME_POINTER_REGNUM)
12636 	return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12637     }
12638 
12639   if (to == STACK_POINTER_REGNUM)
12640     {
12641       if (from == FRAME_POINTER_REGNUM)
12642 	return frame.frame_size - frame.bytes_above_locals;
12643     }
12644 
12645   return frame.frame_size;
12646 }
12647 
12648 
12649 /* Get return address without mangling.  */
12650 
12651 rtx
12652 aarch64_return_addr_rtx (void)
12653 {
12654   rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12655   /* Note: aarch64_return_address_signing_enabled only
12656      works after cfun->machine->frame.laid_out is set,
12657      so here we don't know if the return address will
12658      be signed or not.  */
12659   rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12660   emit_move_insn (lr, val);
12661   emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12662   return lr;
12663 }
12664 
12665 
12666 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
12667    previous frame.  */
12668 
12669 rtx
12670 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12671 {
12672   if (count != 0)
12673     return const0_rtx;
12674   return aarch64_return_addr_rtx ();
12675 }
12676 
12677 static void
12678 aarch64_asm_trampoline_template (FILE *f)
12679 {
12680   /* Even if the current function doesn't have branch protection, some
12681      later function might, and since this template is only generated once
12682      we have to add a BTI just in case.  */
12683   asm_fprintf (f, "\thint\t34 // bti c\n");
12684 
12685   if (TARGET_ILP32)
12686     {
12687       asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12688       asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12689     }
12690   else
12691     {
12692       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12693       asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12694     }
12695   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12696 
12697   /* We always emit a speculation barrier.
12698      This is because the same trampoline template is used for every nested
12699      function.  Since nested functions are neither particularly common nor
12700      performance-critical, we don't worry too much about the extra
12701      instructions to copy around.
12702      This is not yet a problem, since we have not yet implemented
12703      function-specific attributes to choose between hardening against
12704      straight-line speculation or not, but such attributes are likely to
12705      appear in the future.  */
12706   asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12707 
12708   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12709   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12710 }
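
/* For reference, and assuming the usual register assignments
   (IP1_REGNUM == x17, STATIC_CHAIN_REGNUM == x18), the LP64 template
   emitted above is roughly:
	hint	34		// bti c
	ldr	x17, .+20	// function address, filled in by
	ldr	x18, .+24	// aarch64_trampoline_init, as is the
	br	x17		// static chain value
	dsb	sy
	isb
	<two zeroed POINTER_BYTES slots>
   The two data slots start at byte 24, which is the tramp_code_sz used by
   aarch64_trampoline_init below.  */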
12711 
12712 static void
12713 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12714 {
12715   rtx fnaddr, mem, a_tramp;
12716   const int tramp_code_sz = 24;
12717 
12718   /* We don't need to copy the trailing D-words; we fill those in below.  */
12719   /* We create our own memory address in Pmode so that `emit_block_move` can
12720      use parts of the backend which expect Pmode addresses.  */
12721   rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12722   emit_block_move (gen_rtx_MEM (BLKmode, temp),
12723 		   assemble_trampoline_template (),
12724 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12725   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12726   fnaddr = XEXP (DECL_RTL (fndecl), 0);
12727   if (GET_MODE (fnaddr) != ptr_mode)
12728     fnaddr = convert_memory_address (ptr_mode, fnaddr);
12729   emit_move_insn (mem, fnaddr);
12730 
12731   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12732   emit_move_insn (mem, chain_value);
12733 
12734   /* XXX We should really define a "clear_cache" pattern and use
12735      gen_clear_cache().  */
12736   a_tramp = XEXP (m_tramp, 0);
12737   maybe_emit_call_builtin___clear_cache (a_tramp,
12738 					 plus_constant (ptr_mode,
12739 							a_tramp,
12740 							TRAMPOLINE_SIZE));
12741 }
12742 
12743 static unsigned char
12744 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12745 {
12746   /* ??? Logically we should only need to provide a value when
12747      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12748      can hold MODE, but at the moment we need to handle all modes.
12749      Just ignore any runtime parts for registers that can't store them.  */
12750   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12751   unsigned int nregs, vec_flags;
12752   switch (regclass)
12753     {
12754     case STUB_REGS:
12755     case TAILCALL_ADDR_REGS:
12756     case POINTER_REGS:
12757     case GENERAL_REGS:
12758     case ALL_REGS:
12759     case POINTER_AND_FP_REGS:
12760     case FP_REGS:
12761     case FP_LO_REGS:
12762     case FP_LO8_REGS:
12763       vec_flags = aarch64_classify_vector_mode (mode);
12764       if ((vec_flags & VEC_SVE_DATA)
12765 	  && constant_multiple_p (GET_MODE_SIZE (mode),
12766 				  aarch64_vl_bytes (mode, vec_flags), &nregs))
12767 	return nregs;
12768       return (vec_flags & VEC_ADVSIMD
12769 	      ? CEIL (lowest_size, UNITS_PER_VREG)
12770 	      : CEIL (lowest_size, UNITS_PER_WORD));
12771     case STACK_REG:
12772     case PR_REGS:
12773     case PR_LO_REGS:
12774     case PR_HI_REGS:
12775     case FFR_REGS:
12776     case PR_AND_FFR_REGS:
12777       return 1;
12778 
12779     case NO_REGS:
12780       return 0;
12781 
12782     default:
12783       break;
12784     }
12785   gcc_unreachable ();
12786 }
12787 
12788 static reg_class_t
12789 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12790 {
12791   if (regclass == POINTER_REGS)
12792     return GENERAL_REGS;
12793 
12794   if (regclass == STACK_REG)
12795     {
12796       if (REG_P(x)
12797 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12798 	  return regclass;
12799 
12800       return NO_REGS;
12801     }
12802 
12803   /* Register elimination can result in a request for
12804      SP+constant->FP_REGS.  We cannot support such operations, which
12805      use SP as source and an FP_REG as destination, so reject them
12806      outright.  */
12807   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12808     {
12809       rtx lhs = XEXP (x, 0);
12810 
12811       /* Look through a possible SUBREG introduced by ILP32.  */
12812       if (SUBREG_P (lhs))
12813 	lhs = SUBREG_REG (lhs);
12814 
12815       gcc_assert (REG_P (lhs));
12816       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12817 				      POINTER_REGS));
12818       return NO_REGS;
12819     }
12820 
12821   return regclass;
12822 }
12823 
12824 void
12825 aarch64_asm_output_labelref (FILE* f, const char *name)
12826 {
12827   asm_fprintf (f, "%U%s", name);
12828 }
12829 
12830 static void
12831 aarch64_elf_asm_constructor (rtx symbol, int priority)
12832 {
12833   if (priority == DEFAULT_INIT_PRIORITY)
12834     default_ctor_section_asm_out_constructor (symbol, priority);
12835   else
12836     {
12837       section *s;
12838       /* While priority is known to be in the range [0, 65535], so 18 bytes
12839          would be enough, the compiler might not know that.  To avoid a
12840          -Wformat-truncation false positive, use a larger size.  */
12841       char buf[23];
12842       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12843       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12844       switch_to_section (s);
12845       assemble_align (POINTER_SIZE);
12846       assemble_aligned_integer (POINTER_BYTES, symbol);
12847     }
12848 }
12849 
12850 static void
12851 aarch64_elf_asm_destructor (rtx symbol, int priority)
12852 {
12853   if (priority == DEFAULT_INIT_PRIORITY)
12854     default_dtor_section_asm_out_destructor (symbol, priority);
12855   else
12856     {
12857       section *s;
12858       /* While priority is known to be in the range [0, 65535], so 18 bytes
12859          would be enough, the compiler might not know that.  To avoid a
12860          -Wformat-truncation false positive, use a larger size.  */
12861       char buf[23];
12862       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12863       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12864       switch_to_section (s);
12865       assemble_align (POINTER_SIZE);
12866       assemble_aligned_integer (POINTER_BYTES, symbol);
12867     }
12868 }
12869 
12870 const char*
12871 aarch64_output_casesi (rtx *operands)
12872 {
12873   char buf[100];
12874   char label[100];
12875   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12876   int index;
12877   static const char *const patterns[4][2] =
12878   {
12879     {
12880       "ldrb\t%w3, [%0,%w1,uxtw]",
12881       "add\t%3, %4, %w3, sxtb #2"
12882     },
12883     {
12884       "ldrh\t%w3, [%0,%w1,uxtw #1]",
12885       "add\t%3, %4, %w3, sxth #2"
12886     },
12887     {
12888       "ldr\t%w3, [%0,%w1,uxtw #2]",
12889       "add\t%3, %4, %w3, sxtw #2"
12890     },
12891     /* We assume that DImode is only generated when not optimizing and
12892        that we don't really need 64-bit address offsets.  That would
12893        imply an object file with 8GB of code in a single function!  */
12894     {
12895       "ldr\t%w3, [%0,%w1,uxtw #2]",
12896       "add\t%3, %4, %w3, sxtw #2"
12897     }
12898   };
12899 
12900   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
12901 
12902   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
12903   index = exact_log2 (GET_MODE_SIZE (mode));
12904 
12905   gcc_assert (index >= 0 && index <= 3);
12906 
12907   /* Need to implement table size reduction, by changing the code below.  */
12908   output_asm_insn (patterns[index][0], operands);
12909   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
12910   snprintf (buf, sizeof (buf),
12911 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
12912   output_asm_insn (buf, operands);
12913   output_asm_insn (patterns[index][1], operands);
12914   output_asm_insn ("br\t%3", operands);
12915   output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
12916 		   operands);
12917   assemble_label (asm_out_file, label);
12918   return "";
12919 }
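
/* As a sketch, for a HImode dispatch table (index == 1) with operands in,
   say, x0/w1/x3/x4, the sequence emitted above is approximately:
	ldrh	w3, [x0,w1,uxtw #1]
	adr	x4, .LrtxN
	add	x3, x4, w3, sxth #2
	br	x3
	<SLS barrier, if enabled>
   .LrtxN:
   i.e. a halfword table entry is loaded, scaled by 4 (the "#2" shift) and
   added to the address of the label that follows the branch.  */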
12920 
12921 
12922 /* Return size in bits of an arithmetic operand which is shifted/scaled and
12923    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
12924    operator.  */
12925 
12926 int
12927 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
12928 {
12929   if (shift >= 0 && shift <= 3)
12930     {
12931       int size;
12932       for (size = 8; size <= 32; size *= 2)
12933 	{
12934 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
12935 	  if (mask == bits << shift)
12936 	    return size;
12937 	}
12938     }
12939   return 0;
12940 }
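
/* For example, MASK == 0x3fc with SHIFT == 2 is 0xff << 2, so the function
   returns 8 and the operand can use a UXTB-style extend; masks that are not
   a shifted 0xff, 0xffff or 0xffffffff (or shifts outside 0..3) return 0.  */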
12941 
12942 /* Constant pools are per-function only when PC-relative
12943    literal loads are enabled or we are in the large memory
12944    model.  */
12945 
12946 static inline bool
12947 aarch64_can_use_per_function_literal_pools_p (void)
12948 {
12949   return (aarch64_pcrelative_literal_loads
12950 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
12951 }
12952 
12953 static bool
12954 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
12955 {
12956   /* We can't use blocks for constants when we're using a per-function
12957      constant pool.  */
12958   return !aarch64_can_use_per_function_literal_pools_p ();
12959 }
12960 
12961 /* Select appropriate section for constants depending
12962    on where we place literal pools.  */
12963 
12964 static section *
12965 aarch64_select_rtx_section (machine_mode mode,
12966 			    rtx x,
12967 			    unsigned HOST_WIDE_INT align)
12968 {
12969   if (aarch64_can_use_per_function_literal_pools_p ())
12970     return function_section (current_function_decl);
12971 
12972   return default_elf_select_rtx_section (mode, x, align);
12973 }
12974 
12975 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
12976 void
12977 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
12978 				  HOST_WIDE_INT offset)
12979 {
12980   /* When using per-function literal pools, we must ensure that any code
12981      section is aligned to the minimal instruction length, lest we get
12982      errors from the assembler re "unaligned instructions".  */
12983   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
12984     ASM_OUTPUT_ALIGN (f, 2);
12985 }
12986 
12987 /* Costs.  */
12988 
12989 /* Helper function for rtx cost calculation.  Strip a shift expression
12990    from X.  Returns the inner operand if successful, or the original
12991    expression on failure.  */
12992 static rtx
12993 aarch64_strip_shift (rtx x)
12994 {
12995   rtx op = x;
12996 
12997   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
12998      we can convert both to ROR during final output.  */
12999   if ((GET_CODE (op) == ASHIFT
13000        || GET_CODE (op) == ASHIFTRT
13001        || GET_CODE (op) == LSHIFTRT
13002        || GET_CODE (op) == ROTATERT
13003        || GET_CODE (op) == ROTATE)
13004       && CONST_INT_P (XEXP (op, 1)))
13005     return XEXP (op, 0);
13006 
13007   if (GET_CODE (op) == MULT
13008       && CONST_INT_P (XEXP (op, 1))
13009       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13010     return XEXP (op, 0);
13011 
13012   return x;
13013 }
13014 
13015 /* Helper function for rtx cost calculation.  Strip an extend
13016    expression from X.  Returns the inner operand if successful, or the
13017    original expression on failure.  We deal with a number of possible
13018    canonicalization variations here. If STRIP_SHIFT is true, then
13019    we can strip off a shift also.  */
13020 static rtx
13021 aarch64_strip_extend (rtx x, bool strip_shift)
13022 {
13023   scalar_int_mode mode;
13024   rtx op = x;
13025 
13026   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13027     return op;
13028 
13029   if (GET_CODE (op) == AND
13030       && GET_CODE (XEXP (op, 0)) == MULT
13031       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13032       && CONST_INT_P (XEXP (op, 1))
13033       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13034 			   INTVAL (XEXP (op, 1))) != 0)
13035     return XEXP (XEXP (op, 0), 0);
13036 
13037   /* Now handle extended register, as this may also have an optional
13038      left shift by 1..4.  */
13039   if (strip_shift
13040       && GET_CODE (op) == ASHIFT
13041       && CONST_INT_P (XEXP (op, 1))
13042       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13043     op = XEXP (op, 0);
13044 
13045   if (GET_CODE (op) == ZERO_EXTEND
13046       || GET_CODE (op) == SIGN_EXTEND)
13047     op = XEXP (op, 0);
13048 
13049   if (op != x)
13050     return op;
13051 
13052   return x;
13053 }
13054 
13055 /* Helper function for rtx cost calculation. Strip extension as well as any
13056    inner VEC_SELECT high-half from X. Returns the inner vector operand if
13057    successful, or the original expression on failure.  */
13058 static rtx
13059 aarch64_strip_extend_vec_half (rtx x)
13060 {
13061   if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13062     {
13063       x = XEXP (x, 0);
13064       if (GET_CODE (x) == VEC_SELECT
13065 	  && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13066 				    XEXP (x, 1)))
13067 	x = XEXP (x, 0);
13068     }
13069   return x;
13070 }
13071 
13072 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13073    any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13074    operand if successful, or the original expression on failure.  */
13075 static rtx
13076 aarch64_strip_duplicate_vec_elt (rtx x)
13077 {
13078   if (GET_CODE (x) == VEC_DUPLICATE
13079       && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13080     {
13081       x = XEXP (x, 0);
13082       if (GET_CODE (x) == VEC_SELECT)
13083 	x = XEXP (x, 0);
13084       else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13085 	       && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13086 	x = XEXP (XEXP (x, 0), 0);
13087     }
13088   return x;
13089 }
13090 
13091 /* Return true iff CODE is a shift supported in combination
13092    with arithmetic instructions.  */
13093 
13094 static bool
13095 aarch64_shift_p (enum rtx_code code)
13096 {
13097   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13098 }
13099 
13100 
13101 /* Return true iff X is a cheap shift without a sign extend. */
13102 
13103 static bool
13104 aarch64_cheap_mult_shift_p (rtx x)
13105 {
13106   rtx op0, op1;
13107 
13108   op0 = XEXP (x, 0);
13109   op1 = XEXP (x, 1);
13110 
13111   if (!(aarch64_tune_params.extra_tuning_flags
13112                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13113     return false;
13114 
13115   if (GET_CODE (op0) == SIGN_EXTEND)
13116     return false;
13117 
13118   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13119       && UINTVAL (op1) <= 4)
13120     return true;
13121 
13122   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13123     return false;
13124 
13125   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13126 
13127   if (l2 > 0 && l2 <= 4)
13128     return true;
13129 
13130   return false;
13131 }
13132 
13133 /* Helper function for rtx cost calculation.  Calculate the cost of
13134    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13135    Return the calculated cost of the expression, recursing manually in to
13136    operands where needed.  */
13137 
13138 static int
13139 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13140 {
13141   rtx op0, op1;
13142   const struct cpu_cost_table *extra_cost
13143     = aarch64_tune_params.insn_extra_cost;
13144   int cost = 0;
13145   bool compound_p = (outer == PLUS || outer == MINUS);
13146   machine_mode mode = GET_MODE (x);
13147 
13148   gcc_checking_assert (code == MULT);
13149 
13150   op0 = XEXP (x, 0);
13151   op1 = XEXP (x, 1);
13152 
13153   if (VECTOR_MODE_P (mode))
13154     {
13155       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13156       if (vec_flags & VEC_ADVSIMD)
13157 	{
13158 	  /* The select-operand-high-half versions of the instruction have the
13159 	     same cost as the three vector version - don't add the costs of the
13160 	     extension or selection into the costs of the multiply.  */
13161 	  op0 = aarch64_strip_extend_vec_half (op0);
13162 	  op1 = aarch64_strip_extend_vec_half (op1);
13163 	  /* The by-element versions of the instruction have the same costs as
13164 	     the normal 3-vector version.  We make an assumption that the input
13165 	     to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
13166 	     costing of a MUL by element pre RA is a bit optimistic.  */
13167 	  op0 = aarch64_strip_duplicate_vec_elt (op0);
13168 	  op1 = aarch64_strip_duplicate_vec_elt (op1);
13169 	}
13170       cost += rtx_cost (op0, mode, MULT, 0, speed);
13171       cost += rtx_cost (op1, mode, MULT, 1, speed);
13172       if (speed)
13173 	{
13174 	  if (GET_CODE (x) == MULT)
13175 	    cost += extra_cost->vect.mult;
13176 	  /* This is to catch the SSRA costing currently flowing here.  */
13177 	  else
13178 	    cost += extra_cost->vect.alu;
13179 	}
13180       return cost;
13181     }
13182 
13183   /* Integer multiply/fma.  */
13184   if (GET_MODE_CLASS (mode) == MODE_INT)
13185     {
13186       /* The multiply will be canonicalized as a shift, so cost it as such.  */
13187       if (aarch64_shift_p (GET_CODE (x))
13188 	  || (CONST_INT_P (op1)
13189 	      && exact_log2 (INTVAL (op1)) > 0))
13190 	{
13191 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13192 	                   || GET_CODE (op0) == SIGN_EXTEND;
13193 	  if (speed)
13194 	    {
13195 	      if (compound_p)
13196 	        {
13197 		  /* If the shift is considered cheap,
13198 		     then don't add any cost. */
13199 		  if (aarch64_cheap_mult_shift_p (x))
13200 		    ;
13201 	          else if (REG_P (op1))
13202 		    /* ARITH + shift-by-register.  */
13203 		    cost += extra_cost->alu.arith_shift_reg;
13204 		  else if (is_extend)
13205 		    /* ARITH + extended register.  We don't have a cost field
13206 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
13207 		    cost += extra_cost->alu.extend_arith;
13208 		  else
13209 		    /* ARITH + shift-by-immediate.  */
13210 		    cost += extra_cost->alu.arith_shift;
13211 		}
13212 	      else
13213 		/* LSL (immediate).  */
13214 	        cost += extra_cost->alu.shift;
13215 
13216 	    }
13217 	  /* Strip extends as we will have costed them in the case above.  */
13218 	  if (is_extend)
13219 	    op0 = aarch64_strip_extend (op0, true);
13220 
13221 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13222 
13223 	  return cost;
13224 	}
13225 
13226       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
13227 	 compound and let the below cases handle it.  After all, MNEG is a
13228 	 special-case alias of MSUB.  */
13229       if (GET_CODE (op0) == NEG)
13230 	{
13231 	  op0 = XEXP (op0, 0);
13232 	  compound_p = true;
13233 	}
13234 
13235       /* Integer multiplies or FMAs have zero/sign extending variants.  */
13236       if ((GET_CODE (op0) == ZERO_EXTEND
13237 	   && GET_CODE (op1) == ZERO_EXTEND)
13238 	  || (GET_CODE (op0) == SIGN_EXTEND
13239 	      && GET_CODE (op1) == SIGN_EXTEND))
13240 	{
13241 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13242 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13243 
13244 	  if (speed)
13245 	    {
13246 	      if (compound_p)
13247 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
13248 		cost += extra_cost->mult[0].extend_add;
13249 	      else
13250 		/* MUL/SMULL/UMULL.  */
13251 		cost += extra_cost->mult[0].extend;
13252 	    }
13253 
13254 	  return cost;
13255 	}
13256 
13257       /* This is either an integer multiply or a MADD.  In both cases
13258 	 we want to recurse and cost the operands.  */
13259       cost += rtx_cost (op0, mode, MULT, 0, speed);
13260       cost += rtx_cost (op1, mode, MULT, 1, speed);
13261 
13262       if (speed)
13263 	{
13264 	  if (compound_p)
13265 	    /* MADD/MSUB.  */
13266 	    cost += extra_cost->mult[mode == DImode].add;
13267 	  else
13268 	    /* MUL.  */
13269 	    cost += extra_cost->mult[mode == DImode].simple;
13270 	}
13271 
13272       return cost;
13273     }
13274   else
13275     {
13276       if (speed)
13277 	{
13278 	  /* Floating-point FMA/FMUL can also support negations of the
13279 	     operands, unless the rounding mode is upward or downward, in
13280 	     which case FNMUL is different from FMUL with operand negation.  */
13281 	  bool neg0 = GET_CODE (op0) == NEG;
13282 	  bool neg1 = GET_CODE (op1) == NEG;
13283 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
13284 	    {
13285 	      if (neg0)
13286 		op0 = XEXP (op0, 0);
13287 	      if (neg1)
13288 		op1 = XEXP (op1, 0);
13289 	    }
13290 
13291 	  if (compound_p)
13292 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
13293 	    cost += extra_cost->fp[mode == DFmode].fma;
13294 	  else
13295 	    /* FMUL/FNMUL.  */
13296 	    cost += extra_cost->fp[mode == DFmode].mult;
13297 	}
13298 
13299       cost += rtx_cost (op0, mode, MULT, 0, speed);
13300       cost += rtx_cost (op1, mode, MULT, 1, speed);
13301       return cost;
13302     }
13303 }
13304 
13305 static int
13306 aarch64_address_cost (rtx x,
13307 		      machine_mode mode,
13308 		      addr_space_t as ATTRIBUTE_UNUSED,
13309 		      bool speed)
13310 {
13311   enum rtx_code c = GET_CODE (x);
13312   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13313   struct aarch64_address_info info;
13314   int cost = 0;
13315   info.shift = 0;
13316 
13317   if (!aarch64_classify_address (&info, x, mode, false))
13318     {
13319       if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13320 	{
13321 	  /* This is a CONST or SYMBOL ref which will be split
13322 	     in a different way depending on the code model in use.
13323 	     Cost it through the generic infrastructure.  */
13324 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13325 	  /* Divide through by the cost of one instruction to
13326 	     bring it to the same units as the address costs.  */
13327 	  cost_symbol_ref /= COSTS_N_INSNS (1);
13328 	  /* The cost is then the cost of preparing the address,
13329 	     followed by an immediate (possibly 0) offset.  */
13330 	  return cost_symbol_ref + addr_cost->imm_offset;
13331 	}
13332       else
13333 	{
13334 	  /* This is most likely a jump table from a case
13335 	     statement.  */
13336 	  return addr_cost->register_offset;
13337 	}
13338     }
13339 
13340   switch (info.type)
13341     {
13342       case ADDRESS_LO_SUM:
13343       case ADDRESS_SYMBOLIC:
13344       case ADDRESS_REG_IMM:
13345 	cost += addr_cost->imm_offset;
13346 	break;
13347 
13348       case ADDRESS_REG_WB:
13349 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13350 	  cost += addr_cost->pre_modify;
13351 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13352 	  {
13353 	    unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13354 	    if (nvectors == 3)
13355 	      cost += addr_cost->post_modify_ld3_st3;
13356 	    else if (nvectors == 4)
13357 	      cost += addr_cost->post_modify_ld4_st4;
13358 	    else
13359 	      cost += addr_cost->post_modify;
13360 	  }
13361 	else
13362 	  gcc_unreachable ();
13363 
13364 	break;
13365 
13366       case ADDRESS_REG_REG:
13367 	cost += addr_cost->register_offset;
13368 	break;
13369 
13370       case ADDRESS_REG_SXTW:
13371 	cost += addr_cost->register_sextend;
13372 	break;
13373 
13374       case ADDRESS_REG_UXTW:
13375 	cost += addr_cost->register_zextend;
13376 	break;
13377 
13378       default:
13379 	gcc_unreachable ();
13380     }
13381 
13382 
13383   if (info.shift > 0)
13384     {
13385       /* For the sake of calculating the cost of the shifted register
13386 	 component, we can treat same sized modes in the same way.  */
13387       if (known_eq (GET_MODE_BITSIZE (mode), 16))
13388 	cost += addr_cost->addr_scale_costs.hi;
13389       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13390 	cost += addr_cost->addr_scale_costs.si;
13391       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13392 	cost += addr_cost->addr_scale_costs.di;
13393       else
13394 	/* We can't tell, or this is a 128-bit vector.  */
13395 	cost += addr_cost->addr_scale_costs.ti;
13396     }
13397 
13398   return cost;
13399 }
13400 
13401 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
13402    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
13403    to be taken.  */
13404 
13405 int
13406 aarch64_branch_cost (bool speed_p, bool predictable_p)
13407 {
13408   /* When optimizing for speed, use the cost of unpredictable branches.  */
13409   const struct cpu_branch_cost *branch_costs =
13410     aarch64_tune_params.branch_costs;
13411 
13412   if (!speed_p || predictable_p)
13413     return branch_costs->predictable;
13414   else
13415     return branch_costs->unpredictable;
13416 }
13417 
13418 /* Return true if X is a zero or sign extract
13419    usable in an ADD or SUB (extended register) instruction.  */
13420 static bool
13421 aarch64_rtx_arith_op_extract_p (rtx x)
13422 {
13423   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13424      No shift.  */
13425   if (GET_CODE (x) == SIGN_EXTEND
13426       || GET_CODE (x) == ZERO_EXTEND)
13427     return REG_P (XEXP (x, 0));
13428 
13429   return false;
13430 }
13431 
13432 static bool
13433 aarch64_frint_unspec_p (unsigned int u)
13434 {
13435   switch (u)
13436     {
13437       case UNSPEC_FRINTZ:
13438       case UNSPEC_FRINTP:
13439       case UNSPEC_FRINTM:
13440       case UNSPEC_FRINTA:
13441       case UNSPEC_FRINTN:
13442       case UNSPEC_FRINTX:
13443       case UNSPEC_FRINTI:
13444         return true;
13445 
13446       default:
13447         return false;
13448     }
13449 }
13450 
13451 /* Return true iff X is an rtx that will match an extr instruction
13452    i.e. as described in the *extr<mode>5_insn family of patterns.
13453    OP0 and OP1 will be set to the operands of the shifts involved
13454    on success and will be NULL_RTX otherwise.  */
13455 
13456 static bool
13457 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13458 {
13459   rtx op0, op1;
13460   scalar_int_mode mode;
13461   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13462     return false;
13463 
13464   *res_op0 = NULL_RTX;
13465   *res_op1 = NULL_RTX;
13466 
13467   if (GET_CODE (x) != IOR)
13468     return false;
13469 
13470   op0 = XEXP (x, 0);
13471   op1 = XEXP (x, 1);
13472 
13473   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13474       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13475     {
13476      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
13477       if (GET_CODE (op1) == ASHIFT)
13478         std::swap (op0, op1);
13479 
13480       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13481         return false;
13482 
13483       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13484       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13485 
13486       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13487           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13488         {
13489           *res_op0 = XEXP (op0, 0);
13490           *res_op1 = XEXP (op1, 0);
13491           return true;
13492         }
13493     }
13494 
13495   return false;
13496 }
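
/* For example, in DImode
     (ior (ashift a (const_int 8)) (lshiftrt b (const_int 56)))
   satisfies 8 + 56 == 64, so *RES_OP0 and *RES_OP1 are set to a and b and
   the IOR can typically be emitted as a single EXTR (or ROR when a == b).  */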
13497 
13498 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13499    storing it in *COST.  Result is true if the total cost of the operation
13500    has now been calculated.  */
13501 static bool
13502 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13503 {
13504   rtx inner;
13505   rtx comparator;
13506   enum rtx_code cmpcode;
13507   const struct cpu_cost_table *extra_cost
13508     = aarch64_tune_params.insn_extra_cost;
13509 
13510   if (COMPARISON_P (op0))
13511     {
13512       inner = XEXP (op0, 0);
13513       comparator = XEXP (op0, 1);
13514       cmpcode = GET_CODE (op0);
13515     }
13516   else
13517     {
13518       inner = op0;
13519       comparator = const0_rtx;
13520       cmpcode = NE;
13521     }
13522 
13523   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13524     {
13525       /* Conditional branch.  */
13526       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13527 	return true;
13528       else
13529 	{
13530 	  if (cmpcode == NE || cmpcode == EQ)
13531 	    {
13532 	      if (comparator == const0_rtx)
13533 		{
13534 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
13535 		  if (GET_CODE (inner) == ZERO_EXTRACT)
13536 		    /* TBZ/TBNZ.  */
13537 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13538 				       ZERO_EXTRACT, 0, speed);
13539 		  else
13540 		    /* CBZ/CBNZ.  */
13541 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13542 
13543 		  return true;
13544 		}
13545 	      if (register_operand (inner, VOIDmode)
13546 		  && aarch64_imm24 (comparator, VOIDmode))
13547 		{
13548 		  /* SUB and SUBS.  */
13549 		  *cost += COSTS_N_INSNS (2);
13550 		  if (speed)
13551 		    *cost += extra_cost->alu.arith * 2;
13552 		  return true;
13553 		}
13554 	    }
13555 	  else if (cmpcode == LT || cmpcode == GE)
13556 	    {
13557 	      /* TBZ/TBNZ.  */
13558 	      if (comparator == const0_rtx)
13559 		return true;
13560 	    }
13561 	}
13562     }
13563   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13564     {
13565       /* CCMP.  */
13566       if (GET_CODE (op1) == COMPARE)
13567 	{
13568 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
13569 	  if (XEXP (op1, 1) == const0_rtx)
13570 	    *cost += 1;
13571 	  if (speed)
13572 	    {
13573 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
13574 
13575 	      if (GET_MODE_CLASS (mode) == MODE_INT)
13576 		*cost += extra_cost->alu.arith;
13577 	      else
13578 		*cost += extra_cost->fp[mode == DFmode].compare;
13579 	    }
13580 	  return true;
13581 	}
13582 
13583       /* It's a conditional operation based on the status flags,
13584 	 so it must be some flavor of CSEL.  */
13585 
13586       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
13587       if (GET_CODE (op1) == NEG
13588           || GET_CODE (op1) == NOT
13589           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13590 	op1 = XEXP (op1, 0);
13591       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13592 	{
13593 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
13594 	  op1 = XEXP (op1, 0);
13595 	  op2 = XEXP (op2, 0);
13596 	}
13597       else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13598 	{
13599 	  inner = XEXP (op1, 0);
13600 	  if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13601 	    /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3).  */
13602 	    op1 = XEXP (inner, 0);
13603 	}
13604 
13605       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13606       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13607       return true;
13608     }
13609 
13610   /* We don't know what this is, cost all operands.  */
13611   return false;
13612 }
13613 
13614 /* Check whether X is a bitfield operation of the form shift + extend that
13615    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
13616    operand to which the bitfield operation is applied.  Otherwise return
13617    NULL_RTX.  */
13618 
13619 static rtx
13620 aarch64_extend_bitfield_pattern_p (rtx x)
13621 {
13622   rtx_code outer_code = GET_CODE (x);
13623   machine_mode outer_mode = GET_MODE (x);
13624 
13625   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13626       && outer_mode != SImode && outer_mode != DImode)
13627     return NULL_RTX;
13628 
13629   rtx inner = XEXP (x, 0);
13630   rtx_code inner_code = GET_CODE (inner);
13631   machine_mode inner_mode = GET_MODE (inner);
13632   rtx op = NULL_RTX;
13633 
13634   switch (inner_code)
13635     {
13636       case ASHIFT:
13637 	if (CONST_INT_P (XEXP (inner, 1))
13638 	    && (inner_mode == QImode || inner_mode == HImode))
13639 	  op = XEXP (inner, 0);
13640 	break;
13641       case LSHIFTRT:
13642 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13643 	    && (inner_mode == QImode || inner_mode == HImode))
13644 	  op = XEXP (inner, 0);
13645 	break;
13646       case ASHIFTRT:
13647 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13648 	    && (inner_mode == QImode || inner_mode == HImode))
13649 	  op = XEXP (inner, 0);
13650 	break;
13651       default:
13652 	break;
13653     }
13654 
13655   return op;
13656 }
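
/* For example, (zero_extend:SI (lshiftrt:HI r (const_int 3))) matches the
   LSHIFTRT case and roughly corresponds to a UBFX of R, while
   (sign_extend:SI (ashiftrt:QI r (const_int 2))) corresponds to an SBFX;
   in both cases R is returned so the caller can cost it directly.  */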
13657 
13658 /* Return true if the mask and a shift amount from an RTX of the form
13659    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13660    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
13661 
13662 bool
13663 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13664 				    rtx shft_amnt)
13665 {
13666   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13667 	 && INTVAL (mask) > 0
13668 	 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13669 	 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13670 	 && (UINTVAL (mask)
13671 	     & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13672 }
13673 
13674 /* Return true if the masks and a shift amount from an RTX of the form
13675    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13676    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
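/* For example (illustrative values only): in DImode, MASK2 == 0xff00,
   MASK1 == ~0xff00 and SHFT_AMNT == 8 satisfy the checks below, because
   0xff00 + (1 << 8) == 0x10000 is a power of two; that combination
   corresponds to BFI Xd, Xn, #8, #8.  */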
13677 
13678 bool
13679 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13680 				   unsigned HOST_WIDE_INT mask1,
13681 				   unsigned HOST_WIDE_INT shft_amnt,
13682 				   unsigned HOST_WIDE_INT mask2)
13683 {
13684   unsigned HOST_WIDE_INT t;
13685 
13686   /* Verify that there is no overlap in what bits are set in the two masks.  */
13687   if (mask1 != ~mask2)
13688     return false;
13689 
13690   /* Verify that mask2 is not all zeros or ones.  */
13691   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13692     return false;
13693 
13694   /* The shift amount should always be less than the mode size.  */
13695   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13696 
13697   /* Verify that the mask being shifted is contiguous and would be in the
13698      least significant bits after shifting by shft_amnt.  */
13699   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13700   return (t == (t & -t));
13701 }
13702 
13703 /* Calculate the cost of calculating X, storing it in *COST.  Result
13704    is true if the total cost of the operation has now been calculated.  */
13705 static bool
13706 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13707 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13708 {
13709   rtx op0, op1, op2;
13710   const struct cpu_cost_table *extra_cost
13711     = aarch64_tune_params.insn_extra_cost;
13712   rtx_code code = GET_CODE (x);
13713   scalar_int_mode int_mode;
13714 
13715   /* By default, assume that everything has equivalent cost to the
13716      cheapest instruction.  Any additional costs are applied as a delta
13717      above this default.  */
13718   *cost = COSTS_N_INSNS (1);
13719 
13720   switch (code)
13721     {
13722     case SET:
13723       /* The cost depends entirely on the operands to SET.  */
13724       *cost = 0;
13725       op0 = SET_DEST (x);
13726       op1 = SET_SRC (x);
13727 
13728       switch (GET_CODE (op0))
13729 	{
13730 	case MEM:
13731 	  if (speed)
13732 	    {
13733 	      rtx address = XEXP (op0, 0);
13734 	      if (VECTOR_MODE_P (mode))
13735 		*cost += extra_cost->ldst.storev;
13736 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
13737 		*cost += extra_cost->ldst.store;
13738 	      else if (mode == SFmode)
13739 		*cost += extra_cost->ldst.storef;
13740 	      else if (mode == DFmode)
13741 		*cost += extra_cost->ldst.stored;
13742 
13743 	      *cost +=
13744 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
13745 						     0, speed));
13746 	    }
13747 
13748 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
13749 	  return true;
13750 
13751 	case SUBREG:
13752 	  if (! REG_P (SUBREG_REG (op0)))
13753 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13754 
13755 	  /* Fall through.  */
13756 	case REG:
13757 	  /* The cost is one per vector-register copied.  */
13758 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13759 	    {
13760 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13761 	      *cost = COSTS_N_INSNS (nregs);
13762 	    }
13763 	  /* const0_rtx is in general free, but we will use an
13764 	     instruction to set a register to 0.  */
13765 	  else if (REG_P (op1) || op1 == const0_rtx)
13766 	    {
13767 	      /* The cost is 1 per register copied.  */
13768 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13769 	      *cost = COSTS_N_INSNS (nregs);
13770 	    }
13771           else
13772 	    /* Cost is just the cost of the RHS of the set.  */
13773 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
13774 	  return true;
13775 
13776 	case ZERO_EXTRACT:
13777 	case SIGN_EXTRACT:
13778 	  /* Bit-field insertion.  Strip any redundant widening of
13779 	     the RHS to meet the width of the target.  */
13780 	  if (SUBREG_P (op1))
13781 	    op1 = SUBREG_REG (op1);
13782 	  if ((GET_CODE (op1) == ZERO_EXTEND
13783 	       || GET_CODE (op1) == SIGN_EXTEND)
13784 	      && CONST_INT_P (XEXP (op0, 1))
13785 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13786 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13787 	    op1 = XEXP (op1, 0);
13788 
13789           if (CONST_INT_P (op1))
13790             {
13791               /* MOV immediate is assumed to always be cheap.  */
13792               *cost = COSTS_N_INSNS (1);
13793             }
13794           else
13795             {
13796               /* BFM.  */
13797 	      if (speed)
13798 		*cost += extra_cost->alu.bfi;
13799 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13800             }
13801 
13802 	  return true;
13803 
13804 	default:
13805 	  /* We can't make sense of this; assume the default cost.  */
13806           *cost = COSTS_N_INSNS (1);
13807 	  return false;
13808 	}
13809       return false;
13810 
13811     case CONST_INT:
13812       /* If an instruction can incorporate a constant within the
13813 	 instruction, the instruction's expression avoids calling
13814 	 rtx_cost() on the constant.  If rtx_cost() is called on a
13815 	 constant, then it is usually because the constant must be
13816 	 moved into a register by one or more instructions.
13817 
13818 	 The exception is constant 0, which can be expressed
13819 	 as XZR/WZR and is therefore free.  Even then, if we have
13820 	 (set (reg) (const0_rtx)) we must cost the move, but we can
13821 	 catch that when we cost the SET, so we don't need to
13822 	 consider it here.  */
13823       if (x == const0_rtx)
13824 	*cost = 0;
13825       else
13826 	{
13827 	  /* To an approximation, building any other constant is
13828 	     proportionally expensive to the number of instructions
13829 	     required to build that constant.  This is true whether we
13830 	     are compiling for SPEED or otherwise.  */
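	  /* As a rough illustration (not an exhaustive rule): a value such
	     as 0x1234 needs a single MOVZ and is costed as one instruction,
	     whereas something like 0x123456789abc typically needs a MOVZ
	     plus two MOVKs and is costed as three.  */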
13831 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
13832 	    int_mode = word_mode;
13833 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13834 				 (NULL_RTX, x, false, int_mode));
13835 	}
13836       return true;
13837 
13838     case CONST_DOUBLE:
13839 
13840       /* First determine number of instructions to do the move
13841 	  as an integer constant.  */
13842       if (!aarch64_float_const_representable_p (x)
13843 	   && !aarch64_can_const_movi_rtx_p (x, mode)
13844 	   && aarch64_float_const_rtx_p (x))
13845 	{
13846 	  unsigned HOST_WIDE_INT ival;
13847 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13848 	  gcc_assert (succeed);
13849 
13850 	  scalar_int_mode imode = (mode == HFmode
13851 				   ? SImode
13852 				   : int_mode_for_mode (mode).require ());
13853 	  int ncost = aarch64_internal_mov_immediate
13854 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
13855 	  *cost += COSTS_N_INSNS (ncost);
13856 	  return true;
13857 	}
13858 
13859       if (speed)
13860 	{
13861 	  /* mov[df,sf]_aarch64.  */
13862 	  if (aarch64_float_const_representable_p (x))
13863 	    /* FMOV (scalar immediate).  */
13864 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
13865 	  else if (!aarch64_float_const_zero_rtx_p (x))
13866 	    {
13867 	      /* This will be a load from memory.  */
13868 	      if (mode == DFmode)
13869 		*cost += extra_cost->ldst.loadd;
13870 	      else
13871 		*cost += extra_cost->ldst.loadf;
13872 	    }
13873 	  else
13874 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
13875 	       or MOV v0.s[0], wzr - neither of which is modeled by the
13876 	       cost tables.  Just use the default cost.  */
13877 	    {
13878 	    }
13879 	}
13880 
13881       return true;
13882 
13883     case MEM:
13884       if (speed)
13885 	{
13886 	  /* For loads we want the base cost of a load, plus an
13887 	     approximation for the additional cost of the addressing
13888 	     mode.  */
13889 	  rtx address = XEXP (x, 0);
13890 	  if (VECTOR_MODE_P (mode))
13891 	    *cost += extra_cost->ldst.loadv;
13892 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
13893 	    *cost += extra_cost->ldst.load;
13894 	  else if (mode == SFmode)
13895 	    *cost += extra_cost->ldst.loadf;
13896 	  else if (mode == DFmode)
13897 	    *cost += extra_cost->ldst.loadd;
13898 
13899 	  *cost +=
13900 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
13901 						     0, speed));
13902 	}
13903 
13904       return true;
13905 
13906     case NEG:
13907       op0 = XEXP (x, 0);
13908 
13909       if (VECTOR_MODE_P (mode))
13910 	{
13911 	  if (speed)
13912 	    {
13913 	      /* FNEG.  */
13914 	      *cost += extra_cost->vect.alu;
13915 	    }
13916 	  return false;
13917 	}
13918 
13919       if (GET_MODE_CLASS (mode) == MODE_INT)
13920 	{
13921           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
13922               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
13923             {
13924               /* CSETM.  */
13925 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
13926               return true;
13927             }
13928 
13929 	  /* Cost this as SUB wzr, X.  */
13930           op0 = CONST0_RTX (mode);
13931           op1 = XEXP (x, 0);
13932           goto cost_minus;
13933         }
13934 
13935       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13936         {
13937           /* Support (neg(fma...)) as a single instruction only if
13938              sign of zeros is unimportant.  This matches the decision
13939              making in aarch64.md.  */
13940           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
13941             {
13942 	      /* FNMADD.  */
13943 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
13944               return true;
13945             }
13946 	  if (GET_CODE (op0) == MULT)
13947 	    {
13948 	      /* FNMUL.  */
13949 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
13950 	      return true;
13951 	    }
13952 	  if (speed)
13953 	    /* FNEG.  */
13954 	    *cost += extra_cost->fp[mode == DFmode].neg;
13955           return false;
13956         }
13957 
13958       return false;
13959 
13960     case CLRSB:
13961     case CLZ:
13962       if (speed)
13963 	{
13964 	  if (VECTOR_MODE_P (mode))
13965 	    *cost += extra_cost->vect.alu;
13966 	  else
13967 	    *cost += extra_cost->alu.clz;
13968 	}
13969 
13970       return false;
13971 
13972     case CTZ:
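      /* There is no direct CTZ instruction; it is normally expanded as an
	 RBIT followed by a CLZ, hence the two-instruction baseline.  */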
13973       *cost = COSTS_N_INSNS (2);
13974 
13975       if (speed)
13976 	*cost += extra_cost->alu.clz + extra_cost->alu.rev;
13977       return false;
13978 
13979     case COMPARE:
13980       op0 = XEXP (x, 0);
13981       op1 = XEXP (x, 1);
13982 
13983       if (op1 == const0_rtx
13984 	  && GET_CODE (op0) == AND)
13985 	{
13986 	  x = op0;
13987 	  mode = GET_MODE (op0);
13988 	  goto cost_logic;
13989 	}
13990 
13991       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
13992         {
13993           /* TODO: A write to the CC flags possibly costs extra; this
13994 	     needs encoding in the cost tables.  */
13995 
13996 	  mode = GET_MODE (op0);
13997           /* ANDS.  */
13998           if (GET_CODE (op0) == AND)
13999             {
14000               x = op0;
14001               goto cost_logic;
14002             }
14003 
14004           if (GET_CODE (op0) == PLUS)
14005             {
14006 	      /* ADDS (and CMN alias).  */
14007               x = op0;
14008               goto cost_plus;
14009             }
14010 
14011           if (GET_CODE (op0) == MINUS)
14012             {
14013 	      /* SUBS.  */
14014               x = op0;
14015               goto cost_minus;
14016             }
14017 
14018 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14019 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14020 	      && CONST_INT_P (XEXP (op0, 2)))
14021 	    {
14022 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14023 		 Handle it here directly rather than going to cost_logic
14024 		 since we know the immediate generated for the TST is valid
14025 		 so we can avoid creating an intermediate rtx for it only
14026 		 for costing purposes.  */
14027 	      if (speed)
14028 		*cost += extra_cost->alu.logical;
14029 
14030 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14031 				 ZERO_EXTRACT, 0, speed);
14032 	      return true;
14033 	    }
14034 
14035           if (GET_CODE (op1) == NEG)
14036             {
14037 	      /* CMN.  */
14038 	      if (speed)
14039 		*cost += extra_cost->alu.arith;
14040 
14041 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14042 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14043               return true;
14044             }
14045 
14046           /* CMP.
14047 
14048 	     Compare can freely swap the order of operands, and
14049              canonicalization puts the more complex operation first.
14050              But the integer MINUS logic expects the shift/extend
14051              operation in op1.  */
14052           if (! (REG_P (op0)
14053 		 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14054           {
14055             op0 = XEXP (x, 1);
14056             op1 = XEXP (x, 0);
14057           }
14058           goto cost_minus;
14059         }
14060 
14061       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14062         {
14063 	  /* FCMP.  */
14064 	  if (speed)
14065 	    *cost += extra_cost->fp[mode == DFmode].compare;
14066 
14067           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14068             {
14069 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14070               /* FCMP supports constant 0.0 for no extra cost. */
14071               return true;
14072             }
14073           return false;
14074         }
14075 
14076       if (VECTOR_MODE_P (mode))
14077 	{
14078 	  /* Vector compare.  */
14079 	  if (speed)
14080 	    *cost += extra_cost->vect.alu;
14081 
14082 	  if (aarch64_float_const_zero_rtx_p (op1))
14083 	    {
14084 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14085 		 cost.  */
14086 	      return true;
14087 	    }
14088 	  return false;
14089 	}
14090       return false;
14091 
14092     case MINUS:
14093       {
14094 	op0 = XEXP (x, 0);
14095 	op1 = XEXP (x, 1);
14096 
14097 cost_minus:
14098 	if (VECTOR_MODE_P (mode))
14099 	  {
14100 	    /* SUBL2 and SUBW2.  */
14101 	    unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14102 	    if (vec_flags & VEC_ADVSIMD)
14103 	      {
14104 		/* The select-operand-high-half versions of the sub instruction
14105 		   have the same cost as the regular three-operand vector
14106 		   version, so don't add the cost of the select into the cost
14107 		   of the sub.  */
14108 		op0 = aarch64_strip_extend_vec_half (op0);
14109 		op1 = aarch64_strip_extend_vec_half (op1);
14110 	      }
14111 	  }
14112 
14113 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
14114 
14115 	/* Detect valid immediates.  */
14116 	if ((GET_MODE_CLASS (mode) == MODE_INT
14117 	     || (GET_MODE_CLASS (mode) == MODE_CC
14118 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14119 	    && CONST_INT_P (op1)
14120 	    && aarch64_uimm12_shift (INTVAL (op1)))
14121 	  {
14122 	    if (speed)
14123 	      /* SUB(S) (immediate).  */
14124 	      *cost += extra_cost->alu.arith;
14125 	    return true;
14126 	  }
14127 
14128 	/* Look for SUB (extended register).  */
14129 	if (is_a <scalar_int_mode> (mode)
14130 	    && aarch64_rtx_arith_op_extract_p (op1))
14131 	  {
14132 	    if (speed)
14133 	      *cost += extra_cost->alu.extend_arith;
14134 
14135 	    op1 = aarch64_strip_extend (op1, true);
14136 	    *cost += rtx_cost (op1, VOIDmode,
14137 			       (enum rtx_code) GET_CODE (op1), 0, speed);
14138 	    return true;
14139 	  }
14140 
14141 	rtx new_op1 = aarch64_strip_extend (op1, false);
14142 
14143 	/* Cost this as an FMA-alike operation.  */
14144 	if ((GET_CODE (new_op1) == MULT
14145 	     || aarch64_shift_p (GET_CODE (new_op1)))
14146 	    && code != COMPARE)
14147 	  {
14148 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14149 					    (enum rtx_code) code,
14150 					    speed);
14151 	    return true;
14152 	  }
14153 
14154 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14155 
14156 	if (speed)
14157 	  {
14158 	    if (VECTOR_MODE_P (mode))
14159 	      {
14160 		/* Vector SUB.  */
14161 		*cost += extra_cost->vect.alu;
14162 	      }
14163 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
14164 	      {
14165 		/* SUB(S).  */
14166 		*cost += extra_cost->alu.arith;
14167 	      }
14168 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14169 	      {
14170 		/* FSUB.  */
14171 		*cost += extra_cost->fp[mode == DFmode].addsub;
14172 	      }
14173 	  }
14174 	return true;
14175       }
14176 
14177     case PLUS:
14178       {
14179 	rtx new_op0;
14180 
14181 	op0 = XEXP (x, 0);
14182 	op1 = XEXP (x, 1);
14183 
14184 cost_plus:
14185 	if (VECTOR_MODE_P (mode))
14186 	  {
14187 	    /* ADDL2 and ADDW2.  */
14188 	    unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14189 	    if (vec_flags & VEC_ADVSIMD)
14190 	      {
14191 		/* The select-operand-high-half versions of the add instruction
14192 		   have the same cost as the regular three-operand vector
14193 		   version, so don't add the cost of the select into the cost
14194 		   of the add.  */
14195 		op0 = aarch64_strip_extend_vec_half (op0);
14196 		op1 = aarch64_strip_extend_vec_half (op1);
14197 	      }
14198 	  }
14199 
14200 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14201 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14202 	  {
14203 	    /* CSINC.  */
14204 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14205 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14206 	    return true;
14207 	  }
14208 
14209 	if (GET_MODE_CLASS (mode) == MODE_INT
14210 	    && (aarch64_plus_immediate (op1, mode)
14211 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
14212 	  {
14213 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14214 
14215 	    if (speed)
14216 	      {
14217 		/* ADD (immediate).  */
14218 		*cost += extra_cost->alu.arith;
14219 
14220 		/* Some tunings prefer to not use the VL-based scalar ops.
14221 		   Increase the cost of the poly immediate to prevent their
14222 		   formation.  */
14223 		if (GET_CODE (op1) == CONST_POLY_INT
14224 		    && (aarch64_tune_params.extra_tuning_flags
14225 			& AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14226 		  *cost += COSTS_N_INSNS (1);
14227 	      }
14228 	    return true;
14229 	  }
14230 
14231 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
14232 
14233 	/* Look for ADD (extended register).  */
14234 	if (is_a <scalar_int_mode> (mode)
14235 	    && aarch64_rtx_arith_op_extract_p (op0))
14236 	  {
14237 	    if (speed)
14238 	      *cost += extra_cost->alu.extend_arith;
14239 
14240 	    op0 = aarch64_strip_extend (op0, true);
14241 	    *cost += rtx_cost (op0, VOIDmode,
14242 			       (enum rtx_code) GET_CODE (op0), 0, speed);
14243 	    return true;
14244 	  }
14245 
14246 	/* Strip any extend; leave shifts behind, as we will
14247 	   cost them through mult_cost.  */
14248 	new_op0 = aarch64_strip_extend (op0, false);
14249 
14250 	if (GET_CODE (new_op0) == MULT
14251 	    || aarch64_shift_p (GET_CODE (new_op0)))
14252 	  {
14253 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14254 					    speed);
14255 	    return true;
14256 	  }
14257 
14258 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14259 
14260 	if (speed)
14261 	  {
14262 	    if (VECTOR_MODE_P (mode))
14263 	      {
14264 		/* Vector ADD.  */
14265 		*cost += extra_cost->vect.alu;
14266 	      }
14267 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
14268 	      {
14269 		/* ADD.  */
14270 		*cost += extra_cost->alu.arith;
14271 	      }
14272 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14273 	      {
14274 		/* FADD.  */
14275 		*cost += extra_cost->fp[mode == DFmode].addsub;
14276 	      }
14277 	  }
14278 	return true;
14279       }
14280 
14281     case BSWAP:
14282       *cost = COSTS_N_INSNS (1);
14283 
14284       if (speed)
14285 	{
14286 	  if (VECTOR_MODE_P (mode))
14287 	    *cost += extra_cost->vect.alu;
14288 	  else
14289 	    *cost += extra_cost->alu.rev;
14290 	}
14291       return false;
14292 
14293     case IOR:
14294       if (aarch_rev16_p (x))
14295         {
14296           *cost = COSTS_N_INSNS (1);
14297 
14298 	  if (speed)
14299 	    {
14300 	      if (VECTOR_MODE_P (mode))
14301 		*cost += extra_cost->vect.alu;
14302 	      else
14303 		*cost += extra_cost->alu.rev;
14304 	    }
14305 	  return true;
14306         }
14307 
14308       if (aarch64_extr_rtx_p (x, &op0, &op1))
14309         {
14310 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
14311 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
14312           if (speed)
14313             *cost += extra_cost->alu.shift;
14314 
14315           return true;
14316         }
14317     /* Fall through.  */
14318     case XOR:
14319     case AND:
14320     cost_logic:
14321       op0 = XEXP (x, 0);
14322       op1 = XEXP (x, 1);
14323 
14324       if (VECTOR_MODE_P (mode))
14325 	{
14326 	  if (speed)
14327 	    *cost += extra_cost->vect.alu;
14328 	  return true;
14329 	}
14330 
14331       if (code == AND
14332           && GET_CODE (op0) == MULT
14333           && CONST_INT_P (XEXP (op0, 1))
14334           && CONST_INT_P (op1)
14335           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14336                                INTVAL (op1)) != 0)
14337         {
14338           /* This is a UBFM/SBFM.  */
14339 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14340 	  if (speed)
14341 	    *cost += extra_cost->alu.bfx;
14342           return true;
14343         }
14344 
14345       if (is_int_mode (mode, &int_mode))
14346 	{
14347 	  if (CONST_INT_P (op1))
14348 	    {
14349 	      /* We have a mask + shift version of a UBFIZ
14350 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
14351 	      if (GET_CODE (op0) == ASHIFT
14352 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14353 							 XEXP (op0, 1)))
14354 		{
14355 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
14356 				     (enum rtx_code) code, 0, speed);
14357 		  if (speed)
14358 		    *cost += extra_cost->alu.bfx;
14359 
14360 		  return true;
14361 		}
14362 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14363 		{
14364 		  /* We may get the immediate for free; this is not
14365 		     modelled.  */
14366 		  *cost += rtx_cost (op0, int_mode,
14367 				     (enum rtx_code) code, 0, speed);
14368 		  if (speed)
14369 		    *cost += extra_cost->alu.logical;
14370 
14371 		  return true;
14372 		}
14373 	    }
14374 	  else
14375 	    {
14376 	      rtx new_op0 = op0;
14377 
14378 	      /* Handle ORN, EON, or BIC.  */
14379 	      if (GET_CODE (op0) == NOT)
14380 		op0 = XEXP (op0, 0);
14381 
14382 	      new_op0 = aarch64_strip_shift (op0);
14383 
14384 	      /* If we had a shift on op0 then this is a logical-shift-
14385 		 by-register/immediate operation.  Otherwise, this is just
14386 		 a logical operation.  */
14387 	      if (speed)
14388 		{
14389 		  if (new_op0 != op0)
14390 		    {
14391 		      /* Shift by immediate.  */
14392 		      if (CONST_INT_P (XEXP (op0, 1)))
14393 			*cost += extra_cost->alu.log_shift;
14394 		      else
14395 			*cost += extra_cost->alu.log_shift_reg;
14396 		    }
14397 		  else
14398 		    *cost += extra_cost->alu.logical;
14399 		}
14400 
14401 	      /* In both cases we want to cost both operands.  */
14402 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14403 				 0, speed);
14404 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14405 				 1, speed);
14406 
14407 	      return true;
14408 	    }
14409 	}
14410       return false;
14411 
14412     case NOT:
14413       x = XEXP (x, 0);
14414       op0 = aarch64_strip_shift (x);
14415 
14416       if (VECTOR_MODE_P (mode))
14417 	{
14418 	  /* Vector NOT.  */
14419 	  *cost += extra_cost->vect.alu;
14420 	  return false;
14421 	}
14422 
14423       /* MVN-shifted-reg.  */
14424       if (op0 != x)
14425         {
14426 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14427 
14428           if (speed)
14429             *cost += extra_cost->alu.log_shift;
14430 
14431           return true;
14432         }
14433       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14434          Handle the second form here taking care that 'a' in the above can
14435          be a shift.  */
14436       else if (GET_CODE (op0) == XOR)
14437         {
14438           rtx newop0 = XEXP (op0, 0);
14439           rtx newop1 = XEXP (op0, 1);
14440           rtx op0_stripped = aarch64_strip_shift (newop0);
14441 
14442 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14443 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14444 
14445           if (speed)
14446             {
14447               if (op0_stripped != newop0)
14448                 *cost += extra_cost->alu.log_shift;
14449               else
14450                 *cost += extra_cost->alu.logical;
14451             }
14452 
14453           return true;
14454         }
14455       /* MVN.  */
14456       if (speed)
14457 	*cost += extra_cost->alu.logical;
14458 
14459       return false;
14460 
14461     case ZERO_EXTEND:
14462 
14463       op0 = XEXP (x, 0);
14464       /* If a value is written in SI mode, then zero extended to DI
14465 	 mode, the operation will in general be free as a write to
14466 	 a 'w' register implicitly zeroes the upper bits of an 'x'
14467 	 register.  However, if this is
14468 
14469 	   (set (reg) (zero_extend (reg)))
14470 
14471 	 we must cost the explicit register move.  */
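      /* For example, (zero_extend:DI (plus:SI ...)) is costed as just the
	 inner 32-bit addition, because writing the W register already
	 zeroes the upper 32 bits (this is the non-zero OP_COST path
	 below).  */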
14472       if (mode == DImode
14473 	  && GET_MODE (op0) == SImode)
14474 	{
14475 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14476 
14477 	/* If OP_COST is non-zero, then the cost of the zero extend
14478 	   is effectively the cost of the inner operation.  Otherwise
14479 	   we have a MOV instruction and we take the cost from the MOV
14480 	   itself.  This is true independently of whether we are
14481 	   optimizing for space or time.  */
14482 	  if (op_cost)
14483 	    *cost = op_cost;
14484 
14485 	  return true;
14486 	}
14487       else if (MEM_P (op0))
14488 	{
14489 	  /* All loads can zero extend to any size for free.  */
14490 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14491 	  return true;
14492 	}
14493 
14494       op0 = aarch64_extend_bitfield_pattern_p (x);
14495       if (op0)
14496 	{
14497 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14498 	  if (speed)
14499 	    *cost += extra_cost->alu.bfx;
14500 	  return true;
14501 	}
14502 
14503       if (speed)
14504 	{
14505 	  if (VECTOR_MODE_P (mode))
14506 	    {
14507 	      /* UMOV.  */
14508 	      *cost += extra_cost->vect.alu;
14509 	    }
14510 	  else
14511 	    {
14512 	      /* We generate an AND instead of UXTB/UXTH.  */
14513 	      *cost += extra_cost->alu.logical;
14514 	    }
14515 	}
14516       return false;
14517 
14518     case SIGN_EXTEND:
14519       if (MEM_P (XEXP (x, 0)))
14520 	{
14521 	  /* LDRSH.  */
14522 	  if (speed)
14523 	    {
14524 	      rtx address = XEXP (XEXP (x, 0), 0);
14525 	      *cost += extra_cost->ldst.load_sign_extend;
14526 
14527 	      *cost +=
14528 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
14529 						     0, speed));
14530 	    }
14531 	  return true;
14532 	}
14533 
14534       op0 = aarch64_extend_bitfield_pattern_p (x);
14535       if (op0)
14536 	{
14537 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14538 	  if (speed)
14539 	    *cost += extra_cost->alu.bfx;
14540 	  return true;
14541 	}
14542 
14543       if (speed)
14544 	{
14545 	  if (VECTOR_MODE_P (mode))
14546 	    *cost += extra_cost->vect.alu;
14547 	  else
14548 	    *cost += extra_cost->alu.extend;
14549 	}
14550       return false;
14551 
14552     case ASHIFT:
14553       op0 = XEXP (x, 0);
14554       op1 = XEXP (x, 1);
14555 
14556       if (CONST_INT_P (op1))
14557         {
14558 	  if (speed)
14559 	    {
14560 	      if (VECTOR_MODE_P (mode))
14561 		{
14562 		  /* Vector shift (immediate).  */
14563 		  *cost += extra_cost->vect.alu;
14564 		}
14565 	      else
14566 		{
14567 		  /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
14568 		     aliases.  */
14569 		  *cost += extra_cost->alu.shift;
14570 		}
14571 	    }
14572 
14573           /* We can incorporate zero/sign extend for free.  */
14574           if (GET_CODE (op0) == ZERO_EXTEND
14575               || GET_CODE (op0) == SIGN_EXTEND)
14576             op0 = XEXP (op0, 0);
14577 
14578 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14579           return true;
14580         }
14581       else
14582         {
14583 	  if (VECTOR_MODE_P (mode))
14584 	    {
14585 	      if (speed)
14586 		/* Vector shift (register).  */
14587 		*cost += extra_cost->vect.alu;
14588 	    }
14589 	  else
14590 	    {
14591 	      if (speed)
14592 		/* LSLV.  */
14593 		*cost += extra_cost->alu.shift_reg;
14594 
14595 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14596 		  && CONST_INT_P (XEXP (op1, 1))
14597 		  && known_eq (INTVAL (XEXP (op1, 1)),
14598 			       GET_MODE_BITSIZE (mode) - 1))
14599 		{
14600 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14601 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
14602 		     don't recurse into it.  */
14603 		  return true;
14604 		}
14605 	    }
14606 	  return false;  /* All arguments need to be in registers.  */
14607         }
14608 
14609     case ROTATE:
14610     case ROTATERT:
14611     case LSHIFTRT:
14612     case ASHIFTRT:
14613       op0 = XEXP (x, 0);
14614       op1 = XEXP (x, 1);
14615 
14616       if (CONST_INT_P (op1))
14617 	{
14618 	  /* ASR (immediate) and friends.  */
14619 	  if (speed)
14620 	    {
14621 	      if (VECTOR_MODE_P (mode))
14622 		*cost += extra_cost->vect.alu;
14623 	      else
14624 		*cost += extra_cost->alu.shift;
14625 	    }
14626 
14627 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14628 	  return true;
14629 	}
14630       else
14631 	{
14632 	  if (VECTOR_MODE_P (mode))
14633 	    {
14634 	      if (speed)
14635 		/* Vector shift (register).  */
14636 		*cost += extra_cost->vect.alu;
14637 	    }
14638 	  else
14639 	    {
14640 	      if (speed)
14641 		/* ASR (register) and friends.  */
14642 		*cost += extra_cost->alu.shift_reg;
14643 
14644 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14645 		  && CONST_INT_P (XEXP (op1, 1))
14646 		  && known_eq (INTVAL (XEXP (op1, 1)),
14647 			       GET_MODE_BITSIZE (mode) - 1))
14648 		{
14649 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14650 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
14651 		     don't recurse into it.  */
14652 		  return true;
14653 		}
14654 	    }
14655 	  return false;  /* All arguments need to be in registers.  */
14656 	}
14657 
14658     case SYMBOL_REF:
14659 
14660       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14661 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14662 	{
14663 	  /* LDR.  */
14664 	  if (speed)
14665 	    *cost += extra_cost->ldst.load;
14666 	}
14667       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14668 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14669 	{
14670 	  /* ADRP, followed by ADD.  */
14671 	  *cost += COSTS_N_INSNS (1);
14672 	  if (speed)
14673 	    *cost += 2 * extra_cost->alu.arith;
14674 	}
14675       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14676 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14677 	{
14678 	  /* ADR.  */
14679 	  if (speed)
14680 	    *cost += extra_cost->alu.arith;
14681 	}
14682 
14683       if (flag_pic)
14684 	{
14685 	  /* One extra load instruction, after accessing the GOT.  */
14686 	  *cost += COSTS_N_INSNS (1);
14687 	  if (speed)
14688 	    *cost += extra_cost->ldst.load;
14689 	}
14690       return true;
14691 
14692     case HIGH:
14693     case LO_SUM:
14694       /* ADRP/ADD (immediate).  */
14695       if (speed)
14696 	*cost += extra_cost->alu.arith;
14697       return true;
14698 
14699     case ZERO_EXTRACT:
14700     case SIGN_EXTRACT:
14701       /* UBFX/SBFX.  */
14702       if (speed)
14703 	{
14704 	  if (VECTOR_MODE_P (mode))
14705 	    *cost += extra_cost->vect.alu;
14706 	  else
14707 	    *cost += extra_cost->alu.bfx;
14708 	}
14709 
14710       /* We can trust that the immediates used will be correct (there
14711 	 are no by-register forms), so we need only cost op0.  */
14712       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14713       return true;
14714 
14715     case MULT:
14716       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14717       /* aarch64_rtx_mult_cost always handles recursion to its
14718 	 operands.  */
14719       return true;
14720 
14721     case MOD:
14722     /* We can expand signed mod by power of 2 using a NEGS, two parallel
14723        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
14724        an unconditional negate.  This case should only ever be reached through
14725        the set_smod_pow2_cheap check in expmed.cc.  */
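    /* For instance, a signed x % 4 is expected to expand to roughly
       NEGS/AND/AND/CSNEG (modulo register allocation), which is why the
       baseline below is reset to four instructions.  */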
14726       if (CONST_INT_P (XEXP (x, 1))
14727 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14728 	  && (mode == SImode || mode == DImode))
14729 	{
14730 	  /* We expand to 4 instructions.  Reset the baseline.  */
14731 	  *cost = COSTS_N_INSNS (4);
14732 
14733 	  if (speed)
14734 	    *cost += 2 * extra_cost->alu.logical
14735 		     + 2 * extra_cost->alu.arith;
14736 
14737 	  return true;
14738 	}
14739 
14740     /* Fall-through.  */
14741     case UMOD:
14742       if (speed)
14743 	{
14744 	  /* Slightly prefer UMOD over SMOD.  */
14745 	  if (VECTOR_MODE_P (mode))
14746 	    *cost += extra_cost->vect.alu;
14747 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
14748 	    *cost += (extra_cost->mult[mode == DImode].add
14749 		      + extra_cost->mult[mode == DImode].idiv
14750 		      + (code == MOD ? 1 : 0));
14751 	}
14752       return false;  /* All arguments need to be in registers.  */
14753 
14754     case DIV:
14755     case UDIV:
14756     case SQRT:
14757       if (speed)
14758 	{
14759 	  if (VECTOR_MODE_P (mode))
14760 	    *cost += extra_cost->vect.alu;
14761 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
14762 	    /* There is no integer SQRT, so only DIV and UDIV can get
14763 	       here.  */
14764 	    *cost += (extra_cost->mult[mode == DImode].idiv
14765 		     /* Slightly prefer UDIV over SDIV.  */
14766 		     + (code == DIV ? 1 : 0));
14767 	  else
14768 	    *cost += extra_cost->fp[mode == DFmode].div;
14769 	}
14770       return false;  /* All arguments need to be in registers.  */
14771 
14772     case IF_THEN_ELSE:
14773       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14774 					 XEXP (x, 2), cost, speed);
14775 
14776     case EQ:
14777     case NE:
14778     case GT:
14779     case GTU:
14780     case LT:
14781     case LTU:
14782     case GE:
14783     case GEU:
14784     case LE:
14785     case LEU:
14786 
14787       return false; /* All arguments must be in registers.  */
14788 
14789     case FMA:
14790       op0 = XEXP (x, 0);
14791       op1 = XEXP (x, 1);
14792       op2 = XEXP (x, 2);
14793 
14794       if (speed)
14795 	{
14796 	  if (VECTOR_MODE_P (mode))
14797 	    *cost += extra_cost->vect.alu;
14798 	  else
14799 	    *cost += extra_cost->fp[mode == DFmode].fma;
14800 	}
14801 
14802       /* FMSUB, FNMADD, and FNMSUB are free.  */
14803       if (GET_CODE (op0) == NEG)
14804         op0 = XEXP (op0, 0);
14805 
14806       if (GET_CODE (op2) == NEG)
14807         op2 = XEXP (op2, 0);
14808 
14809       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14810 	 and the by-element operand as operand 0.  */
14811       if (GET_CODE (op1) == NEG)
14812         op1 = XEXP (op1, 0);
14813 
14814       /* Catch vector-by-element operations.  The by-element operand can
14815 	 either be (vec_duplicate (vec_select (x))) or just
14816 	 (vec_select (x)), depending on whether we are multiplying by
14817 	 a vector or a scalar.
14818 
14819 	 Canonicalization is not very good in these cases: FMA4 will put the
14820 	 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
14821       if (GET_CODE (op0) == VEC_DUPLICATE)
14822 	op0 = XEXP (op0, 0);
14823       else if (GET_CODE (op1) == VEC_DUPLICATE)
14824 	op1 = XEXP (op1, 0);
14825 
14826       if (GET_CODE (op0) == VEC_SELECT)
14827 	op0 = XEXP (op0, 0);
14828       else if (GET_CODE (op1) == VEC_SELECT)
14829 	op1 = XEXP (op1, 0);
14830 
14831       /* If the remaining parameters are not registers,
14832          get the cost to put them into registers.  */
14833       *cost += rtx_cost (op0, mode, FMA, 0, speed);
14834       *cost += rtx_cost (op1, mode, FMA, 1, speed);
14835       *cost += rtx_cost (op2, mode, FMA, 2, speed);
14836       return true;
14837 
14838     case FLOAT:
14839     case UNSIGNED_FLOAT:
14840       if (speed)
14841 	*cost += extra_cost->fp[mode == DFmode].fromint;
14842       return false;
14843 
14844     case FLOAT_EXTEND:
14845       if (speed)
14846 	{
14847 	  if (VECTOR_MODE_P (mode))
14848 	    {
14849 	      /* Vector widening conversion.  */
14850 	      *cost += extra_cost->vect.alu;
14851 	    }
14852 	  else
14853 	    *cost += extra_cost->fp[mode == DFmode].widen;
14854 	}
14855       return false;
14856 
14857     case FLOAT_TRUNCATE:
14858       if (speed)
14859 	{
14860 	  if (VECTOR_MODE_P (mode))
14861 	    {
14862 	      /* Vector narrowing conversion.  */
14863 	      *cost += extra_cost->vect.alu;
14864 	    }
14865 	  else
14866 	    *cost += extra_cost->fp[mode == DFmode].narrow;
14867 	}
14868       return false;
14869 
14870     case FIX:
14871     case UNSIGNED_FIX:
14872       x = XEXP (x, 0);
14873       /* Strip the rounding part.  They will all be implemented
14874          by the fcvt* family of instructions anyway.  */
14875       if (GET_CODE (x) == UNSPEC)
14876         {
14877           unsigned int uns_code = XINT (x, 1);
14878 
14879           if (uns_code == UNSPEC_FRINTA
14880               || uns_code == UNSPEC_FRINTM
14881               || uns_code == UNSPEC_FRINTN
14882               || uns_code == UNSPEC_FRINTP
14883               || uns_code == UNSPEC_FRINTZ)
14884             x = XVECEXP (x, 0, 0);
14885         }
14886 
14887       if (speed)
14888 	{
14889 	  if (VECTOR_MODE_P (mode))
14890 	    *cost += extra_cost->vect.alu;
14891 	  else
14892 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14893 	}
14894 
14895       /* We can combine fmul by a power of 2 followed by a fcvt into a single
14896 	 fixed-point fcvt.  */
14897       if (GET_CODE (x) == MULT
14898 	  && ((VECTOR_MODE_P (mode)
14899 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14900 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14901 	{
14902 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14903 			     0, speed);
14904 	  return true;
14905 	}
14906 
14907       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14908       return true;
14909 
14910     case ABS:
14911       if (VECTOR_MODE_P (mode))
14912 	{
14913 	  /* ABS (vector).  */
14914 	  if (speed)
14915 	    *cost += extra_cost->vect.alu;
14916 	}
14917       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14918 	{
14919 	  op0 = XEXP (x, 0);
14920 
14921 	  /* FABD, which is analogous to FADD.  */
14922 	  if (GET_CODE (op0) == MINUS)
14923 	    {
14924 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
14925 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
14926 	      if (speed)
14927 		*cost += extra_cost->fp[mode == DFmode].addsub;
14928 
14929 	      return true;
14930 	    }
14931 	  /* Simple FABS is analogous to FNEG.  */
14932 	  if (speed)
14933 	    *cost += extra_cost->fp[mode == DFmode].neg;
14934 	}
14935       else
14936 	{
14937 	  /* Integer ABS will either be split to
14938 	     two arithmetic instructions, or will be an ABS
14939 	     (scalar), which we don't model.  */
14940 	  *cost = COSTS_N_INSNS (2);
14941 	  if (speed)
14942 	    *cost += 2 * extra_cost->alu.arith;
14943 	}
14944       return false;
14945 
14946     case SMAX:
14947     case SMIN:
14948       if (speed)
14949 	{
14950 	  if (VECTOR_MODE_P (mode))
14951 	    *cost += extra_cost->vect.alu;
14952 	  else
14953 	    {
14954 	      /* FMAXNM/FMINNM/FMAX/FMIN.
14955 	         TODO: This may not be accurate for all implementations, but
14956 	         we do not model this in the cost tables.  */
14957 	      *cost += extra_cost->fp[mode == DFmode].addsub;
14958 	    }
14959 	}
14960       return false;
14961 
14962     case UNSPEC:
14963       /* The floating point round to integer frint* instructions.  */
14964       if (aarch64_frint_unspec_p (XINT (x, 1)))
14965         {
14966           if (speed)
14967             *cost += extra_cost->fp[mode == DFmode].roundint;
14968 
14969           return false;
14970         }
14971 
14972       if (XINT (x, 1) == UNSPEC_RBIT)
14973         {
14974           if (speed)
14975             *cost += extra_cost->alu.rev;
14976 
14977           return false;
14978         }
14979       break;
14980 
14981     case TRUNCATE:
14982 
14983       /* Decompose <su>muldi3_highpart.  */
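      /* That is, schematically (sign-extend variant shown; the zero-extend
	 form is analogous):
	   (truncate:DI
	     (lshiftrt:TI
	       (mult:TI (sign_extend:TI (reg:DI)) (sign_extend:TI (reg:DI)))
	       (const_int 64)))  */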
14984       if (/* (truncate:DI  */
14985 	  mode == DImode
14986 	  /*   (lshiftrt:TI  */
14987           && GET_MODE (XEXP (x, 0)) == TImode
14988           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
14989 	  /*      (mult:TI  */
14990           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
14991 	  /*        (ANY_EXTEND:TI (reg:DI))
14992 	            (ANY_EXTEND:TI (reg:DI)))  */
14993           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
14994                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
14995               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
14996                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
14997           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
14998           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
14999 	  /*     (const_int 64)  */
15000           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15001           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15002         {
15003           /* UMULH/SMULH.  */
15004 	  if (speed)
15005 	    *cost += extra_cost->mult[mode == DImode].extend;
15006 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15007 			     mode, MULT, 0, speed);
15008 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15009 			     mode, MULT, 1, speed);
15010           return true;
15011         }
15012 	break;
15013     case CONST_VECTOR:
15014 	{
15015 	  /* Load using MOVI/MVNI.  */
15016 	  if (aarch64_simd_valid_immediate (x, NULL))
15017 	    *cost = extra_cost->vect.movi;
15018 	  else /* Load using constant pool.  */
15019 	    *cost = extra_cost->ldst.load;
15020 	  break;
15021 	}
15022     case VEC_CONCAT:
15023 	/* Depending on the operation, either DUP or INS.
15024 	   For now, keep default costing.  */
15025 	break;
15026     case VEC_DUPLICATE:
15027 	/* Load using a DUP.  */
15028 	*cost = extra_cost->vect.dup;
15029 	return false;
15030     case VEC_SELECT:
15031 	{
15032 	  rtx op0 = XEXP (x, 0);
15033 	  *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15034 
15035 	  /* Cost subreg of 0 as free, otherwise as DUP.  */
15036 	  rtx op1 = XEXP (x, 1);
15037 	  if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15038 	    ;
15039 	  else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15040 	    *cost = extra_cost->vect.dup;
15041 	  else
15042 	    *cost = extra_cost->vect.extract;
15043 	  return true;
15044 	}
15045     default:
15046       break;
15047     }
15048 
15049   if (dump_file
15050       && flag_aarch64_verbose_cost)
15051     fprintf (dump_file,
15052       "\nFailed to cost RTX.  Assuming default cost.\n");
15053 
15054   return true;
15055 }
15056 
15057 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15058    calculated for X.  This cost is stored in *COST.  Returns true
15059    if the total cost of X was calculated.  */
15060 static bool
15061 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15062 		   int param, int *cost, bool speed)
15063 {
15064   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15065 
15066   if (dump_file
15067       && flag_aarch64_verbose_cost)
15068     {
15069       print_rtl_single (dump_file, x);
15070       fprintf (dump_file, "\n%s cost: %d (%s)\n",
15071 	       speed ? "Hot" : "Cold",
15072 	       *cost, result ? "final" : "partial");
15073     }
15074 
15075   return result;
15076 }
15077 
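/* Implement TARGET_REGISTER_MOVE_COST.  */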
15078 static int
15079 aarch64_register_move_cost (machine_mode mode,
15080 			    reg_class_t from_i, reg_class_t to_i)
15081 {
15082   enum reg_class from = (enum reg_class) from_i;
15083   enum reg_class to = (enum reg_class) to_i;
15084   const struct cpu_regmove_cost *regmove_cost
15085     = aarch64_tune_params.regmove_cost;
15086 
15087   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
15088   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15089       || to == STUB_REGS)
15090     to = GENERAL_REGS;
15091 
15092   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15093       || from == STUB_REGS)
15094     from = GENERAL_REGS;
15095 
15096   /* Make RDFFR very expensive.  In particular, if we know that the FFR
15097      contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15098      as a way of obtaining a PTRUE.  */
15099   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15100       && hard_reg_set_subset_p (reg_class_contents[from_i],
15101 				reg_class_contents[FFR_REGS]))
15102     return 80;
15103 
15104   /* Moving between GPR and stack cost is the same as GP2GP.  */
15105   if ((from == GENERAL_REGS && to == STACK_REG)
15106       || (to == GENERAL_REGS && from == STACK_REG))
15107     return regmove_cost->GP2GP;
15108 
15109   /* To/from the stack register, we move via the GPRs.  */
15110   if (to == STACK_REG || from == STACK_REG)
15111     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15112             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15113 
15114   if (known_eq (GET_MODE_SIZE (mode), 16))
15115     {
15116       /* 128-bit operations on general registers require 2 instructions.  */
15117       if (from == GENERAL_REGS && to == GENERAL_REGS)
15118 	return regmove_cost->GP2GP * 2;
15119       else if (from == GENERAL_REGS)
15120 	return regmove_cost->GP2FP * 2;
15121       else if (to == GENERAL_REGS)
15122 	return regmove_cost->FP2GP * 2;
15123 
15124       /* When AdvSIMD instructions are disabled it is not possible to move
15125 	 a 128-bit value directly between Q registers.  This is handled in
15126 	 secondary reload.  A general register is used as a scratch to move
15127 	 the upper DI value and the lower DI value is moved directly,
15128 	 hence the cost is the sum of three moves. */
15129       if (! TARGET_SIMD)
15130 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15131 
15132       return regmove_cost->FP2FP;
15133     }
15134 
15135   if (from == GENERAL_REGS && to == GENERAL_REGS)
15136     return regmove_cost->GP2GP;
15137   else if (from == GENERAL_REGS)
15138     return regmove_cost->GP2FP;
15139   else if (to == GENERAL_REGS)
15140     return regmove_cost->FP2GP;
15141 
15142   return regmove_cost->FP2FP;
15143 }
15144 
15145 /* Implements TARGET_MEMORY_MOVE_COST.  */
15146 static int
15147 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15148 {
15149   enum reg_class rclass = (enum reg_class) rclass_i;
15150   if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15151       ? reg_classes_intersect_p (rclass, PR_REGS)
15152       : reg_class_subset_p (rclass, PR_REGS))
15153     return (in
15154 	    ? aarch64_tune_params.memmov_cost.load_pred
15155 	    : aarch64_tune_params.memmov_cost.store_pred);
15156 
15157   if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15158       ? reg_classes_intersect_p (rclass, FP_REGS)
15159       : reg_class_subset_p (rclass, FP_REGS))
15160     return (in
15161 	    ? aarch64_tune_params.memmov_cost.load_fp
15162 	    : aarch64_tune_params.memmov_cost.store_fp);
15163 
15164   return (in
15165 	  ? aarch64_tune_params.memmov_cost.load_int
15166 	  : aarch64_tune_params.memmov_cost.store_int);
15167 }
15168 
15169 /* Implement TARGET_INIT_BUILTINS.  */
15170 static void
15171 aarch64_init_builtins ()
15172 {
15173   aarch64_general_init_builtins ();
15174   aarch64_sve::init_builtins ();
15175 #ifdef SUBTARGET_INIT_BUILTINS
15176   SUBTARGET_INIT_BUILTINS;
15177 #endif
15178 }
15179 
15180 /* Implement TARGET_FOLD_BUILTIN.  */
15181 static tree
15182 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15183 {
15184   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15185   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15186   tree type = TREE_TYPE (TREE_TYPE (fndecl));
15187   switch (code & AARCH64_BUILTIN_CLASS)
15188     {
15189     case AARCH64_BUILTIN_GENERAL:
15190       return aarch64_general_fold_builtin (subcode, type, nargs, args);
15191 
15192     case AARCH64_BUILTIN_SVE:
15193       return NULL_TREE;
15194     }
15195   gcc_unreachable ();
15196 }
15197 
15198 /* Implement TARGET_GIMPLE_FOLD_BUILTIN.  */
15199 static bool
15200 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15201 {
15202   gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15203   tree fndecl = gimple_call_fndecl (stmt);
15204   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15205   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15206   gimple *new_stmt = NULL;
15207   switch (code & AARCH64_BUILTIN_CLASS)
15208     {
15209     case AARCH64_BUILTIN_GENERAL:
15210       new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15211       break;
15212 
15213     case AARCH64_BUILTIN_SVE:
15214       new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15215       break;
15216     }
15217 
15218   if (!new_stmt)
15219     return false;
15220 
15221   gsi_replace (gsi, new_stmt, true);
15222   return true;
15223 }
15224 
15225 /* Implement TARGET_EXPAND_BUILTIN.  */
15226 static rtx
15227 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15228 {
15229   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15230   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15231   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15232   switch (code & AARCH64_BUILTIN_CLASS)
15233     {
15234     case AARCH64_BUILTIN_GENERAL:
15235       return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15236 
15237     case AARCH64_BUILTIN_SVE:
15238       return aarch64_sve::expand_builtin (subcode, exp, target);
15239     }
15240   gcc_unreachable ();
15241 }
15242 
15243 /* Implement TARGET_BUILTIN_DECL.  */
15244 static tree
15245 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15246 {
15247   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15248   switch (code & AARCH64_BUILTIN_CLASS)
15249     {
15250     case AARCH64_BUILTIN_GENERAL:
15251       return aarch64_general_builtin_decl (subcode, initialize_p);
15252 
15253     case AARCH64_BUILTIN_SVE:
15254       return aarch64_sve::builtin_decl (subcode, initialize_p);
15255     }
15256   gcc_unreachable ();
15257 }
15258 
15259 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15260    to optimize 1.0/sqrt.  */
15261 
15262 static bool
15263 use_rsqrt_p (machine_mode mode)
15264 {
15265   return (!flag_trapping_math
15266 	  && flag_unsafe_math_optimizations
15267 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
15268 	       & AARCH64_APPROX_MODE (mode))
15269 	      || flag_mrecip_low_precision_sqrt));
15270 }
15271 
15272 /* Function to decide when to use the approximate reciprocal square root
15273    builtin.  */
15274 
15275 static tree
15276 aarch64_builtin_reciprocal (tree fndecl)
15277 {
15278   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15279 
15280   if (!use_rsqrt_p (mode))
15281     return NULL_TREE;
15282   unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15283   unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15284   switch (code & AARCH64_BUILTIN_CLASS)
15285     {
15286     case AARCH64_BUILTIN_GENERAL:
15287       return aarch64_general_builtin_rsqrt (subcode);
15288 
15289     case AARCH64_BUILTIN_SVE:
15290       return NULL_TREE;
15291     }
15292   gcc_unreachable ();
15293 }
15294 
15295 /* Emit code to perform the floating-point operation:
15296 
15297      DST = SRC1 * SRC2
15298 
15299    where all three operands are already known to be registers.
15300    If the operation is an SVE one, PTRUE is a suitable all-true
15301    predicate.  */
15302 
15303 static void
15304 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15305 {
15306   if (ptrue)
15307     emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15308 				 dst, ptrue, src1, src2,
15309 				 gen_int_mode (SVE_RELAXED_GP, SImode)));
15310   else
15311     emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15312 }
15313 
15314 /* Emit instruction sequence to compute either the approximate square root
15315    or its approximate reciprocal, depending on the flag RECP, and return
15316    whether the sequence was emitted or not.  */
15317 
15318 bool
15319 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15320 {
15321   machine_mode mode = GET_MODE (dst);
15322 
15323   if (GET_MODE_INNER (mode) == HFmode)
15324     {
15325       gcc_assert (!recp);
15326       return false;
15327     }
15328 
15329   if (!recp)
15330     {
15331       if (!(flag_mlow_precision_sqrt
15332 	    || (aarch64_tune_params.approx_modes->sqrt
15333 		& AARCH64_APPROX_MODE (mode))))
15334 	return false;
15335 
15336       if (!flag_finite_math_only
15337 	  || flag_trapping_math
15338 	  || !flag_unsafe_math_optimizations
15339 	  || optimize_function_for_size_p (cfun))
15340 	return false;
15341     }
15342   else
15343     /* Caller assumes we cannot fail.  */
15344     gcc_assert (use_rsqrt_p (mode));
15345 
15346   rtx pg = NULL_RTX;
15347   if (aarch64_sve_mode_p (mode))
15348     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15349   machine_mode mmsk = (VECTOR_MODE_P (mode)
15350 		       ? related_int_vector_mode (mode).require ()
15351 		       : int_mode_for_mode (mode).require ());
15352   rtx xmsk = NULL_RTX;
15353   if (!recp)
15354     {
15355       /* When calculating the approximate square root, compare the
15356 	 argument with 0.0 and create a mask.  */
15357       rtx zero = CONST0_RTX (mode);
15358       if (pg)
15359 	{
15360 	  xmsk = gen_reg_rtx (GET_MODE (pg));
15361 	  rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15362 	  emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15363 					   xmsk, pg, hint, src, zero));
15364 	}
15365       else
15366 	{
15367 	  xmsk = gen_reg_rtx (mmsk);
15368 	  emit_insn (gen_rtx_SET (xmsk,
15369 				  gen_rtx_NEG (mmsk,
15370 					       gen_rtx_EQ (mmsk, src, zero))));
15371 	}
15372     }
15373 
15374   /* Estimate the approximate reciprocal square root.  */
15375   rtx xdst = gen_reg_rtx (mode);
15376   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15377 
15378   /* Iterate over the series twice for SF and thrice for DF.  */
15379   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15380 
15381   /* Optionally perform one fewer iteration, trading some accuracy
15382      for faster performance.  */
15383   if ((recp && flag_mrecip_low_precision_sqrt)
15384       || (!recp && flag_mlow_precision_sqrt))
15385     iterations--;
15386 
15387   /* Iterate over the series to calculate the approximate reciprocal square
15388      root.  */
15389   rtx x1 = gen_reg_rtx (mode);
15390   while (iterations--)
15391     {
15392       rtx x2 = gen_reg_rtx (mode);
15393       aarch64_emit_mult (x2, pg, xdst, xdst);
15394 
15395       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15396 
15397       if (iterations > 0)
15398 	aarch64_emit_mult (xdst, pg, xdst, x1);
15399     }
15400 
15401   if (!recp)
15402     {
15403       if (pg)
15404 	/* Multiply nonzero source values by the corresponding intermediate
15405 	   result elements, so that the final calculation is the approximate
15406 	   square root rather than its reciprocal.  Select a zero result for
15407 	   zero source values, to avoid the Inf * 0 -> NaN that we'd get
15408 	   otherwise.  */
15409 	emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15410 			     xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15411       else
15412 	{
15413 	  /* Qualify the approximate reciprocal square root when the
15414 	     argument is 0.0 by squashing the intermediary result to 0.0.  */
15415 	  rtx xtmp = gen_reg_rtx (mmsk);
15416 	  emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15417 					    gen_rtx_SUBREG (mmsk, xdst, 0)));
15418 	  emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15419 
15420 	  /* Calculate the approximate square root.  */
15421 	  aarch64_emit_mult (xdst, pg, xdst, src);
15422 	}
15423     }
15424 
15425   /* Finalize the approximation.  */
15426   aarch64_emit_mult (dst, pg, xdst, x1);
15427 
15428   return true;
15429 }
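
/* For reference, the loop above is the standard Newton-Raphson refinement
   of the hardware estimate: starting from x0 = FRSQRTE (s) ~= 1/sqrt (s),
   and using FRSQRTS (a, b) = (3 - a * b) / 2, each pass computes

     x_{n+1} = x_n * FRSQRTS (s, x_n * x_n) = x_n * (3 - s * x_n^2) / 2

   which roughly doubles the number of correct bits per iteration.  In the
   non-reciprocal case the refined estimate is also multiplied by s, since
   s * (1 / sqrt (s)) == sqrt (s).  */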
15430 
15431 /* Emit the instruction sequence to compute the approximation for the division
15432    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
15433 
15434 bool
15435 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15436 {
15437   machine_mode mode = GET_MODE (quo);
15438 
15439   if (GET_MODE_INNER (mode) == HFmode)
15440     return false;
15441 
15442   bool use_approx_division_p = (flag_mlow_precision_div
15443 			        || (aarch64_tune_params.approx_modes->division
15444 				    & AARCH64_APPROX_MODE (mode)));
15445 
15446   if (!flag_finite_math_only
15447       || flag_trapping_math
15448       || !flag_unsafe_math_optimizations
15449       || optimize_function_for_size_p (cfun)
15450       || !use_approx_division_p)
15451     return false;
15452 
15453   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15454     return false;
15455 
15456   rtx pg = NULL_RTX;
15457   if (aarch64_sve_mode_p (mode))
15458     pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15459 
15460   /* Estimate the approximate reciprocal.  */
15461   rtx xrcp = gen_reg_rtx (mode);
15462   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15463 
15464   /* Iterate over the series twice for SF and thrice for DF.  */
15465   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15466 
15467   /* Optionally perform fewer iterations, trading some accuracy for
15468      faster performance.  The parameter defaults are 2 for DF and 1 for SF.  */
15469   if (flag_mlow_precision_div)
15470     iterations = (GET_MODE_INNER (mode) == DFmode
15471 		  ? aarch64_double_recp_precision
15472 		  : aarch64_float_recp_precision);
15473 
15474   /* Iterate over the series to calculate the approximate reciprocal.  */
15475   rtx xtmp = gen_reg_rtx (mode);
15476   while (iterations--)
15477     {
15478       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15479 
15480       if (iterations > 0)
15481 	aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15482     }
15483 
15484   if (num != CONST1_RTX (mode))
15485     {
15486       /* As the approximate reciprocal of DEN is already calculated, only
15487 	 calculate the approximate division when NUM is not 1.0.  */
15488       rtx xnum = force_reg (mode, num);
15489       aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15490     }
15491 
15492   /* Finalize the approximation.  */
15493   aarch64_emit_mult (quo, pg, xrcp, xtmp);
15494   return true;
15495 }
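
/* As with the square root above, this is Newton-Raphson refinement of the
   FRECPE estimate: with x0 = FRECPE (d) ~= 1/d and FRECPS (a, b) = 2 - a * b,
   each pass computes

     x_{n+1} = x_n * FRECPS (x_n, d) = x_n * (2 - d * x_n)

   again roughly doubling the number of correct bits, after which the
   quotient is formed as NUM * (1 / DEN).  */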
15496 
15497 /* Return the number of instructions that can be issued per cycle.  */
15498 static int
15499 aarch64_sched_issue_rate (void)
15500 {
15501   return aarch64_tune_params.issue_rate;
15502 }
15503 
15504 /* Implement TARGET_SCHED_VARIABLE_ISSUE.  */
15505 static int
15506 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15507 {
15508   if (DEBUG_INSN_P (insn))
15509     return more;
15510 
15511   rtx_code code = GET_CODE (PATTERN (insn));
15512   if (code == USE || code == CLOBBER)
15513     return more;
15514 
15515   if (get_attr_type (insn) == TYPE_NO_INSN)
15516     return more;
15517 
15518   return more - 1;
15519 }
15520 
15521 static int
15522 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15523 {
15524   int issue_rate = aarch64_sched_issue_rate ();
15525 
15526   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15527 }
15528 
15529 
15530 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15531    autopref_multipass_dfa_lookahead_guard from haifa-sched.cc.  It only
15532    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
15533 
15534 static int
15535 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15536 						    int ready_index)
15537 {
15538   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15539 }
15540 
15541 
15542 /* Vectorizer cost model target hooks.  */
15543 
15544 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15545    return the decl that should be recorded.  Return null otherwise.  */
15546 tree
15547 aarch64_vector_load_decl (tree addr)
15548 {
15549   if (TREE_CODE (addr) != ADDR_EXPR)
15550     return NULL_TREE;
15551   tree base = get_base_address (TREE_OPERAND (addr, 0));
15552   if (TREE_CODE (base) != VAR_DECL)
15553     return NULL_TREE;
15554   return base;
15555 }
15556 
15557 /* Return true if STMT_INFO accesses a decl that is known to be the
15558    argument to a vld1 in the same function.  */
15559 static bool
15560 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15561 {
15562   if (!cfun->machine->vector_load_decls)
15563     return false;
15564   auto dr = STMT_VINFO_DATA_REF (stmt_info);
15565   if (!dr)
15566     return false;
15567   tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15568   return decl && cfun->machine->vector_load_decls->contains (decl);
15569 }
15570 
15571 /* Information about how the CPU would issue the scalar, Advanced SIMD
15572    or SVE version of a vector loop, using the scheme defined by the
15573    aarch64_base_vec_issue_info hierarchy of structures.  */
15574 class aarch64_vec_op_count
15575 {
15576 public:
15577   aarch64_vec_op_count () = default;
15578   aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15579 			unsigned int = 1);
15580 
15581   unsigned int vec_flags () const { return m_vec_flags; }
15582   unsigned int vf_factor () const { return m_vf_factor; }
15583 
15584   const aarch64_base_vec_issue_info *base_issue_info () const;
15585   const aarch64_simd_vec_issue_info *simd_issue_info () const;
15586   const aarch64_sve_vec_issue_info *sve_issue_info () const;
15587 
15588   fractional_cost rename_cycles_per_iter () const;
15589   fractional_cost min_nonpred_cycles_per_iter () const;
15590   fractional_cost min_pred_cycles_per_iter () const;
15591   fractional_cost min_cycles_per_iter () const;
15592 
15593   void dump () const;
15594 
15595   /* The number of individual "general" operations.  See the comments
15596      in aarch64_base_vec_issue_info for details.  */
15597   unsigned int general_ops = 0;
15598 
15599   /* The number of load and store operations, under the same scheme
15600      as above.  */
15601   unsigned int loads = 0;
15602   unsigned int stores = 0;
15603 
15604   /* The minimum number of cycles needed to execute all loop-carried
15605      operations, which in the vector code become associated with
15606      reductions.  */
15607   unsigned int reduction_latency = 0;
15608 
15609   /* The number of individual predicate operations.  See the comments
15610      in aarch64_sve_vec_issue_info for details.  */
15611   unsigned int pred_ops = 0;
15612 
15613 private:
15614   /* The issue information for the core.  */
15615   const aarch64_vec_issue_info *m_issue_info = nullptr;
15616 
15617   /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15618      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15619        Advanced SIMD code.
15620      - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15621        SVE code.  */
15622   unsigned int m_vec_flags = 0;
15623 
15624   /* Assume that, when the code is executing on the core described
15625      by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15626      times more data than the vectorizer anticipates.
15627 
15628      This is only ever different from 1 for SVE.  It allows us to consider
15629      what would happen on a 256-bit SVE target even when the -mtune
15630      parameters say that the “likely” SVE length is 128 bits.  */
15631   unsigned int m_vf_factor = 1;
15632 };
15633 
15634 aarch64_vec_op_count::
15635 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15636 		      unsigned int vec_flags, unsigned int vf_factor)
15637   : m_issue_info (issue_info),
15638     m_vec_flags (vec_flags),
15639     m_vf_factor (vf_factor)
15640 {
15641 }
15642 
15643 /* Return the base issue information (i.e. the parts that make sense
15644    for both scalar and vector code).  Return null if we have no issue
15645    information.  */
15646 const aarch64_base_vec_issue_info *
15647 aarch64_vec_op_count::base_issue_info () const
15648 {
15649   if (auto *ret = simd_issue_info ())
15650     return ret;
15651   return m_issue_info->scalar;
15652 }
15653 
15654 /* If the structure describes vector code and we have associated issue
15655    information, return that issue information, otherwise return null.  */
15656 const aarch64_simd_vec_issue_info *
15657 aarch64_vec_op_count::simd_issue_info () const
15658 {
15659   if (auto *ret = sve_issue_info ())
15660     return ret;
15661   if (m_vec_flags)
15662     return m_issue_info->advsimd;
15663   return nullptr;
15664 }
15665 
15666 /* If the structure describes SVE code and we have associated issue
15667    information, return that issue information, otherwise return null.  */
15668 const aarch64_sve_vec_issue_info *
15669 aarch64_vec_op_count::sve_issue_info () const
15670 {
15671   if (m_vec_flags & VEC_ANY_SVE)
15672     return m_issue_info->sve;
15673   return nullptr;
15674 }
15675 
15676 /* Estimate the minimum number of cycles per iteration needed to rename
15677    the instructions.
15678 
15679    ??? For now this is done inline rather than via cost tables, since it
15680    isn't clear how it should be parameterized for the general case.  */
15681 fractional_cost
15682 aarch64_vec_op_count::rename_cycles_per_iter () const
15683 {
15684   if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15685       || sve_issue_info () == &neoversen2_sve_issue_info
15686       || sve_issue_info () == &neoversev2_sve_issue_info)
15687     /* + 1 for an addition.  We've already counted a general op for each
15688        store, so we don't need to account for stores separately.  The branch
15689        reads no registers and so does not need to be counted either.
15690 
15691        ??? This value is very much on the pessimistic side, but seems to work
15692        pretty well in practice.  */
15693     return { general_ops + loads + pred_ops + 1, 5 };
15694 
15695   return 0;
15696 }
15697 
15698 /* Like min_cycles_per_iter, but excluding predicate operations.  */
15699 fractional_cost
15700 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15701 {
15702   auto *issue_info = base_issue_info ();
15703 
15704   fractional_cost cycles = MAX (reduction_latency, 1);
15705   cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15706   cycles = std::max (cycles, { loads + stores,
15707 			       issue_info->loads_stores_per_cycle });
15708   cycles = std::max (cycles, { general_ops,
15709 			       issue_info->general_ops_per_cycle });
15710   cycles = std::max (cycles, rename_cycles_per_iter ());
15711   return cycles;
15712 }
15713 
15714 /* Like min_cycles_per_iter, but including only the predicate operations.  */
15715 fractional_cost
15716 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15717 {
15718   if (auto *issue_info = sve_issue_info ())
15719     return { pred_ops, issue_info->pred_ops_per_cycle };
15720   return 0;
15721 }
15722 
15723 /* Estimate the minimum number of cycles needed to issue the operations.
15724    This is a very simplistic model!  */
15725 fractional_cost
15726 aarch64_vec_op_count::min_cycles_per_iter () const
15727 {
15728   return std::max (min_nonpred_cycles_per_iter (),
15729 		   min_pred_cycles_per_iter ());
15730 }
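
/* Purely illustrative example of the model above: a loop body with
   6 general ops, 2 loads, 1 store, a reduction latency of 2 and no
   predicate ops, on a core that can issue 2 general ops, 2 loads/stores
   and 1 store per cycle, would be estimated at

     max (2, 1/1, 3/2, 6/2) = 3 cycles per iteration

   with min_pred_cycles_per_iter contributing nothing; the per-cycle
   issue rates themselves come from the aarch64_vec_issue_info tables.  */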
15731 
15732 /* Dump information about the structure.  */
15733 void
15734 aarch64_vec_op_count::dump () const
15735 {
15736   dump_printf_loc (MSG_NOTE, vect_location,
15737 		   "  load operations = %d\n", loads);
15738   dump_printf_loc (MSG_NOTE, vect_location,
15739 		   "  store operations = %d\n", stores);
15740   dump_printf_loc (MSG_NOTE, vect_location,
15741 		   "  general operations = %d\n", general_ops);
15742   if (sve_issue_info ())
15743     dump_printf_loc (MSG_NOTE, vect_location,
15744 		     "  predicate operations = %d\n", pred_ops);
15745   dump_printf_loc (MSG_NOTE, vect_location,
15746 		   "  reduction latency = %d\n", reduction_latency);
15747   if (auto rcpi = rename_cycles_per_iter ())
15748     dump_printf_loc (MSG_NOTE, vect_location,
15749 		     "  estimated cycles per iteration to rename = %f\n",
15750 		     rcpi.as_double ());
15751   if (auto pred_cpi = min_pred_cycles_per_iter ())
15752     {
15753       dump_printf_loc (MSG_NOTE, vect_location,
15754 		       "  estimated min cycles per iteration"
15755 		       " without predication = %f\n",
15756 		       min_nonpred_cycles_per_iter ().as_double ());
15757       dump_printf_loc (MSG_NOTE, vect_location,
15758 		       "  estimated min cycles per iteration"
15759 		       " for predication = %f\n", pred_cpi.as_double ());
15760     }
15761   if (auto cpi = min_cycles_per_iter ())
15762     dump_printf_loc (MSG_NOTE, vect_location,
15763 		     "  estimated min cycles per iteration = %f\n",
15764 		     cpi.as_double ());
15765 }
15766 
15767 /* Information about vector code that we're in the process of costing.  */
15768 class aarch64_vector_costs : public vector_costs
15769 {
15770 public:
15771   aarch64_vector_costs (vec_info *, bool);
15772 
15773   unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15774 			      stmt_vec_info stmt_info, slp_tree, tree vectype,
15775 			      int misalign,
15776 			      vect_cost_model_location where) override;
15777   void finish_cost (const vector_costs *) override;
15778   bool better_main_loop_than_p (const vector_costs *other) const override;
15779 
15780 private:
15781   void record_potential_advsimd_unrolling (loop_vec_info);
15782   void analyze_loop_vinfo (loop_vec_info);
15783   void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15784 		  aarch64_vec_op_count *);
15785   fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15786 					fractional_cost, unsigned int,
15787 					unsigned int *, bool *);
15788   unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15789 				 unsigned int);
15790   bool prefer_unrolled_loop () const;
15791   unsigned int determine_suggested_unroll_factor ();
15792 
15793   /* True if we have performed one-time initialization based on the
15794      vec_info.  */
15795   bool m_analyzed_vinfo = false;
15796 
15797   /* This loop uses an average operation that is not supported by SVE, but is
15798      supported by Advanced SIMD and SVE2.  */
15799   bool m_has_avg = false;
15800 
15801   /* True if the vector body contains a store to a decl and if the
15802      function is known to have a vld1 from the same decl.
15803 
15804      In the Advanced SIMD ACLE, the recommended endian-agnostic way of
15805      initializing a vector is:
15806 
15807        float f[4] = { elts };
15808        float32x4_t x = vld1q_f32(f);
15809 
15810      We should strongly prefer vectorization of the initialization of f,
15811      so that the store to f and the load back can be optimized away,
15812      leaving a vectorization of { elts }.  */
15813   bool m_stores_to_vector_load_decl = false;
15814 
15815   /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15816      - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15817        SIMD code.
15818      - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code.  */
15819   unsigned int m_vec_flags = 0;
15820 
15821   /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15822      This means that code such as:
15823 
15824 	a[0] = x;
15825 	a[1] = x;
15826 
15827      will be costed as two scalar instructions and two vector instructions
15828      (a scalar_to_vec and an unaligned_store).  For SLP, the vector form
15829      wins if the costs are equal, because of the fact that the vector costs
15830      include constant initializations whereas the scalar costs don't.
15831      We would therefore tend to vectorize the code above, even though
15832      the scalar version can use a single STP.
15833 
15834      We should eventually fix this and model LDP and STP in the main costs;
15835      see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15836      Until then, we look specifically for code that does nothing more than
15837      STP-like operations.  We cost them on that basis in addition to the
15838      normal latency-based costs.
15839 
15840      If the scalar or vector code could be a sequence of STPs +
15841      initialization, this variable counts the cost of the sequence,
15842      with 2 units per instruction.  The variable is ~0U for other
15843      kinds of code.  */
15844   unsigned int m_stp_sequence_cost = 0;
15845 
15846   /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15847      throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE.  In those
15848      situations, we try to predict whether an Advanced SIMD implementation
15849      of the loop could be completely unrolled and become straight-line code.
15850      If so, it is generally better to use the Advanced SIMD version rather
15851      than length-agnostic SVE, since the SVE loop would execute an unknown
15852      number of times and so could not be completely unrolled in the same way.
15853 
15854      If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15855      number of Advanced SIMD loop iterations that would be unrolled and
15856      M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15857      in the unrolled loop.  Both values are zero if we're not applying
15858      the heuristic.  */
15859   unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15860   unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15861 
15862   /* If we're vectorizing a loop that executes a constant number of times,
15863      this variable gives the number of times that the vector loop would
15864      iterate, otherwise it is zero.  */
15865   uint64_t m_num_vector_iterations = 0;
15866 
15867   /* Used only when vectorizing loops.  Estimates the number and kind of
15868      operations that would be needed by one iteration of the scalar
15869      or vector loop.  There is one entry for each tuning option of
15870      interest.  */
15871   auto_vec<aarch64_vec_op_count, 2> m_ops;
15872 };
15873 
15874 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15875 					    bool costing_for_scalar)
15876   : vector_costs (vinfo, costing_for_scalar),
15877     m_vec_flags (costing_for_scalar ? 0
15878 		 : aarch64_classify_vector_mode (vinfo->vector_mode))
15879 {
15880   if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15881     {
15882       m_ops.quick_push ({ issue_info, m_vec_flags });
15883       if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15884 	{
15885 	  unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15886 	  m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15887 			      vf_factor });
15888 	}
15889     }
15890 }
15891 
15892 /* Implement TARGET_VECTORIZE_CREATE_COSTS.  */
15893 vector_costs *
15894 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15895 {
15896   return new aarch64_vector_costs (vinfo, costing_for_scalar);
15897 }
15898 
15899 /* Return true if the current CPU should use the new costs defined
15900    in GCC 11.  This should be removed for GCC 12 and above, with the
15901    costs applying to all CPUs instead.  */
15902 static bool
15903 aarch64_use_new_vector_costs_p ()
15904 {
15905   return (aarch64_tune_params.extra_tuning_flags
15906 	  & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
15907 }
15908 
15909 /* Return the appropriate SIMD costs for vectors of type VECTYPE.  */
15910 static const simd_vec_cost *
15911 aarch64_simd_vec_costs (tree vectype)
15912 {
15913   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15914   if (vectype != NULL
15915       && aarch64_sve_mode_p (TYPE_MODE (vectype))
15916       && costs->sve != NULL)
15917     return costs->sve;
15918   return costs->advsimd;
15919 }
15920 
15921 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS.  */
15922 static const simd_vec_cost *
15923 aarch64_simd_vec_costs_for_flags (unsigned int flags)
15924 {
15925   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
15926   if ((flags & VEC_ANY_SVE) && costs->sve)
15927     return costs->sve;
15928   return costs->advsimd;
15929 }
15930 
15931 /* If STMT_INFO is a memory reference, return the scalar memory type,
15932    otherwise return null.  */
15933 static tree
15934 aarch64_dr_type (stmt_vec_info stmt_info)
15935 {
15936   if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
15937     return TREE_TYPE (DR_REF (dr));
15938   return NULL_TREE;
15939 }
15940 
15941 /* Decide whether to use the unrolling heuristic described above
15942    m_unrolled_advsimd_niters, updating that field if so.  LOOP_VINFO
15943    describes the loop that we're vectorizing.  */
15944 void
15945 aarch64_vector_costs::
15946 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
15947 {
15948   /* The heuristic only makes sense on targets that have the same
15949      vector throughput for SVE and Advanced SIMD.  */
15950   if (!(aarch64_tune_params.extra_tuning_flags
15951 	& AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
15952     return;
15953 
15954   /* We only want to apply the heuristic if LOOP_VINFO is being
15955      vectorized for SVE.  */
15956   if (!(m_vec_flags & VEC_ANY_SVE))
15957     return;
15958 
15959   /* Check whether it is possible in principle to use Advanced SIMD
15960      instead.  */
15961   if (aarch64_autovec_preference == 2)
15962     return;
15963 
15964   /* We don't want to apply the heuristic to outer loops, since it's
15965      harder to track two levels of unrolling.  */
15966   if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
15967     return;
15968 
15969   /* Only handle cases in which the number of Advanced SIMD iterations
15970      would be known at compile time but the number of SVE iterations
15971      would not.  */
15972   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
15973       || aarch64_sve_vg.is_constant ())
15974     return;
15975 
15976   /* Guess how many times the Advanced SIMD loop would iterate and make
15977      sure that it is within the complete unrolling limit.  Even if the
15978      number of iterations is small enough, the number of statements might
15979      not be, which is why we need to estimate the number of statements too.  */
15980   unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15981   unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
15982   unsigned HOST_WIDE_INT unrolled_advsimd_niters
15983     = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
15984   if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
15985     return;
15986 
15987   /* Record that we're applying the heuristic and should try to estimate
15988      the number of statements in the Advanced SIMD loop.  */
15989   m_unrolled_advsimd_niters = unrolled_advsimd_niters;
15990 }
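
/* For instance (illustrative figures only): with 32-bit elements and an
   estimated SVE vector length of 256 bits (VQ of 2), an estimated SVE VF
   of 8 corresponds to an Advanced SIMD VF of CEIL (8, 2) = 4, so a scalar
   trip count of 64 would unroll to 16 Advanced SIMD iterations, which is
   then checked against param_max_completely_peel_times above.  */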
15991 
15992 /* Do one-time initialization of the aarch64_vector_costs given that we're
15993    costing the loop vectorization described by LOOP_VINFO.  */
15994 void
15995 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
15996 {
15997   /* Record the number of times that the vector loop would execute,
15998      if known.  */
15999   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16000   auto scalar_niters = max_stmt_executions_int (loop);
16001   if (scalar_niters >= 0)
16002     {
16003       unsigned int vf = vect_vf_for_cost (loop_vinfo);
16004       if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16005 	m_num_vector_iterations = scalar_niters / vf;
16006       else
16007 	m_num_vector_iterations = CEIL (scalar_niters, vf);
16008     }
16009 
16010   /* Detect whether we're vectorizing for SVE and should apply the unrolling
16011      heuristic described above m_unrolled_advsimd_niters.  */
16012   record_potential_advsimd_unrolling (loop_vinfo);
16013 
16014   /* Record the issue information for any SVE WHILE instructions that the
16015      loop needs.  */
16016   if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16017     {
16018       unsigned int num_masks = 0;
16019       rgroup_controls *rgm;
16020       unsigned int num_vectors_m1;
16021       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16022 	if (rgm->type)
16023 	  num_masks += num_vectors_m1 + 1;
16024       for (auto &ops : m_ops)
16025 	if (auto *issue = ops.sve_issue_info ())
16026 	  ops.pred_ops += num_masks * issue->while_pred_ops;
16027     }
16028 }
16029 
16030 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
16031 static int
16032 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16033 				    tree vectype,
16034 				    int misalign ATTRIBUTE_UNUSED)
16035 {
16036   unsigned elements;
16037   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16038   bool fp = false;
16039 
16040   if (vectype != NULL)
16041     fp = FLOAT_TYPE_P (vectype);
16042 
16043   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16044 
16045   switch (type_of_cost)
16046     {
16047       case scalar_stmt:
16048 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16049 
16050       case scalar_load:
16051 	return costs->scalar_load_cost;
16052 
16053       case scalar_store:
16054 	return costs->scalar_store_cost;
16055 
16056       case vector_stmt:
16057 	return fp ? simd_costs->fp_stmt_cost
16058 		  : simd_costs->int_stmt_cost;
16059 
16060       case vector_load:
16061 	return simd_costs->align_load_cost;
16062 
16063       case vector_store:
16064 	return simd_costs->store_cost;
16065 
16066       case vec_to_scalar:
16067 	return simd_costs->vec_to_scalar_cost;
16068 
16069       case scalar_to_vec:
16070 	return simd_costs->scalar_to_vec_cost;
16071 
16072       case unaligned_load:
16073       case vector_gather_load:
16074 	return simd_costs->unalign_load_cost;
16075 
16076       case unaligned_store:
16077       case vector_scatter_store:
16078 	return simd_costs->unalign_store_cost;
16079 
16080       case cond_branch_taken:
16081 	return costs->cond_taken_branch_cost;
16082 
16083       case cond_branch_not_taken:
16084 	return costs->cond_not_taken_branch_cost;
16085 
16086       case vec_perm:
16087 	return simd_costs->permute_cost;
16088 
16089       case vec_promote_demote:
16090 	return fp ? simd_costs->fp_stmt_cost
16091 		  : simd_costs->int_stmt_cost;
16092 
16093       case vec_construct:
16094 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16095 	return elements / 2 + 1;
16096 
16097       default:
16098 	gcc_unreachable ();
16099     }
16100 }
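
/* Under the scheme above, a vec_construct of a V4SI vector from four
   scalars is costed at 4 / 2 + 1 = 3 and a V2DI construct at 2 / 2 + 1 = 2;
   every other entry is read directly from the tuning tables.  */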
16101 
16102 /* If an access of kind KIND for STMT_INFO represents one vector of an
16103    LD[234] or ST[234] operation, return the total number of vectors
16104    (2, 3 or 4), otherwise return a value outside that range.  */
16105 static int
16106 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16107 {
16108   if ((kind == vector_load
16109        || kind == unaligned_load
16110        || kind == vector_store
16111        || kind == unaligned_store)
16112       && STMT_VINFO_DATA_REF (stmt_info))
16113     {
16114       stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16115       if (stmt_info
16116 	  && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16117 	return DR_GROUP_SIZE (stmt_info);
16118     }
16119   return 0;
16120 }
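
/* A typical case that reaches the LD3 path above is a loop over an
   interleaved structure, along the lines of (illustrative sketch):

     struct rgb { float r, g, b; };
     for (int i = 0; i < n; ++i)
       sum += p[i].r + p[i].g + p[i].b;

   When the group is vectorized with VMAT_LOAD_STORE_LANES, each of the
   three vector loads costed for it has DR_GROUP_SIZE == 3 and so returns
   3 here.  */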
16121 
16122 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16123    vectors would produce a series of LDP or STP operations.  KIND is the
16124    kind of statement that STMT_INFO represents.  */
16125 static bool
16126 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16127 			   stmt_vec_info stmt_info)
16128 {
16129   switch (kind)
16130     {
16131     case vector_load:
16132     case vector_store:
16133     case unaligned_load:
16134     case unaligned_store:
16135       break;
16136 
16137     default:
16138       return false;
16139     }
16140 
16141   if (aarch64_tune_params.extra_tuning_flags
16142       & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16143     return false;
16144 
16145   return is_gimple_assign (stmt_info->stmt);
16146 }
16147 
16148 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16149    or multiply-subtract sequence that might be suitable for fusing into a
16150    single instruction.  If VEC_FLAGS is zero, analyze the operation as
16151    a scalar one, otherwise analyze it as an operation on vectors with those
16152    VEC_* flags.  */
16153 static bool
16154 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16155 			unsigned int vec_flags)
16156 {
16157   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16158   if (!assign)
16159     return false;
16160   tree_code code = gimple_assign_rhs_code (assign);
16161   if (code != PLUS_EXPR && code != MINUS_EXPR)
16162     return false;
16163 
16164   if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16165       || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16166     return false;
16167 
16168   for (int i = 1; i < 3; ++i)
16169     {
16170       tree rhs = gimple_op (assign, i);
16171       /* ??? Should we try to check for a single use as well?  */
16172       if (TREE_CODE (rhs) != SSA_NAME)
16173 	continue;
16174 
16175       stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16176       if (!def_stmt_info
16177 	  || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16178 	continue;
16179       gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16180       if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16181 	continue;
16182 
16183       if (vec_flags & VEC_ADVSIMD)
16184 	{
16185 	  /* Scalar and SVE code can tie the result to any FMLA input (or none,
16186 	     although that requires a MOVPRFX for SVE).  However, Advanced SIMD
16187 	     only supports MLA forms, so will require a move if the result
16188 	     cannot be tied to the accumulator.  The most important case in
16189 	     which this is true is when the accumulator input is invariant.  */
16190 	  rhs = gimple_op (assign, 3 - i);
16191 	  if (TREE_CODE (rhs) != SSA_NAME)
16192 	    return false;
16193 	  def_stmt_info = vinfo->lookup_def (rhs);
16194 	  if (!def_stmt_info
16195 	      || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16196 	    return false;
16197 	}
16198 
16199       return true;
16200     }
16201   return false;
16202 }
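
/* The shape of gimple being matched above is roughly (illustrative):

     _1 = b_2 * c_3;
     a_4 = acc_5 + _1;

   where STMT_INFO is the PLUS_EXPR/MINUS_EXPR statement.  Scalar and SVE
   code can fuse this into an FMLA/FMAD-style operation regardless of
   which input the result is tied to, whereas the Advanced SIMD check
   above rejects accumulators with external (invariant) definitions,
   since the MLA forms would then need an extra move.  */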
16203 
16204 /* We are considering implementing STMT_INFO using SVE.  If STMT_INFO is an
16205    in-loop reduction that SVE supports directly, return its latency in cycles,
16206    otherwise return zero.  SVE_COSTS specifies the latencies of the relevant
16207    instructions.  */
16208 static unsigned int
16209 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16210 				       stmt_vec_info stmt_info,
16211 				       const sve_vec_cost *sve_costs)
16212 {
16213   switch (vect_reduc_type (vinfo, stmt_info))
16214     {
16215     case EXTRACT_LAST_REDUCTION:
16216       return sve_costs->clast_cost;
16217 
16218     case FOLD_LEFT_REDUCTION:
16219       switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16220 	{
16221 	case E_HFmode:
16222 	case E_BFmode:
16223 	  return sve_costs->fadda_f16_cost;
16224 
16225 	case E_SFmode:
16226 	  return sve_costs->fadda_f32_cost;
16227 
16228 	case E_DFmode:
16229 	  return sve_costs->fadda_f64_cost;
16230 
16231 	default:
16232 	  break;
16233 	}
16234       break;
16235     }
16236 
16237   return 0;
16238 }
16239 
16240 /* STMT_INFO describes a loop-carried operation in the original scalar code
16241    that we are considering implementing as a reduction.  Return one of the
16242    following values, depending on VEC_FLAGS:
16243 
16244    - If VEC_FLAGS is zero, return the loop carry latency of the original
16245      scalar operation.
16246 
16247    - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16248      Advanced SIMD implementation.
16249 
16250    - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16251      SVE implementation.  */
16252 static unsigned int
16253 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16254 				   unsigned int vec_flags)
16255 {
16256   const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16257   const sve_vec_cost *sve_costs = nullptr;
16258   if (vec_flags & VEC_ANY_SVE)
16259     sve_costs = aarch64_tune_params.vec_costs->sve;
16260 
16261   /* If the caller is asking for the SVE latency, check for forms of reduction
16262      that only SVE can handle directly.  */
16263   if (sve_costs)
16264     {
16265       unsigned int latency
16266 	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16267       if (latency)
16268 	return latency;
16269     }
16270 
16271   /* Handle scalar costs.  */
16272   bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16273   if (vec_flags == 0)
16274     {
16275       if (is_float)
16276 	return vec_costs->scalar_fp_stmt_cost;
16277       return vec_costs->scalar_int_stmt_cost;
16278     }
16279 
16280   /* Otherwise, the loop body just contains normal integer or FP operations,
16281      with a vector reduction outside the loop.  */
16282   const simd_vec_cost *simd_costs
16283     = aarch64_simd_vec_costs_for_flags (vec_flags);
16284   if (is_float)
16285     return simd_costs->fp_stmt_cost;
16286   return simd_costs->int_stmt_cost;
16287 }
16288 
16289 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16290    for STMT_INFO, which has cost kind KIND.  If this is a scalar operation,
16291    try to subdivide the target-independent categorization provided by KIND
16292    to get a more accurate cost.  */
16293 static fractional_cost
16294 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16295 				    stmt_vec_info stmt_info,
16296 				    fractional_cost stmt_cost)
16297 {
16298   /* Detect an extension of a loaded value.  In general, we'll be able to fuse
16299      the extension with the load.  */
16300   if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16301     return 0;
16302 
16303   return stmt_cost;
16304 }
16305 
16306 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16307    for the vectorized form of STMT_INFO, which has cost kind KIND and which
16308    when vectorized would operate on vector type VECTYPE.  Try to subdivide
16309    the target-independent categorization provided by KIND to get a more
16310    accurate cost.  WHERE specifies where the cost associated with KIND
16311    occurs.  */
16312 static fractional_cost
16313 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16314 				    stmt_vec_info stmt_info, tree vectype,
16315 				    enum vect_cost_model_location where,
16316 				    fractional_cost stmt_cost)
16317 {
16318   const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16319   const sve_vec_cost *sve_costs = nullptr;
16320   if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16321     sve_costs = aarch64_tune_params.vec_costs->sve;
16322 
16323   /* It's generally better to avoid costing inductions, since the induction
16324      will usually be hidden by other operations.  This is particularly true
16325      for things like COND_REDUCTIONS.  */
16326   if (is_a<gphi *> (stmt_info->stmt))
16327     return 0;
16328 
16329   /* Detect cases in which vec_to_scalar is describing the extraction of a
16330      vector element in preparation for a scalar store.  The store itself is
16331      costed separately.  */
16332   if (vect_is_store_elt_extraction (kind, stmt_info))
16333     return simd_costs->store_elt_extra_cost;
16334 
16335   /* Detect SVE gather loads, which are costed as a single scalar_load
16336      for each element.  We therefore need to divide the full-instruction
16337      cost by the number of elements in the vector.  */
16338   if (kind == scalar_load
16339       && sve_costs
16340       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16341     {
16342       unsigned int nunits = vect_nunits_for_cost (vectype);
16343       if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16344 	return { sve_costs->gather_load_x64_cost, nunits };
16345       return { sve_costs->gather_load_x32_cost, nunits };
16346     }
16347 
16348   /* Detect cases in which a scalar_store is really storing one element
16349      in a scatter operation.  */
16350   if (kind == scalar_store
16351       && sve_costs
16352       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16353     return sve_costs->scatter_store_elt_cost;
16354 
16355   /* Detect cases in which vec_to_scalar represents an in-loop reduction.  */
16356   if (kind == vec_to_scalar
16357       && where == vect_body
16358       && sve_costs)
16359     {
16360       unsigned int latency
16361 	= aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16362       if (latency)
16363 	return latency;
16364     }
16365 
16366   /* Detect cases in which vec_to_scalar represents a single reduction
16367      instruction like FADDP or MAXV.  */
16368   if (kind == vec_to_scalar
16369       && where == vect_epilogue
16370       && vect_is_reduction (stmt_info))
16371     switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16372       {
16373       case E_QImode:
16374 	return simd_costs->reduc_i8_cost;
16375 
16376       case E_HImode:
16377 	return simd_costs->reduc_i16_cost;
16378 
16379       case E_SImode:
16380 	return simd_costs->reduc_i32_cost;
16381 
16382       case E_DImode:
16383 	return simd_costs->reduc_i64_cost;
16384 
16385       case E_HFmode:
16386       case E_BFmode:
16387 	return simd_costs->reduc_f16_cost;
16388 
16389       case E_SFmode:
16390 	return simd_costs->reduc_f32_cost;
16391 
16392       case E_DFmode:
16393 	return simd_costs->reduc_f64_cost;
16394 
16395       default:
16396 	break;
16397       }
16398 
16399   /* Otherwise stick with the original categorization.  */
16400   return stmt_cost;
16401 }
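
/* Illustration of the gather rule above: if an SVE gather of 32-bit
   elements has a full-instruction cost of, say, 12 and the vector is
   estimated to hold 8 elements, each of the 8 scalar_load entries the
   vectorizer emits for it is costed at 12/8, so the total adds back up
   to the cost of one gather.  */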
16402 
16403 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16404    for STMT_INFO, which has cost kind KIND and which when vectorized would
16405    operate on vector type VECTYPE.  Adjust the cost as necessary for SVE
16406    targets.  */
16407 static fractional_cost
16408 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16409 			      stmt_vec_info stmt_info, tree vectype,
16410 			      fractional_cost stmt_cost)
16411 {
16412   /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16413      vector register size or number of units.  Integer promotions of this
16414      type therefore map to SXT[BHW] or UXT[BHW].
16415 
16416      Most loads have extending forms that can do the sign or zero extension
16417      on the fly.  Optimistically assume that a load followed by an extension
16418      will fold to this form during combine, and that the extension therefore
16419      comes for free.  */
16420   if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16421     stmt_cost = 0;
16422 
16423   /* For similar reasons, vector_stmt integer truncations are a no-op,
16424      because we can just ignore the unused upper bits of the source.  */
16425   if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16426     stmt_cost = 0;
16427 
16428   /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16429      but there are no equivalent instructions for SVE.  This means that
16430      (all other things being equal) 128-bit SVE needs twice as many load
16431      and store instructions as Advanced SIMD in order to process vector pairs.
16432 
16433      Also, scalar code can often use LDP and STP to access pairs of values,
16434      so it is too simplistic to say that one SVE load or store replaces
16435      VF scalar loads and stores.
16436 
16437      Ideally we would account for this in the scalar and Advanced SIMD
16438      costs by making suitable load/store pairs as cheap as a single
16439      load/store.  However, that would be a very invasive change and in
16440      practice it tends to stress other parts of the cost model too much.
16441      E.g. stores of scalar constants currently count just a store,
16442      whereas stores of vector constants count a store and a vec_init.
16443      This is an artificial distinction for AArch64, where stores of
16444      nonzero scalar constants need the same kind of register invariant
16445      as vector stores.
16446 
16447      An alternative would be to double the cost of any SVE loads and stores
16448      that could be paired in Advanced SIMD (and possibly also paired in
16449      scalar code).  But this tends to stress other parts of the cost model
16450      in the same way.  It also means that we can fall back to Advanced SIMD
16451      even if full-loop predication would have been useful.
16452 
16453      Here we go for a more conservative version: double the costs of SVE
16454      loads and stores if one iteration of the scalar loop processes enough
16455      elements for it to use a whole number of Advanced SIMD LDP or STP
16456      instructions.  This makes it very likely that the VF would be 1 for
16457      Advanced SIMD, and so no epilogue should be needed.  */
16458   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16459     {
16460       stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16461       unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16462       unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16463       if (multiple_p (count * elt_bits, 256)
16464 	  && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16465 	stmt_cost *= 2;
16466     }
16467 
16468   return stmt_cost;
16469 }
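
/* For example, a grouped access that covers four 64-bit elements per
   scalar iteration spans 4 * 64 = 256 bits, i.e. exactly one Advanced
   SIMD LDP/STP of Q registers, so the check above doubles the cost of
   the corresponding SVE loads or stores.  */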
16470 
16471 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16472    and which when vectorized would operate on vector type VECTYPE.  Add the
16473    cost of any embedded operations.  */
16474 static fractional_cost
16475 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16476 			  tree vectype, fractional_cost stmt_cost)
16477 {
16478   if (vectype)
16479     {
16480       const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16481 
16482       /* Detect cases in which a vector load or store represents an
16483 	 LD[234] or ST[234] instruction.  */
16484       switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16485 	{
16486 	case 2:
16487 	  stmt_cost += simd_costs->ld2_st2_permute_cost;
16488 	  break;
16489 
16490 	case 3:
16491 	  stmt_cost += simd_costs->ld3_st3_permute_cost;
16492 	  break;
16493 
16494 	case 4:
16495 	  stmt_cost += simd_costs->ld4_st4_permute_cost;
16496 	  break;
16497 	}
16498 
16499       if (kind == vector_stmt || kind == vec_to_scalar)
16500 	if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16501 	  {
16502 	    if (FLOAT_TYPE_P (cmp_type))
16503 	      stmt_cost += simd_costs->fp_stmt_cost;
16504 	    else
16505 	      stmt_cost += simd_costs->int_stmt_cost;
16506 	  }
16507     }
16508 
16509   if (kind == scalar_stmt)
16510     if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16511       {
16512 	if (FLOAT_TYPE_P (cmp_type))
16513 	  stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16514 	else
16515 	  stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16516       }
16517 
16518   return stmt_cost;
16519 }
16520 
16521 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16522    and they describe an operation in the body of a vector loop.  Record issue
16523    information relating to the vector operation in OPS.  */
16524 void
16525 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16526 				 stmt_vec_info stmt_info,
16527 				 aarch64_vec_op_count *ops)
16528 {
16529   const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16530   if (!base_issue)
16531     return;
16532   const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16533   const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16534 
16535   /* Calculate the minimum cycles per iteration imposed by a reduction
16536      operation.  */
16537   if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16538       && vect_is_reduction (stmt_info))
16539     {
16540       unsigned int base
16541 	= aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16542 
16543       /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16544 	 that's not yet the case.  */
16545       ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16546     }
16547 
16548   /* Assume that multiply-adds will become a single operation.  */
16549   if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16550     return;
16551 
16552   /* Count the basic operation cost associated with KIND.  */
16553   switch (kind)
16554     {
16555     case cond_branch_taken:
16556     case cond_branch_not_taken:
16557     case vector_gather_load:
16558     case vector_scatter_store:
16559       /* We currently don't expect these to be used in a loop body.  */
16560       break;
16561 
16562     case vec_perm:
16563     case vec_promote_demote:
16564     case vec_construct:
16565     case vec_to_scalar:
16566     case scalar_to_vec:
16567     case vector_stmt:
16568     case scalar_stmt:
16569       ops->general_ops += count;
16570       break;
16571 
16572     case scalar_load:
16573     case vector_load:
16574     case unaligned_load:
16575       ops->loads += count;
16576       if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16577 	ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16578       break;
16579 
16580     case vector_store:
16581     case unaligned_store:
16582     case scalar_store:
16583       ops->stores += count;
16584       if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16585 	ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16586       break;
16587     }
16588 
16589   /* Add any embedded comparison operations.  */
16590   if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16591       && vect_embedded_comparison_type (stmt_info))
16592     ops->general_ops += count;
16593 
16594   /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16595      have only accounted for one.  */
16596   if ((kind == vector_stmt || kind == vec_to_scalar)
16597       && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16598     ops->general_ops += count;
16599 
16600   /* Count the predicate operations needed by an SVE comparison.  */
16601   if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16602     if (tree type = vect_comparison_type (stmt_info))
16603       {
16604 	unsigned int base = (FLOAT_TYPE_P (type)
16605 			     ? sve_issue->fp_cmp_pred_ops
16606 			     : sve_issue->int_cmp_pred_ops);
16607 	ops->pred_ops += base * count;
16608       }
16609 
16610   /* Add any extra overhead associated with LD[234] and ST[234] operations.  */
16611   if (simd_issue)
16612     switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16613       {
16614       case 2:
16615 	ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16616 	break;
16617 
16618       case 3:
16619 	ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16620 	break;
16621 
16622       case 4:
16623 	ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16624 	break;
16625       }
16626 
16627   /* Add any overhead associated with gather loads and scatter stores.  */
16628   if (sve_issue
16629       && (kind == scalar_load || kind == scalar_store)
16630       && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16631     {
16632       unsigned int pairs = CEIL (count, 2);
16633       ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16634       ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
16635     }
16636 }
16637 
16638 /* Return true if STMT_INFO contains a memory access and if the constant
16639    component of the memory address is aligned to SIZE bytes.  */
16640 static bool
16641 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16642 				   poly_uint64 size)
16643 {
16644   if (!STMT_VINFO_DATA_REF (stmt_info))
16645     return false;
16646 
16647   if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16648     stmt_info = first_stmt;
16649   tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16650   /* Needed for gathers & scatters, for example.  */
16651   if (!constant_offset)
16652     return false;
16653 
16654   return multiple_p (wi::to_poly_offset (constant_offset), size);
16655 }
16656 
16657 /* Check if a scalar or vector stmt could be part of a region of code
16658    that does nothing more than store values to memory, in the scalar
16659    case using STP.  Return the cost of the stmt if so, counting 2 for
16660    one instruction.  Return ~0U otherwise.
16661 
16662    The arguments are a subset of those passed to add_stmt_cost.  */
16663 unsigned int
16664 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16665 			   stmt_vec_info stmt_info, tree vectype)
16666 {
16667   /* Code that stores vector constants uses a vector_load to create
16668      the constant.  We don't apply the heuristic to that case for two
16669      main reasons:
16670 
16671      - At the moment, STPs are only formed via peephole2, and the
16672        constant scalar moves would often come between STRs and so
16673        prevent STP formation.
16674 
16675      - The scalar code also has to load the constant somehow, and that
16676        isn't costed.  */
16677   switch (kind)
16678     {
16679     case scalar_to_vec:
16680       /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup.  */
16681       return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16682 
16683     case vec_construct:
16684       if (FLOAT_TYPE_P (vectype))
16685 	/* Count 1 insn for the maximum number of FP->SIMD INS
16686 	   instructions.  */
16687 	return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16688 
16689       /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16690 	 maximum number of GPR->SIMD INS instructions.  */
16691       return vect_nunits_for_cost (vectype) * 4 * count;
16692 
16693     case vector_store:
16694     case unaligned_store:
16695       /* Count 1 insn per vector if we can't form STP Q pairs.  */
16696       if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16697 	return count * 2;
16698       if (aarch64_tune_params.extra_tuning_flags
16699 	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16700 	return count * 2;
16701 
16702       if (stmt_info)
16703 	{
16704 	  /* Assume we won't be able to use STP if the constant offset
16705 	     component of the address is misaligned.  ??? This could be
16706 	     removed if we formed STP pairs earlier, rather than relying
16707 	     on peephole2.  */
16708 	  auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16709 	  if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16710 	    return count * 2;
16711 	}
16712       return CEIL (count, 2) * 2;
16713 
16714     case scalar_store:
16715       if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16716 	{
16717 	  /* Check for a mode in which STP pairs can be formed.  */
16718 	  auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16719 	  if (maybe_ne (size, 4) && maybe_ne (size, 8))
16720 	    return ~0U;
16721 
16722 	  /* Assume we won't be able to use STP if the constant offset
16723 	     component of the address is misaligned.  ??? This could be
16724 	     removed if we formed STP pairs earlier, rather than relying
16725 	     on peephole2.  */
16726 	  if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16727 	    return ~0U;
16728 	}
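      /* Editor note: each scalar store is charged 1 unit (half an
	 instruction), on the assumption that consecutive STRs of these
	 modes can be combined into STPs.  */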
16729       return count;
16730 
16731     default:
16732       return ~0U;
16733     }
16734 }
16735 
16736 unsigned
16737 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16738 				     stmt_vec_info stmt_info, slp_tree,
16739 				     tree vectype, int misalign,
16740 				     vect_cost_model_location where)
16741 {
16742   fractional_cost stmt_cost
16743     = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16744 
16745   bool in_inner_loop_p = (where == vect_body
16746 			  && stmt_info
16747 			  && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16748 
16749   /* Do one-time initialization based on the vinfo.  */
16750   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16751   if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16752     {
16753       if (loop_vinfo)
16754 	analyze_loop_vinfo (loop_vinfo);
16755 
16756       m_analyzed_vinfo = true;
16757     }
16758 
16759   /* Apply the heuristic described above m_stp_sequence_cost.  */
16760   if (m_stp_sequence_cost != ~0U)
16761     {
16762       uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16763 						 stmt_info, vectype);
16764       m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16765     }
16766 
16767   /* Try to get a more accurate cost by looking at STMT_INFO instead
16768      of just looking at KIND.  */
16769   if (stmt_info && aarch64_use_new_vector_costs_p ())
16770     {
16771       /* If we scalarize a strided store, the vectorizer costs one
16772 	 vec_to_scalar for each element.  However, we can store the first
16773 	 element using an FP store without a separate extract step.  */
16774       if (vect_is_store_elt_extraction (kind, stmt_info))
16775 	count -= 1;
16776 
16777       stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16778 						      stmt_info, stmt_cost);
16779 
16780       if (vectype && m_vec_flags)
16781 	stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16782 							stmt_info, vectype,
16783 							where, stmt_cost);
16784     }
16785 
16786   /* Do any SVE-specific adjustments to the cost.  */
16787   if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16788     stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16789 					      vectype, stmt_cost);
16790 
16791   if (stmt_info && aarch64_use_new_vector_costs_p ())
16792     {
16793       /* Account for any extra "embedded" costs that apply additively
16794 	 to the base cost calculated above.  */
16795       stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16796 					    stmt_cost);
16797 
16798       /* If we're recording a nonzero vector loop body cost for the
16799 	 innermost loop, also estimate the operations that would need
16800 	 to be issued by all relevant implementations of the loop.  */
16801       if (loop_vinfo
16802 	  && (m_costing_for_scalar || where == vect_body)
16803 	  && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16804 	  && stmt_cost != 0)
16805 	for (auto &ops : m_ops)
16806 	  count_ops (count, kind, stmt_info, &ops);
16807 
16808       /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16809 	 estimate the number of statements in the unrolled Advanced SIMD
16810 	 loop.  For simplicity, we assume that one iteration of the
16811 	 Advanced SIMD loop would need the same number of statements
16812 	 as one iteration of the SVE loop.  */
16813       if (where == vect_body && m_unrolled_advsimd_niters)
16814 	m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16815 
16816       /* Detect the use of an averaging operation.  */
16817       gimple *stmt = stmt_info->stmt;
16818       if (is_gimple_call (stmt)
16819 	  && gimple_call_internal_p (stmt))
16820 	{
16821 	  switch (gimple_call_internal_fn (stmt))
16822 	    {
16823 	    case IFN_AVG_FLOOR:
16824 	    case IFN_AVG_CEIL:
16825 	      m_has_avg = true;
16826 	    default:
16827 	      break;
16828 	    }
16829 	}
16830     }
16831 
16832   /* If the statement stores to a decl that is known to be the argument
16833      to a vld1 in the same function, ignore the store for costing purposes.
16834      See the comment above m_stores_to_vector_load_decl for more details.  */
16835   if (stmt_info
16836       && (kind == vector_store || kind == unaligned_store)
16837       && aarch64_accesses_vector_load_decl_p (stmt_info))
16838     {
16839       stmt_cost = 0;
16840       m_stores_to_vector_load_decl = true;
16841     }
16842 
16843   return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16844 }
16845 
16846 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16847    heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16848    says that we should prefer the Advanced SIMD loop.  */
16849 bool
16850 aarch64_vector_costs::prefer_unrolled_loop () const
16851 {
16852   if (!m_unrolled_advsimd_stmts)
16853     return false;
16854 
16855   if (dump_enabled_p ())
16856     dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16857 		     " unrolled Advanced SIMD loop = "
16858 		     HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16859 		     m_unrolled_advsimd_stmts);
16860 
16861   /* The balance here is tricky.  On the one hand, we can't be sure whether
16862      the code is vectorizable with Advanced SIMD or not.  However, even if
16863      it isn't vectorizable with Advanced SIMD, there's a possibility that
16864      the scalar code could also be unrolled.  Some of the code might then
16865      benefit from SLP, or from using LDP and STP.  We therefore apply
16866      the heuristic regardless of can_use_advsimd_p.  */
16867   return (m_unrolled_advsimd_stmts
16868 	  && (m_unrolled_advsimd_stmts
16869 	      <= (unsigned int) param_max_completely_peeled_insns));
16870 }
16871 
16872 /* Subroutine of adjust_body_cost for handling SVE.  Use ISSUE_INFO to work out
16873    how fast the SVE code can be issued and compare it to the equivalent value
16874    for scalar code (SCALAR_CYCLES_PER_ITER).  If COULD_USE_ADVSIMD is true,
16875    also compare it to the issue rate of Advanced SIMD code
16876    (ADVSIMD_CYCLES_PER_ITER).
16877 
16878    ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16879    *BODY_COST is the current value of the adjusted cost.  *SHOULD_DISPARAGE
16880    is true if we think the loop body is too expensive.  */
16881 
16882 fractional_cost
16883 aarch64_vector_costs::
16884 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16885 		      fractional_cost scalar_cycles_per_iter,
16886 		      unsigned int orig_body_cost, unsigned int *body_cost,
16887 		      bool *should_disparage)
16888 {
16889   if (dump_enabled_p ())
16890     ops->dump ();
16891 
16892   fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16893   fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16894 
16895   /* If the scalar version of the loop could issue at least as
16896      quickly as the predicate parts of the SVE loop, make the SVE loop
16897      prohibitively expensive.  In this case vectorization is adding an
16898      overhead that the original scalar code didn't have.
16899 
16900      This is mostly intended to detect cases in which WHILELOs dominate
16901      for very tight loops, which is something that normal latency-based
16902      costs would not model.  Adding this kind of cliffedge would be
16903      too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
16904      code in the caller handles that case in a more conservative way.  */
16905   fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
16906   if (scalar_cycles_per_iter < sve_estimate)
16907     {
16908       unsigned int min_cost
16909 	= orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
16910       if (*body_cost < min_cost)
16911 	{
16912 	  if (dump_enabled_p ())
16913 	    dump_printf_loc (MSG_NOTE, vect_location,
16914 			     "Increasing body cost to %d because the"
16915 			     " scalar code could issue within the limit"
16916 			     " imposed by predicate operations\n",
16917 			     min_cost);
16918 	  *body_cost = min_cost;
16919 	  *should_disparage = true;
16920 	}
16921     }
16922 
16923   return sve_cycles_per_iter;
16924 }
16925 
16926 unsigned int
16927 aarch64_vector_costs::determine_suggested_unroll_factor ()
16928 {
16929   bool sve = m_vec_flags & VEC_ANY_SVE;
16930   /* If we are trying to unroll an Advanced SIMD main loop that contains
16931      an averaging operation that we do not support with SVE and we might use a
16932      predicated epilogue, we need to be conservative and block unrolling as
16933      this might lead to a less optimal loop for the first and only epilogue
16934      using the original loop's vectorization factor.
16935      TODO: Remove this constraint when we add support for multiple epilogue
16936      vectorization.  */
16937   if (!sve && !TARGET_SVE2 && m_has_avg)
16938     return 1;
16939 
16940   unsigned int max_unroll_factor = 1;
16941   for (auto vec_ops : m_ops)
16942     {
16943       aarch64_simd_vec_issue_info const *vec_issue
16944 	= vec_ops.simd_issue_info ();
16945       if (!vec_issue)
16946 	return 1;
16947       /* Limit unroll factor to a value adjustable by the user, the default
16948       /* Limit the unroll factor to a value adjustable by the user; the default
16949 	 value is 4.  */
16950       unsigned int factor
16951        = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
16952       unsigned int temp;
16953 
16954       /* Sanity check, this should never happen.  */
16955       if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
16956 	return 1;
16957 
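      /* Editor note, illustrative values only: with factor == 2
	 (reduction latency), stores_per_cycle == 2 and one store per
	 iteration, the store check below caps the unroll factor at
	 CEIL (2 * 2, 1) == 4.  */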
16958       /* Check stores.  */
16959       if (vec_ops.stores > 0)
16960 	{
16961 	  temp = CEIL (factor * vec_issue->stores_per_cycle,
16962 		       vec_ops.stores);
16963 	  unroll_factor = MIN (unroll_factor, temp);
16964 	}
16965 
16966       /* Check loads + stores.  */
16967       if (vec_ops.loads > 0)
16968 	{
16969 	  temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
16970 		       vec_ops.loads + vec_ops.stores);
16971 	  unroll_factor = MIN (unroll_factor, temp);
16972 	}
16973 
16974       /* Check general ops.  */
16975       if (vec_ops.general_ops > 0)
16976 	{
16977 	  temp = CEIL (factor * vec_issue->general_ops_per_cycle,
16978 		       vec_ops.general_ops);
16979 	  unroll_factor = MIN (unroll_factor, temp);
16980 	 }
16981       max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
16982     }
16983 
16984   /* Make sure unroll factor is power of 2.  */
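  /* Editor note: ceil_log2 rounds up, so e.g. a computed factor of 3
     is returned as 1 << 2 == 4.  */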
16985   return 1 << ceil_log2 (max_unroll_factor);
16986 }
16987 
16988 /* BODY_COST is the cost of a vector loop body.  Adjust the cost as necessary
16989    and return the new cost.  */
16990 unsigned int
16991 aarch64_vector_costs::
16992 adjust_body_cost (loop_vec_info loop_vinfo,
16993 		  const aarch64_vector_costs *scalar_costs,
16994 		  unsigned int body_cost)
16995 {
16996   if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
16997     return body_cost;
16998 
16999   const auto &scalar_ops = scalar_costs->m_ops[0];
17000   const auto &vector_ops = m_ops[0];
17001   unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17002   unsigned int orig_body_cost = body_cost;
17003   bool should_disparage = false;
17004 
17005   if (dump_enabled_p ())
17006     dump_printf_loc (MSG_NOTE, vect_location,
17007 		     "Original vector body cost = %d\n", body_cost);
17008 
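  /* Editor note: the scalar estimate below is scaled by the estimated VF
     so that it is expressed per vector iteration and can be compared
     directly with vector_cycles_per_iter.  */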
17009   fractional_cost scalar_cycles_per_iter
17010     = scalar_ops.min_cycles_per_iter () * estimated_vf;
17011 
17012   fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17013 
17014   if (dump_enabled_p ())
17015     {
17016       if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17017 	dump_printf_loc (MSG_NOTE, vect_location,
17018 			 "Vector loop iterates at most %wd times\n",
17019 			 m_num_vector_iterations);
17020       dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17021       scalar_ops.dump ();
17022       dump_printf_loc (MSG_NOTE, vect_location,
17023 		       "  estimated cycles per vector iteration"
17024 		       " (for VF %d) = %f\n",
17025 		       estimated_vf, scalar_cycles_per_iter.as_double ());
17026     }
17027 
17028   if (vector_ops.sve_issue_info ())
17029     {
17030       if (dump_enabled_p ())
17031 	dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17032       vector_cycles_per_iter
17033 	= adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17034 				orig_body_cost, &body_cost, &should_disparage);
17035 
17036       if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17037 	{
17038 	  /* Also take Neoverse V1 tuning into account, doubling the
17039 	     scalar and Advanced SIMD estimates to account for the
17040 	     doubling in SVE vector length.  */
17041 	  if (dump_enabled_p ())
17042 	    dump_printf_loc (MSG_NOTE, vect_location,
17043 			     "Neoverse V1 estimate:\n");
17044 	  auto vf_factor = m_ops[1].vf_factor ();
17045 	  adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17046 				orig_body_cost, &body_cost, &should_disparage);
17047 	}
17048     }
17049   else
17050     {
17051       if (dump_enabled_p ())
17052 	{
17053 	  dump_printf_loc (MSG_NOTE, vect_location,
17054 			   "Vector issue estimate:\n");
17055 	  vector_ops.dump ();
17056 	}
17057     }
17058 
17059   /* Decide whether to stick to latency-based costs or whether to try to
17060      take issue rates into account.  */
17061   unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17062   if (m_vec_flags & VEC_ANY_SVE)
17063     threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17064 
17065   if (m_num_vector_iterations >= 1
17066       && m_num_vector_iterations < threshold)
17067     {
17068       if (dump_enabled_p ())
17069 	dump_printf_loc (MSG_NOTE, vect_location,
17070 			 "Low iteration count, so using pure latency"
17071 			 " costs\n");
17072     }
17073   /* Increase the cost of the vector code if it looks like the scalar code
17074      could issue more quickly.  These values are only rough estimates,
17075      so minor differences should only result in minor changes.  */
17076   else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17077     {
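      /* Editor note, illustrative values only: with body_cost == 20,
	 vector_cycles_per_iter == 6 and scalar_cycles_per_iter == 4,
	 the adjusted cost is 20 * 6 / 4 == 30, assuming
	 fractional_cost::scale computes COST * A / B.  */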
17078       body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17079 					  scalar_cycles_per_iter);
17080       if (dump_enabled_p ())
17081 	dump_printf_loc (MSG_NOTE, vect_location,
17082 			 "Increasing body cost to %d because scalar code"
17083 			 " would issue more quickly\n", body_cost);
17084     }
17085   /* In general, it's expected that the proposed vector code would be able
17086      to issue more quickly than the original scalar code.  This should
17087      already be reflected to some extent in the latency-based costs.
17088 
17089      However, the latency-based costs effectively assume that the scalar
17090      code and the vector code execute serially, which tends to underplay
17091      one important case: if the real (non-serialized) execution time of
17092      a scalar iteration is dominated by loop-carried dependencies,
17093      and if the vector code is able to reduce both the length of
17094      the loop-carried dependencies *and* the number of cycles needed
17095      to issue the code in general, we can be more confident that the
17096      vector code is an improvement, even if adding the other (non-loop-carried)
17097      latencies tends to hide this saving.  We therefore reduce the cost of the
17098      vector loop body in proportion to the saving.  */
17099   else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17100 	   && scalar_ops.reduction_latency == scalar_cycles_per_iter
17101 	   && scalar_cycles_per_iter > vector_cycles_per_iter
17102 	   && !should_disparage)
17103     {
17104       body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17105 					  scalar_cycles_per_iter);
17106       if (dump_enabled_p ())
17107 	dump_printf_loc (MSG_NOTE, vect_location,
17108 			 "Decreasing body cost to %d to account for smaller"
17109 			 " reduction latency\n", body_cost);
17110     }
17111 
17112   return body_cost;
17113 }
17114 
17115 void
17116 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17117 {
17118   auto *scalar_costs
17119     = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17120   loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17121   if (loop_vinfo
17122       && m_vec_flags
17123       && aarch64_use_new_vector_costs_p ())
17124     {
17125       m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17126 					     m_costs[vect_body]);
17127       m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17128     }
17129 
17130   /* Apply the heuristic described above m_stp_sequence_cost.  Prefer
17131      the scalar code in the event of a tie, since there is more chance
17132      of scalar code being optimized with surrounding operations.
17133 
17134      In addition, if the vector body is a simple store to a decl that
17135      is elsewhere loaded using vld1, strongly prefer the vector form,
17136      to the extent of giving the prologue a zero cost.  See the comment
17137      above m_stores_to_vector_load_decl for details.  */
17138   if (!loop_vinfo
17139       && scalar_costs
17140       && m_stp_sequence_cost != ~0U)
17141     {
17142       if (m_stores_to_vector_load_decl)
17143 	m_costs[vect_prologue] = 0;
17144       else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17145 	m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17146     }
17147 
17148   vector_costs::finish_cost (scalar_costs);
17149 }
17150 
17151 bool
17152 aarch64_vector_costs::
17153 better_main_loop_than_p (const vector_costs *uncast_other) const
17154 {
17155   auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17156 
17157   auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17158   auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17159 
17160   if (dump_enabled_p ())
17161     dump_printf_loc (MSG_NOTE, vect_location,
17162 		     "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17163 		     GET_MODE_NAME (this_loop_vinfo->vector_mode),
17164 		     vect_vf_for_cost (this_loop_vinfo),
17165 		     GET_MODE_NAME (other_loop_vinfo->vector_mode),
17166 		     vect_vf_for_cost (other_loop_vinfo));
17167 
17168   /* Apply the unrolling heuristic described above
17169      m_unrolled_advsimd_niters.  */
17170   if (bool (m_unrolled_advsimd_stmts)
17171       != bool (other->m_unrolled_advsimd_stmts))
17172     {
17173       bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17174       bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17175       if (this_prefer_unrolled != other_prefer_unrolled)
17176 	{
17177 	  if (dump_enabled_p ())
17178 	    dump_printf_loc (MSG_NOTE, vect_location,
17179 			     "Preferring Advanced SIMD loop because"
17180 			     " it can be unrolled\n");
17181 	  return other_prefer_unrolled;
17182 	}
17183     }
17184 
17185   for (unsigned int i = 0; i < m_ops.length (); ++i)
17186     {
17187       if (dump_enabled_p ())
17188 	{
17189 	  if (i)
17190 	    dump_printf_loc (MSG_NOTE, vect_location,
17191 			     "Reconsidering with subtuning %d\n", i);
17192 	  dump_printf_loc (MSG_NOTE, vect_location,
17193 			   "Issue info for %s loop:\n",
17194 			   GET_MODE_NAME (this_loop_vinfo->vector_mode));
17195 	  this->m_ops[i].dump ();
17196 	  dump_printf_loc (MSG_NOTE, vect_location,
17197 			   "Issue info for %s loop:\n",
17198 			   GET_MODE_NAME (other_loop_vinfo->vector_mode));
17199 	  other->m_ops[i].dump ();
17200 	}
17201 
17202       auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17203 				* this->m_ops[i].vf_factor ());
17204       auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17205 				 * other->m_ops[i].vf_factor ());
17206 
17207       /* If it appears that one loop could process the same amount of data
17208 	 in fewer cycles, prefer that loop over the other one.  */
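      /* Editor note: cross-multiplying by the other loop's VF avoids a
	 division; this_cpi * other_vf < other_cpi * this_vf is equivalent
	 to comparing cycles per element of the two loops.  */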
17209       fractional_cost this_cost
17210 	= this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17211       fractional_cost other_cost
17212 	= other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17213       if (dump_enabled_p ())
17214 	{
17215 	  dump_printf_loc (MSG_NOTE, vect_location,
17216 			   "Weighted cycles per iteration of %s loop ~= %f\n",
17217 			   GET_MODE_NAME (this_loop_vinfo->vector_mode),
17218 			   this_cost.as_double ());
17219 	  dump_printf_loc (MSG_NOTE, vect_location,
17220 			   "Weighted cycles per iteration of %s loop ~= %f\n",
17221 			   GET_MODE_NAME (other_loop_vinfo->vector_mode),
17222 			   other_cost.as_double ());
17223 	}
17224       if (this_cost != other_cost)
17225 	{
17226 	  if (dump_enabled_p ())
17227 	    dump_printf_loc (MSG_NOTE, vect_location,
17228 			     "Preferring loop with lower cycles"
17229 			     " per iteration\n");
17230 	  return this_cost < other_cost;
17231 	}
17232 
17233       /* If the issue rate of SVE code is limited by predicate operations
17234 	 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17235 	 and if Advanced SIMD code could issue within the limit imposed
17236 	 by the predicate operations, the predicate operations are adding an
17237 	 overhead that the original code didn't have and so we should prefer
17238 	 the Advanced SIMD version.  */
17239       auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17240 				    const aarch64_vec_op_count &b) -> bool
17241 	{
17242 	  if (a.pred_ops == 0
17243 	      && (b.min_pred_cycles_per_iter ()
17244 		  > b.min_nonpred_cycles_per_iter ()))
17245 	    {
17246 	      if (dump_enabled_p ())
17247 		dump_printf_loc (MSG_NOTE, vect_location,
17248 				 "Preferring Advanced SIMD loop since"
17249 				 " SVE loop is predicate-limited\n");
17250 	      return true;
17251 	    }
17252 	  return false;
17253 	};
17254       if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17255 	return true;
17256       if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17257 	return false;
17258     }
17259 
17260   return vector_costs::better_main_loop_than_p (other);
17261 }
17262 
17263 static void initialize_aarch64_code_model (struct gcc_options *);
17264 
17265 /* Parse the TO_PARSE string and put the architecture struct that it
17266    selects into RES and the architectural features into ISA_FLAGS.
17267    Return an aarch64_parse_opt_result describing the parse result.
17268    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17269    When the TO_PARSE string contains an invalid extension,
17270    a copy of the string is created and stored to INVALID_EXTENSION.  */
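/* Editor note: for example, "armv8.2-a+sve" would select the "armv8.2-a"
   entry in all_architectures and then enable the SVE feature bits via
   aarch64_parse_extension.  */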
17271 
17272 static enum aarch64_parse_opt_result
17273 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17274 		    uint64_t *isa_flags, std::string *invalid_extension)
17275 {
17276   const char *ext;
17277   const struct processor *arch;
17278   size_t len;
17279 
17280   ext = strchr (to_parse, '+');
17281 
17282   if (ext != NULL)
17283     len = ext - to_parse;
17284   else
17285     len = strlen (to_parse);
17286 
17287   if (len == 0)
17288     return AARCH64_PARSE_MISSING_ARG;
17289 
17290 
17291   /* Loop through the list of supported ARCHes to find a match.  */
17292   for (arch = all_architectures; arch->name != NULL; arch++)
17293     {
17294       if (strlen (arch->name) == len
17295 	  && strncmp (arch->name, to_parse, len) == 0)
17296 	{
17297 	  uint64_t isa_temp = arch->flags;
17298 
17299 	  if (ext != NULL)
17300 	    {
17301 	      /* TO_PARSE string contains at least one extension.  */
17302 	      enum aarch64_parse_opt_result ext_res
17303 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17304 
17305 	      if (ext_res != AARCH64_PARSE_OK)
17306 		return ext_res;
17307 	    }
17308 	  /* Extension parsing was successful.  Confirm the result
17309 	     arch and ISA flags.  */
17310 	  *res = arch;
17311 	  *isa_flags = isa_temp;
17312 	  return AARCH64_PARSE_OK;
17313 	}
17314     }
17315 
17316   /* ARCH name not found in list.  */
17317   return AARCH64_PARSE_INVALID_ARG;
17318 }
17319 
17320 /* Parse the TO_PARSE string and put the result tuning in RES and the
17321    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
17322    describing the parse result.  If there is an error parsing, RES and
17323    ISA_FLAGS are left unchanged.
17324    When the TO_PARSE string contains an invalid extension,
17325    a copy of the string is created and stored to INVALID_EXTENSION.  */
17326 
17327 static enum aarch64_parse_opt_result
17328 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17329 		   uint64_t *isa_flags, std::string *invalid_extension)
17330 {
17331   const char *ext;
17332   const struct processor *cpu;
17333   size_t len;
17334 
17335   ext = strchr (to_parse, '+');
17336 
17337   if (ext != NULL)
17338     len = ext - to_parse;
17339   else
17340     len = strlen (to_parse);
17341 
17342   if (len == 0)
17343     return AARCH64_PARSE_MISSING_ARG;
17344 
17345 
17346   /* Loop through the list of supported CPUs to find a match.  */
17347   for (cpu = all_cores; cpu->name != NULL; cpu++)
17348     {
17349       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17350 	{
17351 	  uint64_t isa_temp = cpu->flags;
17352 
17353 
17354 	  if (ext != NULL)
17355 	    {
17356 	      /* TO_PARSE string contains at least one extension.  */
17357 	      enum aarch64_parse_opt_result ext_res
17358 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17359 
17360 	      if (ext_res != AARCH64_PARSE_OK)
17361 		return ext_res;
17362 	    }
17363 	  /* Extension parsing was successful.  Confirm the result
17364 	     cpu and ISA flags.  */
17365 	  *res = cpu;
17366 	  *isa_flags = isa_temp;
17367 	  return AARCH64_PARSE_OK;
17368 	}
17369     }
17370 
17371   /* CPU name not found in list.  */
17372   return AARCH64_PARSE_INVALID_ARG;
17373 }
17374 
17375 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17376    Return an aarch64_parse_opt_result describing the parse result.
17377    If the parsing fails the RES does not change.  */
17378    If the parsing fails, RES does not change.  */
17379 static enum aarch64_parse_opt_result
17380 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17381 {
17382   const struct processor *cpu;
17383 
17384   /* Loop through the list of supported CPUs to find a match.  */
17385   for (cpu = all_cores; cpu->name != NULL; cpu++)
17386     {
17387       if (strcmp (cpu->name, to_parse) == 0)
17388 	{
17389 	  *res = cpu;
17390 	  return AARCH64_PARSE_OK;
17391 	}
17392     }
17393 
17394   /* CPU name not found in list.  */
17395   return AARCH64_PARSE_INVALID_ARG;
17396 }
17397 
17398 /* Parse TOKEN, which has length LENGTH to see if it is an option
17399    described in FLAG.  If it is, return the index bit for that fusion type.
17400    If not, error (printing OPTION_NAME) and return zero.  */
17401 
17402 static unsigned int
17403 aarch64_parse_one_option_token (const char *token,
17404 				size_t length,
17405 				const struct aarch64_flag_desc *flag,
17406 				const char *option_name)
17407 {
17408   for (; flag->name != NULL; flag++)
17409     {
17410       if (length == strlen (flag->name)
17411 	  && !strncmp (flag->name, token, length))
17412 	return flag->flag;
17413     }
17414 
17415   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17416   return 0;
17417 }
17418 
17419 /* Parse OPTION which is a comma-separated list of flags to enable.
17420    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17421    default state we inherit from the CPU tuning structures.  OPTION_NAME
17422    gives the top-level option we are parsing in the -moverride string,
17423    for use in error messages.  */
17424 
17425 static unsigned int
17426 aarch64_parse_boolean_options (const char *option,
17427 			       const struct aarch64_flag_desc *flags,
17428 			       unsigned int initial_state,
17429 			       const char *option_name)
17430 {
17431   const char separator = '.';
17432   const char* specs = option;
17433   const char* ntoken = option;
17434   unsigned int found_flags = initial_state;
17435 
17436   while ((ntoken = strchr (specs, separator)))
17437     {
17438       size_t token_length = ntoken - specs;
17439       unsigned token_ops = aarch64_parse_one_option_token (specs,
17440 							   token_length,
17441 							   flags,
17442 							   option_name);
17443       /* If we find "none" (or, for simplicity's sake, an error) anywhere
17444 	 in the token stream, reset the supported operations.  So:
17445 
17446 	   adrp+add.cmp+branch.none.adrp+add
17447 
17448 	   would have the result of turning on only adrp+add fusion.  */
17449       if (!token_ops)
17450 	found_flags = 0;
17451 
17452       found_flags |= token_ops;
17453       specs = ++ntoken;
17454     }
17455 
17456   /* The string ended with a separator; report an error.  */
17457   if (!(*specs))
17458     {
17459       error ("%qs string ill-formed", option_name);
17460       return 0;
17461     }
17462 
17463   /* We still have one more token to parse.  */
17464   size_t token_length = strlen (specs);
17465   unsigned token_ops = aarch64_parse_one_option_token (specs,
17466 						       token_length,
17467 						       flags,
17468 						       option_name);
17469    if (!token_ops)
17470      found_flags = 0;
17471 
17472   found_flags |= token_ops;
17473   return found_flags;
17474 }
17475 
17476 /* Support for overriding instruction fusion.  */
17477 
17478 static void
17479 aarch64_parse_fuse_string (const char *fuse_string,
17480 			    struct tune_params *tune)
17481 {
17482   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17483 						     aarch64_fusible_pairs,
17484 						     tune->fusible_ops,
17485 						     "fuse=");
17486 }
17487 
17488 /* Support for overriding other tuning flags.  */
17489 
17490 static void
17491 aarch64_parse_tune_string (const char *tune_string,
17492 			    struct tune_params *tune)
17493 {
17494   tune->extra_tuning_flags
17495     = aarch64_parse_boolean_options (tune_string,
17496 				     aarch64_tuning_flags,
17497 				     tune->extra_tuning_flags,
17498 				     "tune=");
17499 }
17500 
17501 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
17502    Accept the valid SVE vector widths allowed by
17503    aarch64_sve_vector_bits_enum and use it to override sve_width
17504    in TUNE.  */
17505 
17506 static void
17507 aarch64_parse_sve_width_string (const char *tune_string,
17508 				struct tune_params *tune)
17509 {
17510   int width = -1;
17511 
17512   int n = sscanf (tune_string, "%d", &width);
17513   if (n == EOF)
17514     {
17515       error ("invalid format for %<sve_width%>");
17516       return;
17517     }
17518   switch (width)
17519     {
17520     case SVE_128:
17521     case SVE_256:
17522     case SVE_512:
17523     case SVE_1024:
17524     case SVE_2048:
17525       break;
17526     default:
17527       error ("invalid %<sve_width%> value: %d", width);
17528     }
17529   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17530 }
17531 
17532 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17533    we understand.  If it is, extract the option string and hand it off to
17534    the appropriate function.  */
17535 
17536 void
17537 aarch64_parse_one_override_token (const char* token,
17538 				  size_t length,
17539 				  struct tune_params *tune)
17540 {
17541   const struct aarch64_tuning_override_function *fn
17542     = aarch64_tuning_override_functions;
17543 
17544   const char *option_part = strchr (token, '=');
17545   if (!option_part)
17546     {
17547       error ("tuning string missing in option (%s)", token);
17548       return;
17549     }
17550 
17551   /* Get the length of the option name.  */
17552   length = option_part - token;
17553   /* Skip the '=' to get to the option string.  */
17554   option_part++;
17555 
17556   for (; fn->name != NULL; fn++)
17557     {
17558       if (!strncmp (fn->name, token, length))
17559 	{
17560 	  fn->parse_override (option_part, tune);
17561 	  return;
17562 	}
17563     }
17564 
17565   error ("unknown tuning option (%s)", token);
17566   return;
17567 }
17568 
17569 /* Validate and clamp the TLS size for the code model selected in OPTS.  */
17570 
17571 static void
17572 initialize_aarch64_tls_size (struct gcc_options *opts)
17573 {
17574   if (aarch64_tls_size == 0)
17575     aarch64_tls_size = 24;
17576 
17577   switch (opts->x_aarch64_cmodel_var)
17578     {
17579     case AARCH64_CMODEL_TINY:
17580       /* Both the default and maximum TLS size allowed under tiny are 1M, which
17581 	 needs two instructions to address, so we clamp the size to 24.  */
17582       if (aarch64_tls_size > 24)
17583 	aarch64_tls_size = 24;
17584       break;
17585     case AARCH64_CMODEL_SMALL:
17586       /* The maximum TLS size allowed under small is 4G.  */
17587       if (aarch64_tls_size > 32)
17588 	aarch64_tls_size = 32;
17589       break;
17590     case AARCH64_CMODEL_LARGE:
17591       /* The maximum TLS size allowed under large is 16E.
17592 	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
17593       if (aarch64_tls_size > 48)
17594 	aarch64_tls_size = 48;
17595       break;
17596     default:
17597       gcc_unreachable ();
17598     }
17599 
17600   return;
17601 }
17602 
17603 /* Parse STRING looking for options in the format:
17604      string	:: option:string
17605      option	:: name=substring
17606      name	:: {a-z}
17607      substring	:: defined by option.  */
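/* Editor note: for example, an -moverride string such as
   "fuse=adrp+add.cmp+branch:sve_width=256" is split on ':' into two
   options, each of which is handled by aarch64_parse_one_override_token.  */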
17608 
17609 static void
17610 aarch64_parse_override_string (const char* input_string,
17611 			       struct tune_params* tune)
17612 {
17613   const char separator = ':';
17614   size_t string_length = strlen (input_string) + 1;
17615   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17616   char *string = string_root;
17617   strncpy (string, input_string, string_length);
17618   string[string_length - 1] = '\0';
17619 
17620   char* ntoken = string;
17621 
17622   while ((ntoken = strchr (string, separator)))
17623     {
17624       size_t token_length = ntoken - string;
17625       /* Make this substring look like a string.  */
17626       *ntoken = '\0';
17627       aarch64_parse_one_override_token (string, token_length, tune);
17628       string = ++ntoken;
17629     }
17630 
17631   /* One last option to parse.  */
17632   aarch64_parse_one_override_token (string, strlen (string), tune);
17633   free (string_root);
17634 }
17635 
17636 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17637    are best for a generic target with the currently-enabled architecture
17638    extensions.  */
17639 static void
17640 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17641 {
17642   /* Neoverse V1 is the only core that is known to benefit from
17643      AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS.  There is therefore no
17644      point enabling it for SVE2 and above.  */
17645   if (TARGET_SVE2)
17646     current_tune.extra_tuning_flags
17647       &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17648 }
17649 
17650 static void
17651 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17652 {
17653   if (accepted_branch_protection_string)
17654     {
17655       opts->x_aarch64_branch_protection_string
17656 	= xstrdup (accepted_branch_protection_string);
17657     }
17658 
17659   /* PR 70044: We have to be careful about being called multiple times for the
17660      same function.  This means all changes should be repeatable.  */
17661 
17662   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17663      Disable the frame pointer flag so the mid-end will not use a frame
17664      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17665      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17666      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
17667   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17668   if (opts->x_flag_omit_frame_pointer == 0)
17669     opts->x_flag_omit_frame_pointer = 2;
17670 
17671   /* If not optimizing for size, set the default
17672      alignment to what the target wants.  */
17673   if (!opts->x_optimize_size)
17674     {
17675       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17676 	opts->x_str_align_loops = aarch64_tune_params.loop_align;
17677       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17678 	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17679       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17680 	opts->x_str_align_functions = aarch64_tune_params.function_align;
17681     }
17682 
17683   /* We default to no pc-relative literal loads.  */
17684 
17685   aarch64_pcrelative_literal_loads = false;
17686 
17687   /* If -mpc-relative-literal-loads is set on the command line, this
17688      implies that the user asked for PC relative literal loads.  */
17689   if (opts->x_pcrelative_literal_loads == 1)
17690     aarch64_pcrelative_literal_loads = true;
17691 
17692   /* In the tiny memory model it makes no sense to disallow PC relative
17693      literal pool loads.  */
17694   if (aarch64_cmodel == AARCH64_CMODEL_TINY
17695       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17696     aarch64_pcrelative_literal_loads = true;
17697 
17698   /* When enabling the lower precision Newton series for the square root, also
17699      enable it for the reciprocal square root, since the latter is an
17700      intermediary step for the former.  */
17701   if (flag_mlow_precision_sqrt)
17702     flag_mrecip_low_precision_sqrt = true;
17703 }
17704 
17705 /* 'Unpack' the internal tuning structs and update the options
17706     in OPTS.  The caller must have set up selected_tune and selected_arch
17707     as all the other target-specific codegen decisions are
17708     derived from them.  */
17709 
17710 void
17711 aarch64_override_options_internal (struct gcc_options *opts)
17712 {
17713   aarch64_tune_flags = selected_tune->flags;
17714   aarch64_tune = selected_tune->sched_core;
17715   /* Make a copy of the tuning parameters attached to the core, which
17716      we may later overwrite.  */
17717   aarch64_tune_params = *(selected_tune->tune);
17718   aarch64_architecture_version = selected_arch->architecture_version;
17719   if (selected_tune->tune == &generic_tunings)
17720     aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17721 
17722   if (opts->x_aarch64_override_tune_string)
17723     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17724 				  &aarch64_tune_params);
17725 
17726   /* This target defaults to strict volatile bitfields.  */
17727   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17728     opts->x_flag_strict_volatile_bitfields = 1;
17729 
17730   if (aarch64_stack_protector_guard == SSP_GLOBAL
17731       && opts->x_aarch64_stack_protector_guard_offset_str)
17732     {
17733       error ("incompatible options %<-mstack-protector-guard=global%> and "
17734 	     "%<-mstack-protector-guard-offset=%s%>",
17735 	     aarch64_stack_protector_guard_offset_str);
17736     }
17737 
17738   if (aarch64_stack_protector_guard == SSP_SYSREG
17739       && !(opts->x_aarch64_stack_protector_guard_offset_str
17740 	   && opts->x_aarch64_stack_protector_guard_reg_str))
17741     {
17742       error ("both %<-mstack-protector-guard-offset%> and "
17743 	     "%<-mstack-protector-guard-reg%> must be used "
17744 	     "with %<-mstack-protector-guard=sysreg%>");
17745     }
17746 
17747   if (opts->x_aarch64_stack_protector_guard_reg_str)
17748     {
17749       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17750 	  error ("specify a system register with a small string length");
17751     }
17752 
17753   if (opts->x_aarch64_stack_protector_guard_offset_str)
17754     {
17755       char *end;
17756       const char *str = aarch64_stack_protector_guard_offset_str;
17757       errno = 0;
17758       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17759       if (!*str || *end || errno)
17760 	error ("%qs is not a valid offset in %qs", str,
17761 	       "-mstack-protector-guard-offset=");
17762       aarch64_stack_protector_guard_offset = offs;
17763     }
17764 
17765   if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17766       && !fixed_regs[R18_REGNUM])
17767     error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17768 
17769   initialize_aarch64_code_model (opts);
17770   initialize_aarch64_tls_size (opts);
17771 
17772   int queue_depth = 0;
17773   switch (aarch64_tune_params.autoprefetcher_model)
17774     {
17775       case tune_params::AUTOPREFETCHER_OFF:
17776 	queue_depth = -1;
17777 	break;
17778       case tune_params::AUTOPREFETCHER_WEAK:
17779 	queue_depth = 0;
17780 	break;
17781       case tune_params::AUTOPREFETCHER_STRONG:
17782 	queue_depth = max_insn_queue_index + 1;
17783 	break;
17784       default:
17785 	gcc_unreachable ();
17786     }
17787 
17788   /* We don't mind passing in global_options_set here as we don't use
17789      the *options_set structs anyway.  */
17790   SET_OPTION_IF_UNSET (opts, &global_options_set,
17791 		       param_sched_autopref_queue_depth, queue_depth);
17792 
17793   /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17794      comparison.  */
17795   if (aarch64_autovec_preference == 1)
17796     SET_OPTION_IF_UNSET (opts, &global_options_set,
17797 			 aarch64_sve_compare_costs, 0);
17798 
17799   /* Set up parameters to be used in prefetching algorithm.  Do not
17800      override the defaults unless we are tuning for a core we have
17801      researched values for.  */
17802   if (aarch64_tune_params.prefetch->num_slots > 0)
17803     SET_OPTION_IF_UNSET (opts, &global_options_set,
17804 			 param_simultaneous_prefetches,
17805 			 aarch64_tune_params.prefetch->num_slots);
17806   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17807     SET_OPTION_IF_UNSET (opts, &global_options_set,
17808 			 param_l1_cache_size,
17809 			 aarch64_tune_params.prefetch->l1_cache_size);
17810   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17811     SET_OPTION_IF_UNSET (opts, &global_options_set,
17812 			 param_l1_cache_line_size,
17813 			 aarch64_tune_params.prefetch->l1_cache_line_size);
17814 
17815   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17816     {
17817       SET_OPTION_IF_UNSET (opts, &global_options_set,
17818 			   param_destruct_interfere_size,
17819 			   aarch64_tune_params.prefetch->l1_cache_line_size);
17820       SET_OPTION_IF_UNSET (opts, &global_options_set,
17821 			   param_construct_interfere_size,
17822 			   aarch64_tune_params.prefetch->l1_cache_line_size);
17823     }
17824   else
17825     {
17826       /* For a generic AArch64 target, cover the current range of cache line
17827 	 sizes.  */
17828       SET_OPTION_IF_UNSET (opts, &global_options_set,
17829 			   param_destruct_interfere_size,
17830 			   256);
17831       SET_OPTION_IF_UNSET (opts, &global_options_set,
17832 			   param_construct_interfere_size,
17833 			   64);
17834     }
17835 
17836   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17837     SET_OPTION_IF_UNSET (opts, &global_options_set,
17838 			 param_l2_cache_size,
17839 			 aarch64_tune_params.prefetch->l2_cache_size);
17840   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17841     SET_OPTION_IF_UNSET (opts, &global_options_set,
17842 			 param_prefetch_dynamic_strides, 0);
17843   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17844     SET_OPTION_IF_UNSET (opts, &global_options_set,
17845 			 param_prefetch_minimum_stride,
17846 			 aarch64_tune_params.prefetch->minimum_stride);
17847 
17848   /* Use the alternative scheduling-pressure algorithm by default.  */
17849   SET_OPTION_IF_UNSET (opts, &global_options_set,
17850 		       param_sched_pressure_algorithm,
17851 		       SCHED_PRESSURE_MODEL);
17852 
17853   /* Validate the guard size.  */
17854   int guard_size = param_stack_clash_protection_guard_size;
17855 
17856   if (guard_size != 12 && guard_size != 16)
17857     error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17858 	   "size.  Given value %d (%llu KB) is out of range",
17859 	   guard_size, (1ULL << guard_size) / 1024ULL);
17860 
17861   /* Enforce that interval is the same size as size so the mid-end does the
17862      right thing.  */
17863   SET_OPTION_IF_UNSET (opts, &global_options_set,
17864 		       param_stack_clash_protection_probe_interval,
17865 		       guard_size);
17866 
17867   /* The maybe_set calls won't update the value if the user has explicitly set
17868      one.  Which means we need to validate that probing interval and guard size
17869      are equal.  */
17870   int probe_interval
17871     = param_stack_clash_protection_probe_interval;
17872   if (guard_size != probe_interval)
17873     error ("stack clash guard size %<%d%> must be equal to probing interval "
17874 	   "%<%d%>", guard_size, probe_interval);
17875 
17876   /* Enable sw prefetching at specified optimization level for
17877      CPUS that have prefetch.  Lower optimization level threshold by 1
17878      when profiling is enabled.  */
17879   if (opts->x_flag_prefetch_loop_arrays < 0
17880       && !opts->x_optimize_size
17881       && aarch64_tune_params.prefetch->default_opt_level >= 0
17882       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
17883     opts->x_flag_prefetch_loop_arrays = 1;
17884 
17885   if (opts->x_aarch64_arch_string == NULL)
17886     opts->x_aarch64_arch_string = selected_arch->name;
17887   if (opts->x_aarch64_cpu_string == NULL)
17888     opts->x_aarch64_cpu_string = selected_cpu->name;
17889   if (opts->x_aarch64_tune_string == NULL)
17890     opts->x_aarch64_tune_string = selected_tune->name;
17891 
17892   aarch64_override_options_after_change_1 (opts);
17893 }
17894 
17895 /* Print a hint with a suggestion for a core or architecture name that
17896    most closely resembles what the user passed in STR.  ARCH is true if
17897    the user is asking for an architecture name.  ARCH is false if the user
17898    is asking for a core name.  */
17899 
17900 static void
17901 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
17902 {
17903   auto_vec<const char *> candidates;
17904   const struct processor *entry = arch ? all_architectures : all_cores;
17905   for (; entry->name != NULL; entry++)
17906     candidates.safe_push (entry->name);
17907 
17908 #ifdef HAVE_LOCAL_CPU_DETECT
17909   /* Add also "native" as possible value.  */
17910   if (arch)
17911     candidates.safe_push ("native");
17912 #endif
17913 
17914   char *s;
17915   const char *hint = candidates_list_and_hint (str, s, candidates);
17916   if (hint)
17917     inform (input_location, "valid arguments are: %s;"
17918 			     " did you mean %qs?", s, hint);
17919   else
17920     inform (input_location, "valid arguments are: %s", s);
17921 
17922   XDELETEVEC (s);
17923 }
17924 
17925 /* Print a hint with a suggestion for a core name that most closely resembles
17926    what the user passed in STR.  */
17927 
17928 inline static void
17929 aarch64_print_hint_for_core (const char *str)
17930 {
17931   aarch64_print_hint_for_core_or_arch (str, false);
17932 }
17933 
17934 /* Print a hint with a suggestion for an architecture name that most closely
17935    resembles what the user passed in STR.  */
17936 
17937 inline static void
17938 aarch64_print_hint_for_arch (const char *str)
17939 {
17940   aarch64_print_hint_for_core_or_arch (str, true);
17941 }
17942 
17943 
17944 /* Print a hint with a suggestion for an extension name
17945    that most closely resembles what the user passed in STR.  */
17946 
17947 void
17948 aarch64_print_hint_for_extensions (const std::string &str)
17949 {
17950   auto_vec<const char *> candidates;
17951   aarch64_get_all_extension_candidates (&candidates);
17952   char *s;
17953   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
17954   if (hint)
17955     inform (input_location, "valid arguments are: %s;"
17956 			     " did you mean %qs?", s, hint);
17957   else
17958     inform (input_location, "valid arguments are: %s", s);
17959 
17960   XDELETEVEC (s);
17961 }
17962 
17963 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
17964    specified in STR and throw errors if appropriate.  Put the results if
17965    they are valid in RES and ISA_FLAGS.  Return whether the option is
17966    valid.  */
17967 
17968 static bool
17969 aarch64_validate_mcpu (const char *str, const struct processor **res,
17970 		       uint64_t *isa_flags)
17971 {
17972   std::string invalid_extension;
17973   enum aarch64_parse_opt_result parse_res
17974     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
17975 
17976   if (parse_res == AARCH64_PARSE_OK)
17977     return true;
17978 
17979   switch (parse_res)
17980     {
17981       case AARCH64_PARSE_MISSING_ARG:
17982 	error ("missing cpu name in %<-mcpu=%s%>", str);
17983 	break;
17984       case AARCH64_PARSE_INVALID_ARG:
17985 	error ("unknown value %qs for %<-mcpu%>", str);
17986 	aarch64_print_hint_for_core (str);
17987 	break;
17988       case AARCH64_PARSE_INVALID_FEATURE:
17989 	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
17990 	       invalid_extension.c_str (), str);
17991 	aarch64_print_hint_for_extensions (invalid_extension);
17992 	break;
17993       default:
17994 	gcc_unreachable ();
17995     }
17996 
17997   return false;
17998 }
17999 
18000 /* Straight line speculation indicators.  */
18001 enum aarch64_sls_hardening_type
18002 {
18003   SLS_NONE = 0,
18004   SLS_RETBR = 1,
18005   SLS_BLR = 2,
18006   SLS_ALL = 3,
18007 };
18008 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18009 
18010 /* Return whether we should mitigate Straight Line Speculation for the RET
18011    and BR instructions.  */
18012 bool
18013 aarch64_harden_sls_retbr_p (void)
18014 {
18015   return aarch64_sls_hardening & SLS_RETBR;
18016 }
18017 
18018 /* Return whether we should mitigate Straight Line Speculation for the BLR
18019    instruction.  */
18020 bool
18021 aarch64_harden_sls_blr_p (void)
18022 {
18023   return aarch64_sls_hardening & SLS_BLR;
18024 }
18025 
18026 /* For now we only allow setting these options globally; in the future we may
18027    allow setting them per function.  */
18028 static void
18029 aarch64_validate_sls_mitigation (const char *const_str)
18030 {
18031   char *token_save = NULL;
18032   char *str = NULL;
18033 
18034   if (strcmp (const_str, "none") == 0)
18035     {
18036       aarch64_sls_hardening = SLS_NONE;
18037       return;
18038     }
18039   if (strcmp (const_str, "all") == 0)
18040     {
18041       aarch64_sls_hardening = SLS_ALL;
18042       return;
18043     }
18044 
18045   char *str_root = xstrdup (const_str);
18046   str = strtok_r (str_root, ",", &token_save);
18047   if (!str)
18048     error ("invalid argument given to %<-mharden-sls=%>");
18049 
18050   int temp = SLS_NONE;
18051   while (str)
18052     {
18053       if (strcmp (str, "blr") == 0)
18054 	temp |= SLS_BLR;
18055       else if (strcmp (str, "retbr") == 0)
18056 	temp |= SLS_RETBR;
18057       else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18058 	{
18059 	  error ("%qs must be by itself for %<-mharden-sls=%>", str);
18060 	  break;
18061 	}
18062       else
18063 	{
18064 	  error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18065 	  break;
18066 	}
18067       str = strtok_r (NULL, ",", &token_save);
18068     }
18069   aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18070   free (str_root);
18071 }
18072 
18073 /* Parses CONST_STR for branch protection features specified in
18074    aarch64_branch_protect_types, and set any global variables required.  Returns
18075    the parsing result and assigns LAST_STR to the last processed token from
18076    CONST_STR so that it can be used for error reporting.  */
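/* Editor note: for example, "pac-ret+leaf+bti" first matches the "pac-ret"
   type, then consumes its "leaf" subtype, and finally matches the separate
   "bti" type (names assumed to follow the documented -mbranch-protection=
   values).  */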
18077 
18078 static enum
18079 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
18080 							  char** last_str)
18081 {
18082   char *str_root = xstrdup (const_str);
18083   char* token_save = NULL;
18084   char *str = strtok_r (str_root, "+", &token_save);
18085   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
18086   if (!str)
18087     res = AARCH64_PARSE_MISSING_ARG;
18088   else
18089     {
18090       char *next_str = strtok_r (NULL, "+", &token_save);
18091       /* Reset the branch protection features to their defaults.  */
18092       aarch64_handle_no_branch_protection (NULL, NULL);
18093 
18094       while (str && res == AARCH64_PARSE_OK)
18095 	{
18096 	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
18097 	  bool found = false;
18098 	  /* Search for this type.  */
18099 	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
18100 	    {
18101 	      if (strcmp (str, type->name) == 0)
18102 		{
18103 		  found = true;
18104 		  res = type->handler (str, next_str);
18105 		  str = next_str;
18106 		  next_str = strtok_r (NULL, "+", &token_save);
18107 		}
18108 	      else
18109 		type++;
18110 	    }
18111 	  if (found && res == AARCH64_PARSE_OK)
18112 	    {
18113 	      bool found_subtype = true;
18114 	      /* Loop through each token until we find one that isn't a
18115 		 subtype.  */
18116 	      while (found_subtype)
18117 		{
18118 		  found_subtype = false;
18119 		  const aarch64_branch_protect_type *subtype = type->subtypes;
18120 		  /* Search for the subtype.  */
18121 		  while (str && subtype && subtype->name && !found_subtype
18122 			  && res == AARCH64_PARSE_OK)
18123 		    {
18124 		      if (strcmp (str, subtype->name) == 0)
18125 			{
18126 			  found_subtype = true;
18127 			  res = subtype->handler (str, next_str);
18128 			  str = next_str;
18129 			  next_str = strtok_r (NULL, "+", &token_save);
18130 			}
18131 		      else
18132 			subtype++;
18133 		    }
18134 		}
18135 	    }
18136 	  else if (!found)
18137 	    res = AARCH64_PARSE_INVALID_ARG;
18138 	}
18139     }
18140   /* Copy the last processed token into the argument to pass it back.
18141     Used by option and attribute validation to print the offending token.  */
18142   if (last_str)
18143     {
18144       if (str) strcpy (*last_str, str);
18145       else *last_str = NULL;
18146     }
18147   if (res == AARCH64_PARSE_OK)
18148     {
18149       /* If needed, alloc the accepted string then copy in const_str.
18150 	Used by override_option_after_change_1.  */
18151       if (!accepted_branch_protection_string)
18152 	accepted_branch_protection_string = (char *) xmalloc (
18153 						      BRANCH_PROTECT_STR_MAX
18154 							+ 1);
18155       strncpy (accepted_branch_protection_string, const_str,
18156 		BRANCH_PROTECT_STR_MAX + 1);
18157       /* Forcibly null-terminate.  */
18158       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
18159     }
18160   return res;
18161 }
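
/* For illustration: the input is a '+'-separated list of types and
   subtypes from aarch64_branch_protect_types (defined elsewhere), so a
   string such as "pac-ret+leaf+bti" invokes the "pac-ret" handler, treats
   "leaf" as one of its subtypes, and then matches "bti" as the next
   top-level type.  The token names here are examples only; the
   authoritative list is the table itself.  */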
18162 
18163 static bool
18164 aarch64_validate_mbranch_protection (const char *const_str)
18165 {
18166   char *str = (char *) xmalloc (strlen (const_str) + 1);
18167   enum aarch64_parse_opt_result res =
18168     aarch64_parse_branch_protection (const_str, &str);
18169   if (res == AARCH64_PARSE_INVALID_ARG)
18170     error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
18171   else if (res == AARCH64_PARSE_MISSING_ARG)
18172     error ("missing argument for %<-mbranch-protection=%>");
18173   free (str);
18174   return res == AARCH64_PARSE_OK;
18175 }
18176 
18177 /* Validate a command-line -march option.  Parse the arch and extensions
18178    (if any) specified in STR and throw errors if appropriate.  Put the
18179    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
18180    option is valid.  */
18181 
18182 static bool
18183 aarch64_validate_march (const char *str, const struct processor **res,
18184 			 uint64_t *isa_flags)
18185 {
18186   std::string invalid_extension;
18187   enum aarch64_parse_opt_result parse_res
18188     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18189 
18190   if (parse_res == AARCH64_PARSE_OK)
18191     return true;
18192 
18193   switch (parse_res)
18194     {
18195       case AARCH64_PARSE_MISSING_ARG:
18196 	error ("missing arch name in %<-march=%s%>", str);
18197 	break;
18198       case AARCH64_PARSE_INVALID_ARG:
18199 	error ("unknown value %qs for %<-march%>", str);
18200 	aarch64_print_hint_for_arch (str);
18201 	break;
18202       case AARCH64_PARSE_INVALID_FEATURE:
18203 	error ("invalid feature modifier %qs in %<-march=%s%>",
18204 	       invalid_extension.c_str (), str);
18205 	aarch64_print_hint_for_extensions (invalid_extension);
18206 	break;
18207       default:
18208 	gcc_unreachable ();
18209     }
18210 
18211   return false;
18212 }
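
/* For illustration, a value such as "armv8.2-a+sve" names a base
   architecture followed by optional '+'-prefixed feature modifiers; an
   unknown architecture or modifier is reported by one of the errors above
   together with a spelling hint.  (The names are examples only; the valid
   set comes from aarch64_parse_arch and the extension tables.)  */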
18213 
18214 /* Validate a command-line -mtune option.  Parse the cpu
18215    specified in STR and throw errors if appropriate.  Put the
18216    result, if it is valid, in RES.  Return whether the option is
18217    valid.  */
18218 
18219 static bool
18220 aarch64_validate_mtune (const char *str, const struct processor **res)
18221 {
18222   enum aarch64_parse_opt_result parse_res
18223     = aarch64_parse_tune (str, res);
18224 
18225   if (parse_res == AARCH64_PARSE_OK)
18226     return true;
18227 
18228   switch (parse_res)
18229     {
18230       case AARCH64_PARSE_MISSING_ARG:
18231 	error ("missing cpu name in %<-mtune=%s%>", str);
18232 	break;
18233       case AARCH64_PARSE_INVALID_ARG:
18234 	error ("unknown value %qs for %<-mtune%>", str);
18235 	aarch64_print_hint_for_core (str);
18236 	break;
18237       default:
18238 	gcc_unreachable ();
18239     }
18240   return false;
18241 }
18242 
18243 static_assert (TARGET_CPU_generic < TARGET_CPU_MASK,
18244 	       "TARGET_CPU_NBITS is big enough");
18245 
18246 /* Return the CPU corresponding to the enum CPU.
18247    If it doesn't specify a cpu, return the default.  */
18248 
18249 static const struct processor *
18250 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18251 {
18252   if (cpu != aarch64_none)
18253     return &all_cores[cpu];
18254 
18255   /* The & TARGET_CPU_MASK is to extract the bottom TARGET_CPU_NBITS bits that
18256      encode the default cpu as selected by the --with-cpu GCC configure option
18257      in config.gcc.
18258      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
18259      flags mechanism should be reworked to make it more sane.  */
18260   return &all_cores[TARGET_CPU_DEFAULT & TARGET_CPU_MASK];
18261 }
18262 
18263 /* Return the architecture corresponding to the enum ARCH.
18264    If it doesn't specify a valid architecture, return the default.  */
18265 
18266 static const struct processor *
18267 aarch64_get_arch (enum aarch64_arch arch)
18268 {
18269   if (arch != aarch64_no_arch)
18270     return &all_architectures[arch];
18271 
18272   const struct processor *cpu
18273     = &all_cores[TARGET_CPU_DEFAULT & TARGET_CPU_MASK];
18274 
18275   return &all_architectures[cpu->arch];
18276 }
18277 
18278 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
18279 
18280 static poly_uint16
18281 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18282 {
18283   /* 128-bit SVE and Advanced SIMD modes use different register layouts
18284      on big-endian targets, so we would need to forbid subregs that convert
18285      from one to the other.  By default a reinterpret sequence would then
18286      involve a store to memory in one mode and a load back in the other.
18287      Even if we optimize that sequence using reverse instructions,
18288      it would still be a significant potential overhead.
18289 
18290      For now, it seems better to generate length-agnostic code for that
18291      case instead.  */
18292   if (value == SVE_SCALABLE
18293       || (value == SVE_128 && BYTES_BIG_ENDIAN))
18294     return poly_uint16 (2, 2);
18295   else
18296     return (int) value / 64;
18297 }
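
/* For example, -msve-vector-bits=256 yields 256 / 64 = 4, i.e. a fixed
   vector granule (VG) of 4, whereas -msve-vector-bits=scalable (and
   128-bit SVE on big-endian targets) keeps the length-agnostic
   poly_uint16 (2, 2).  */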
18298 
18299 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
18300    and is used to parse the -m{cpu,tune,arch} strings and set up the initial
18301    tuning structs.  In particular it must set selected_tune and
18302    aarch64_isa_flags that define the available ISA features and tuning
18303    decisions.  It must also set selected_arch as this will be used to
18304    output the .arch asm tags for each function.  */
18305 
18306 static void
18307 aarch64_override_options (void)
18308 {
18309   uint64_t cpu_isa = 0;
18310   uint64_t arch_isa = 0;
18311   aarch64_isa_flags = 0;
18312 
18313   bool valid_cpu = true;
18314   bool valid_tune = true;
18315   bool valid_arch = true;
18316 
18317   selected_cpu = NULL;
18318   selected_arch = NULL;
18319   selected_tune = NULL;
18320 
18321   if (aarch64_harden_sls_string)
18322     aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18323 
18324   if (aarch64_branch_protection_string)
18325     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
18326 
18327   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18328      If either of -march or -mtune is given, they override their
18329      respective component of -mcpu.  */
18330   if (aarch64_cpu_string)
18331     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
18332 					&cpu_isa);
18333 
18334   if (aarch64_arch_string)
18335     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
18336 					  &arch_isa);
18337 
18338   if (aarch64_tune_string)
18339     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
18340 
18341 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18342   SUBTARGET_OVERRIDE_OPTIONS;
18343 #endif
18344 
18345   /* If the user did not specify a processor, choose the default
18346      one for them.  This will be the CPU set during configuration using
18347      --with-cpu, otherwise it is "generic".  */
18348   if (!selected_cpu)
18349     {
18350       if (selected_arch)
18351 	{
18352 	  selected_cpu = &all_cores[selected_arch->ident];
18353 	  aarch64_isa_flags = arch_isa;
18354 	  explicit_arch = selected_arch->arch;
18355 	}
18356       else
18357 	{
18358 	  /* Get default configure-time CPU.  */
18359 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
18360 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> TARGET_CPU_NBITS;
18361 	}
18362 
18363       if (selected_tune)
18364 	explicit_tune_core = selected_tune->ident;
18365     }
18366   /* If both -mcpu and -march are specified check that they are architecturally
18367      compatible, warn if they're not and prefer the -march ISA flags.  */
18368   else if (selected_arch)
18369     {
18370       if (selected_arch->arch != selected_cpu->arch)
18371 	{
18372 	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18373 		       aarch64_cpu_string,
18374 		       aarch64_arch_string);
18375 	}
18376       aarch64_isa_flags = arch_isa;
18377       explicit_arch = selected_arch->arch;
18378       explicit_tune_core = selected_tune ? selected_tune->ident
18379 					  : selected_cpu->ident;
18380     }
18381   else
18382     {
18383       /* -mcpu but no -march.  */
18384       aarch64_isa_flags = cpu_isa;
18385       explicit_tune_core = selected_tune ? selected_tune->ident
18386 					  : selected_cpu->ident;
18387       gcc_assert (selected_cpu);
18388       selected_arch = &all_architectures[selected_cpu->arch];
18389       explicit_arch = selected_arch->arch;
18390     }
18391 
18392   /* Set the arch as well, as we will need it when outputting
18393      the .arch directive in assembly.  */
18394   if (!selected_arch)
18395     {
18396       gcc_assert (selected_cpu);
18397       selected_arch = &all_architectures[selected_cpu->arch];
18398     }
18399 
18400   if (!selected_tune)
18401     selected_tune = selected_cpu;
18402 
18403   if (aarch64_enable_bti == 2)
18404     {
18405 #ifdef TARGET_ENABLE_BTI
18406       aarch64_enable_bti = 1;
18407 #else
18408       aarch64_enable_bti = 0;
18409 #endif
18410     }
18411 
18412   /* Return address signing is currently not supported for ILP32 targets.  For
18413      LP64 targets use the configured option in the absence of a command-line
18414      option for -mbranch-protection.  */
18415   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18416     {
18417 #ifdef TARGET_ENABLE_PAC_RET
18418       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
18419 #else
18420       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
18421 #endif
18422     }
18423 
18424 #ifndef HAVE_AS_MABI_OPTION
18425   /* The compiler may have been configured with 2.23.* binutils, which does
18426      not have support for ILP32.  */
18427   if (TARGET_ILP32)
18428     error ("assembler does not support %<-mabi=ilp32%>");
18429 #endif
18430 
18431   /* Convert -msve-vector-bits to a VG count.  */
18432   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18433 
18434   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
18435     sorry ("return address signing is only supported for %<-mabi=lp64%>");
18436 
18437   /* Make sure we properly set up the explicit options.  */
18438   if ((aarch64_cpu_string && valid_cpu)
18439        || (aarch64_tune_string && valid_tune))
18440     gcc_assert (explicit_tune_core != aarch64_none);
18441 
18442   if ((aarch64_cpu_string && valid_cpu)
18443        || (aarch64_arch_string && valid_arch))
18444     gcc_assert (explicit_arch != aarch64_no_arch);
18445 
18446   /* The pass to insert speculation tracking runs before
18447      shrink-wrapping and the latter does not know how to update the
18448      tracking status.  So disable it in this case.  */
18449   if (aarch64_track_speculation)
18450     flag_shrink_wrap = 0;
18451 
18452   aarch64_override_options_internal (&global_options);
18453 
18454   /* Save these options as the default ones in case we push and pop them later
18455      while processing functions with potential target attributes.  */
18456   target_option_default_node = target_option_current_node
18457     = build_target_option_node (&global_options, &global_options_set);
18458 }
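
/* Illustrative summary of the precedence implemented above: with only
   -mcpu=<cpu>, both the architecture and the tuning come from that CPU;
   adding -march=<arch> overrides the ISA side (warning if <arch> differs
   from the CPU's own architecture) and adding -mtune=<other-cpu>
   overrides the tuning side.  With none of the three, the configure-time
   --with-cpu default (or "generic") is used.  */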
18459 
18460 /* Implement targetm.override_options_after_change.  */
18461 
18462 static void
18463 aarch64_override_options_after_change (void)
18464 {
18465   aarch64_override_options_after_change_1 (&global_options);
18466 }
18467 
18468 /* Implement the TARGET_OFFLOAD_OPTIONS hook.  */
18469 static char *
18470 aarch64_offload_options (void)
18471 {
18472   if (TARGET_ILP32)
18473     return xstrdup ("-foffload-abi=ilp32");
18474   else
18475     return xstrdup ("-foffload-abi=lp64");
18476 }
18477 
18478 static struct machine_function *
18479 aarch64_init_machine_status (void)
18480 {
18481   struct machine_function *machine;
18482   machine = ggc_cleared_alloc<machine_function> ();
18483   return machine;
18484 }
18485 
18486 void
18487 aarch64_init_expanders (void)
18488 {
18489   init_machine_status = aarch64_init_machine_status;
18490 }
18491 
18492 /* Initialize aarch64_cmodel from the command-line options and check that the selected code model is supported.  */
18493 static void
18494 initialize_aarch64_code_model (struct gcc_options *opts)
18495 {
18496   aarch64_cmodel = opts->x_aarch64_cmodel_var;
18497   switch (opts->x_aarch64_cmodel_var)
18498     {
18499     case AARCH64_CMODEL_TINY:
18500       if (opts->x_flag_pic)
18501 	aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18502       break;
18503     case AARCH64_CMODEL_SMALL:
18504       if (opts->x_flag_pic)
18505 	{
18506 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18507 	  aarch64_cmodel = (flag_pic == 2
18508 			    ? AARCH64_CMODEL_SMALL_PIC
18509 			    : AARCH64_CMODEL_SMALL_SPIC);
18510 #else
18511 	  aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18512 #endif
18513 	}
18514       break;
18515     case AARCH64_CMODEL_LARGE:
18516       if (opts->x_flag_pic)
18517 	sorry ("code model %qs with %<-f%s%>", "large",
18518 	       opts->x_flag_pic > 1 ? "PIC" : "pic");
18519       if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18520 	sorry ("code model %qs not supported in ilp32 mode", "large");
18521       break;
18522     case AARCH64_CMODEL_TINY_PIC:
18523     case AARCH64_CMODEL_SMALL_PIC:
18524     case AARCH64_CMODEL_SMALL_SPIC:
18525       gcc_unreachable ();
18526     }
18527 }
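
/* For example, "-mcmodel=small -fpic" selects AARCH64_CMODEL_SMALL_SPIC
   when the assembler supports the small PIC relocations (and
   AARCH64_CMODEL_SMALL_PIC otherwise), while "-mcmodel=large" combined
   with -fpic/-fPIC is rejected with the sorry () above.  */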
18528 
18529 /* Implement TARGET_OPTION_SAVE.  */
18530 
18531 static void
18532 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
18533 		     struct gcc_options */* opts_set */)
18534 {
18535   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
18536   ptr->x_aarch64_branch_protection_string
18537     = opts->x_aarch64_branch_protection_string;
18538 }
18539 
18540 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
18541    using the information saved in PTR.  */
18542 
18543 static void
18544 aarch64_option_restore (struct gcc_options *opts,
18545 			struct gcc_options */* opts_set */,
18546 			struct cl_target_option *ptr)
18547 {
18548   opts->x_explicit_arch = ptr->x_explicit_arch;
18549   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
18550   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
18551   if (opts->x_explicit_tune_core == aarch64_none
18552       && opts->x_explicit_arch != aarch64_no_arch)
18553     selected_tune = &all_cores[selected_arch->ident];
18554   else
18555     selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
18556   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
18557   opts->x_aarch64_branch_protection_string
18558     = ptr->x_aarch64_branch_protection_string;
18559   if (opts->x_aarch64_branch_protection_string)
18560     {
18561       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
18562 					NULL);
18563     }
18564 
18565   aarch64_override_options_internal (opts);
18566 }
18567 
18568 /* Implement TARGET_OPTION_PRINT.  */
18569 
18570 static void
18571 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18572 {
18573   const struct processor *cpu
18574     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
18575   uint64_t isa_flags = ptr->x_aarch64_isa_flags;
18576   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
18577   std::string extension
18578     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
18579 
18580   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18581   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18582 	   arch->name, extension.c_str ());
18583 }
18584 
18585 static GTY(()) tree aarch64_previous_fndecl;
18586 
18587 void
18588 aarch64_reset_previous_fndecl (void)
18589 {
18590   aarch64_previous_fndecl = NULL;
18591 }
18592 
18593 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18594    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18595    make sure optab availability predicates are recomputed when necessary.  */
18596 
18597 void
18598 aarch64_save_restore_target_globals (tree new_tree)
18599 {
18600   if (TREE_TARGET_GLOBALS (new_tree))
18601     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18602   else if (new_tree == target_option_default_node)
18603     restore_target_globals (&default_target_globals);
18604   else
18605     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18606 }
18607 
18608 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
18609    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18610    of the function, if such exists.  This function may be called multiple
18611    times on a single function so use aarch64_previous_fndecl to avoid
18612    setting up identical state.  */
18613 
18614 static void
18615 aarch64_set_current_function (tree fndecl)
18616 {
18617   if (!fndecl || fndecl == aarch64_previous_fndecl)
18618     return;
18619 
18620   tree old_tree = (aarch64_previous_fndecl
18621 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18622 		   : NULL_TREE);
18623 
18624   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18625 
18626   /* If current function has no attributes but the previous one did,
18627      use the default node.  */
18628   if (!new_tree && old_tree)
18629     new_tree = target_option_default_node;
18630 
18631   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
18632      the default have been handled by aarch64_save_restore_target_globals from
18633      aarch64_pragma_target_parse.  */
18634   if (old_tree == new_tree)
18635     return;
18636 
18637   aarch64_previous_fndecl = fndecl;
18638 
18639   /* First set the target options.  */
18640   cl_target_option_restore (&global_options, &global_options_set,
18641 			    TREE_TARGET_OPTION (new_tree));
18642 
18643   aarch64_save_restore_target_globals (new_tree);
18644 }
18645 
18646 /* Enum describing the various ways we can handle attributes.
18647    In many cases we can reuse the generic option handling machinery.  */
18648 
18649 enum aarch64_attr_opt_type
18650 {
18651   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
18652   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
18653   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
18654   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
18655 };
18656 
18657 /* All the information needed to handle a target attribute.
18658    NAME is the name of the attribute.
18659    ATTR_TYPE specifies the type of behavior of the attribute as described
18660    in the definition of enum aarch64_attr_opt_type.
18661    ALLOW_NEG is true if the attribute supports a "no-" form.
18662    HANDLER is the function that takes the attribute string as an argument.
18663    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18664    OPT_NUM is the enum specifying the option that the attribute modifies.
18665    This is needed for attributes that mirror the behavior of a command-line
18666    option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18667    aarch64_attr_enum.  */
18668 
18669 struct aarch64_attribute_info
18670 {
18671   const char *name;
18672   enum aarch64_attr_opt_type attr_type;
18673   bool allow_neg;
18674   bool (*handler) (const char *);
18675   enum opt_code opt_num;
18676 };
18677 
18678 /* Handle the ARCH_STR argument to the arch= target attribute.  */
18679 
18680 static bool
18681 aarch64_handle_attr_arch (const char *str)
18682 {
18683   const struct processor *tmp_arch = NULL;
18684   std::string invalid_extension;
18685   enum aarch64_parse_opt_result parse_res
18686     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
18687 
18688   if (parse_res == AARCH64_PARSE_OK)
18689     {
18690       gcc_assert (tmp_arch);
18691       selected_arch = tmp_arch;
18692       explicit_arch = selected_arch->arch;
18693       return true;
18694     }
18695 
18696   switch (parse_res)
18697     {
18698       case AARCH64_PARSE_MISSING_ARG:
18699 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18700 	break;
18701       case AARCH64_PARSE_INVALID_ARG:
18702 	error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18703 	aarch64_print_hint_for_arch (str);
18704 	break;
18705       case AARCH64_PARSE_INVALID_FEATURE:
18706 	error ("invalid feature modifier %s of value %qs in "
18707 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18708 	aarch64_print_hint_for_extensions (invalid_extension);
18709 	break;
18710       default:
18711 	gcc_unreachable ();
18712     }
18713 
18714   return false;
18715 }
18716 
18717 /* Handle the argument CPU_STR to the cpu= target attribute.  */
18718 
18719 static bool
18720 aarch64_handle_attr_cpu (const char *str)
18721 {
18722   const struct processor *tmp_cpu = NULL;
18723   std::string invalid_extension;
18724   enum aarch64_parse_opt_result parse_res
18725     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
18726 
18727   if (parse_res == AARCH64_PARSE_OK)
18728     {
18729       gcc_assert (tmp_cpu);
18730       selected_tune = tmp_cpu;
18731       explicit_tune_core = selected_tune->ident;
18732 
18733       selected_arch = &all_architectures[tmp_cpu->arch];
18734       explicit_arch = selected_arch->arch;
18735       return true;
18736     }
18737 
18738   switch (parse_res)
18739     {
18740       case AARCH64_PARSE_MISSING_ARG:
18741 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18742 	break;
18743       case AARCH64_PARSE_INVALID_ARG:
18744 	error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18745 	aarch64_print_hint_for_core (str);
18746 	break;
18747       case AARCH64_PARSE_INVALID_FEATURE:
18748 	error ("invalid feature modifier %qs of value %qs in "
18749 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18750 	aarch64_print_hint_for_extensions (invalid_extension);
18751 	break;
18752       default:
18753 	gcc_unreachable ();
18754     }
18755 
18756   return false;
18757 }
18758 
18759 /* Handle the argument STR to the branch-protection= attribute.  */
18760 
18761  static bool
18762  aarch64_handle_attr_branch_protection (const char* str)
18763  {
18764   char *err_str = (char *) xmalloc (strlen (str) + 1);
18765   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
18766 								      &err_str);
18767   bool success = false;
18768   switch (res)
18769     {
18770      case AARCH64_PARSE_MISSING_ARG:
18771        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18772 	      " attribute");
18773        break;
18774      case AARCH64_PARSE_INVALID_ARG:
18775        error ("invalid protection type %qs in %<target(\"branch-protection"
18776 	      "=\")%> pragma or attribute", err_str);
18777        break;
18778      case AARCH64_PARSE_OK:
18779        success = true;
18780       /* Fall through.  */
18781      case AARCH64_PARSE_INVALID_FEATURE:
18782        break;
18783      default:
18784        gcc_unreachable ();
18785     }
18786   free (err_str);
18787   return success;
18788  }
18789 
18790 /* Handle the argument STR to the tune= target attribute.  */
18791 
18792 static bool
18793 aarch64_handle_attr_tune (const char *str)
18794 {
18795   const struct processor *tmp_tune = NULL;
18796   enum aarch64_parse_opt_result parse_res
18797     = aarch64_parse_tune (str, &tmp_tune);
18798 
18799   if (parse_res == AARCH64_PARSE_OK)
18800     {
18801       gcc_assert (tmp_tune);
18802       selected_tune = tmp_tune;
18803       explicit_tune_core = selected_tune->ident;
18804       return true;
18805     }
18806 
18807   switch (parse_res)
18808     {
18809       case AARCH64_PARSE_INVALID_ARG:
18810 	error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18811 	aarch64_print_hint_for_core (str);
18812 	break;
18813       default:
18814 	gcc_unreachable ();
18815     }
18816 
18817   return false;
18818 }
18819 
18820 /* Parse an architecture extensions target attribute string specified in STR.
18821    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
18822    if successful.  Update aarch64_isa_flags to reflect the ISA features
18823    modified.  */
18824 
18825 static bool
18826 aarch64_handle_attr_isa_flags (char *str)
18827 {
18828   enum aarch64_parse_opt_result parse_res;
18829   uint64_t isa_flags = aarch64_isa_flags;
18830 
18831   /* We allow "+nothing" in the beginning to clear out all architectural
18832      features if the user wants to handpick specific features.  */
18833   if (strncmp ("+nothing", str, 8) == 0)
18834     {
18835       isa_flags = 0;
18836       str += 8;
18837     }
18838 
18839   std::string invalid_extension;
18840   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18841 
18842   if (parse_res == AARCH64_PARSE_OK)
18843     {
18844       aarch64_isa_flags = isa_flags;
18845       return true;
18846     }
18847 
18848   switch (parse_res)
18849     {
18850       case AARCH64_PARSE_MISSING_ARG:
18851 	error ("missing value in %<target()%> pragma or attribute");
18852 	break;
18853 
18854       case AARCH64_PARSE_INVALID_FEATURE:
18855 	error ("invalid feature modifier %qs of value %qs in "
18856 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18857 	break;
18858 
18859       default:
18860 	gcc_unreachable ();
18861     }
18862 
18863  return false;
18864 }
18865 
18866 /* The target attributes that we support.  On top of these we also support just
18867    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
18868    handled explicitly in aarch64_process_one_target_attr.  */
18869 
18870 static const struct aarch64_attribute_info aarch64_attributes[] =
18871 {
18872   { "general-regs-only", aarch64_attr_mask, false, NULL,
18873      OPT_mgeneral_regs_only },
18874   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18875      OPT_mfix_cortex_a53_835769 },
18876   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18877      OPT_mfix_cortex_a53_843419 },
18878   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18879   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18880   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18881      OPT_momit_leaf_frame_pointer },
18882   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18883   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18884      OPT_march_ },
18885   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18886   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18887      OPT_mtune_ },
18888   { "branch-protection", aarch64_attr_custom, false,
18889      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18890   { "sign-return-address", aarch64_attr_enum, false, NULL,
18891      OPT_msign_return_address_ },
18892   { "outline-atomics", aarch64_attr_bool, true, NULL,
18893      OPT_moutline_atomics},
18894   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
18895 };
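
/* For illustration, the table above accepts attributes such as
   __attribute__ ((target ("strict-align"))) or
   __attribute__ ((target ("no-omit-leaf-frame-pointer"))) (the "no-" form
   is only accepted where allow_neg is true), and
   __attribute__ ((target ("tune=<cpu>"))) for the entries that take an
   argument.  */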
18896 
18897 /* Parse ARG_STR which contains the definition of one target attribute.
18898    Show appropriate errors if any or return true if the attribute is valid.  */
18899 
18900 static bool
18901 aarch64_process_one_target_attr (char *arg_str)
18902 {
18903   bool invert = false;
18904 
18905   size_t len = strlen (arg_str);
18906 
18907   if (len == 0)
18908     {
18909       error ("malformed %<target()%> pragma or attribute");
18910       return false;
18911     }
18912 
18913   char *str_to_check = (char *) alloca (len + 1);
18914   strcpy (str_to_check, arg_str);
18915 
18916   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18917      It is easier to detect and handle it explicitly here rather than going
18918      through the machinery for the rest of the target attributes in this
18919      function.  */
18920   if (*str_to_check == '+')
18921     return aarch64_handle_attr_isa_flags (str_to_check);
18922 
18923   if (len > 3 && startswith (str_to_check, "no-"))
18924     {
18925       invert = true;
18926       str_to_check += 3;
18927     }
18928   char *arg = strchr (str_to_check, '=');
18929 
18930   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18931      and point ARG to "foo".  */
18932   if (arg)
18933     {
18934       *arg = '\0';
18935       arg++;
18936     }
18937   const struct aarch64_attribute_info *p_attr;
18938   bool found = false;
18939   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18940     {
18941       /* If the names don't match up, or the user has given an argument
18942 	 to an attribute that doesn't accept one, or didn't give an argument
18943 	 to an attribute that expects one, fail to match.  */
18944       if (strcmp (str_to_check, p_attr->name) != 0)
18945 	continue;
18946 
18947       found = true;
18948       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18949 			      || p_attr->attr_type == aarch64_attr_enum;
18950 
18951       if (attr_need_arg_p ^ (arg != NULL))
18952 	{
18953 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18954 	  return false;
18955 	}
18956 
18957       /* If the name matches but the attribute does not allow "no-" versions
18958 	 then we can't match.  */
18959       if (invert && !p_attr->allow_neg)
18960 	{
18961 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18962 	  return false;
18963 	}
18964 
18965       switch (p_attr->attr_type)
18966 	{
18967 	/* Has a custom handler registered.
18968 	   For example, cpu=, arch=, tune=.  */
18969 	  case aarch64_attr_custom:
18970 	    gcc_assert (p_attr->handler);
18971 	    if (!p_attr->handler (arg))
18972 	      return false;
18973 	    break;
18974 
18975 	  /* Either set or unset a boolean option.  */
18976 	  case aarch64_attr_bool:
18977 	    {
18978 	      struct cl_decoded_option decoded;
18979 
18980 	      generate_option (p_attr->opt_num, NULL, !invert,
18981 			       CL_TARGET, &decoded);
18982 	      aarch64_handle_option (&global_options, &global_options_set,
18983 				      &decoded, input_location);
18984 	      break;
18985 	    }
18986 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
18987 	     should know what mask to apply given the option number.  */
18988 	  case aarch64_attr_mask:
18989 	    {
18990 	      struct cl_decoded_option decoded;
18991 	      /* We only need to specify the option number.
18992 		 aarch64_handle_option will know which mask to apply.  */
18993 	      decoded.opt_index = p_attr->opt_num;
18994 	      decoded.value = !invert;
18995 	      aarch64_handle_option (&global_options, &global_options_set,
18996 				      &decoded, input_location);
18997 	      break;
18998 	    }
18999 	  /* Use the option setting machinery to set an option to an enum.  */
19000 	  case aarch64_attr_enum:
19001 	    {
19002 	      gcc_assert (arg);
19003 	      bool valid;
19004 	      int value;
19005 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19006 					      &value, CL_TARGET);
19007 	      if (valid)
19008 		{
19009 		  set_option (&global_options, NULL, p_attr->opt_num, value,
19010 			      NULL, DK_UNSPECIFIED, input_location,
19011 			      global_dc);
19012 		}
19013 	      else
19014 		{
19015 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19016 		}
19017 	      break;
19018 	    }
19019 	  default:
19020 	    gcc_unreachable ();
19021 	}
19022     }
19023 
19024   /* If we reached here we either have found an attribute and validated
19025      it or didn't match any.  If we matched an attribute but its arguments
19026      were malformed we will have returned false already.  */
19027   return found;
19028 }
19029 
19030 /* Count how many times the character C appears in
19031    NULL-terminated string STR.  */
19032 
19033 static unsigned int
19034 num_occurences_in_str (char c, char *str)
19035 {
19036   unsigned int res = 0;
19037   while (*str != '\0')
19038     {
19039       if (*str == c)
19040 	res++;
19041 
19042       str++;
19043     }
19044 
19045   return res;
19046 }
19047 
19048 /* Parse the tree in ARGS that contains the target attribute information
19049    and update the global target options space.  */
19050 
19051 bool
19052 aarch64_process_target_attr (tree args)
19053 {
19054   if (TREE_CODE (args) == TREE_LIST)
19055     {
19056       do
19057 	{
19058 	  tree head = TREE_VALUE (args);
19059 	  if (head)
19060 	    {
19061 	      if (!aarch64_process_target_attr (head))
19062 		return false;
19063 	    }
19064 	  args = TREE_CHAIN (args);
19065 	} while (args);
19066 
19067       return true;
19068     }
19069 
19070   if (TREE_CODE (args) != STRING_CST)
19071     {
19072       error ("attribute %<target%> argument not a string");
19073       return false;
19074     }
19075 
19076   size_t len = strlen (TREE_STRING_POINTER (args));
19077   char *str_to_check = (char *) alloca (len + 1);
19078   strcpy (str_to_check, TREE_STRING_POINTER (args));
19079 
19080   if (len == 0)
19081     {
19082       error ("malformed %<target()%> pragma or attribute");
19083       return false;
19084     }
19085 
19086   /* Used to catch empty strings between commas, i.e.
19087      attribute ((target ("attr1,,attr2"))).  */
19088   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19089 
19090   /* Handle multiple target attributes separated by ','.  */
19091   char *token = strtok_r (str_to_check, ",", &str_to_check);
19092 
19093   unsigned int num_attrs = 0;
19094   while (token)
19095     {
19096       num_attrs++;
19097       if (!aarch64_process_one_target_attr (token))
19098 	{
19099 	  /* Check if token is possibly an arch extension without
19100 	     leading '+'.  */
19101 	  uint64_t isa_temp = 0;
19102 	  auto with_plus = std::string ("+") + token;
19103 	  enum aarch64_parse_opt_result ext_res
19104 	    = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19105 
19106 	  if (ext_res == AARCH64_PARSE_OK)
19107 	    error ("arch extension %<%s%> should be prefixed by %<+%>",
19108 		   token);
19109 	  else
19110 	    error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19111 	  return false;
19112 	}
19113 
19114       token = strtok_r (NULL, ",", &str_to_check);
19115     }
19116 
19117   if (num_attrs != num_commas + 1)
19118     {
19119       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19120       return false;
19121     }
19122 
19123   return true;
19124 }
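
/* For illustration: a string such as "arch=<name>,no-strict-align" is
   split on ',' and each piece handled by aarch64_process_one_target_attr;
   an empty piece as in "attr1,,attr2" is rejected via the comma count,
   and a bare extension name such as "crc" (rather than "+crc") triggers
   the "should be prefixed by '+'" error above.  */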
19125 
19126 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
19127    process attribute ((target ("..."))).  */
19128 
19129 static bool
19130 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19131 {
19132   struct cl_target_option cur_target;
19133   bool ret;
19134   tree old_optimize;
19135   tree new_target, new_optimize;
19136   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19137 
19138   /* If what we're processing is the current pragma string then the
19139      target option node is already stored in target_option_current_node
19140      by aarch64_pragma_target_parse in aarch64-c.cc.  Use that to avoid
19141      having to re-parse the string.  This is especially useful to keep
19142      arm_neon.h compile times down since that header contains a lot
19143      of intrinsics enclosed in pragmas.  */
19144   if (!existing_target && args == current_target_pragma)
19145     {
19146       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19147       return true;
19148     }
19149   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19150 
19151   old_optimize
19152     = build_optimization_node (&global_options, &global_options_set);
19153   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19154 
19155   /* If the function changed the optimization levels as well as setting
19156      target options, start with the optimizations specified.  */
19157   if (func_optimize && func_optimize != old_optimize)
19158     cl_optimization_restore (&global_options, &global_options_set,
19159 			     TREE_OPTIMIZATION (func_optimize));
19160 
19161   /* Save the current target options to restore at the end.  */
19162   cl_target_option_save (&cur_target, &global_options, &global_options_set);
19163 
19164   /* If fndecl already has some target attributes applied to it, unpack
19165      them so that we add this attribute on top of them, rather than
19166      overwriting them.  */
19167   if (existing_target)
19168     {
19169       struct cl_target_option *existing_options
19170 	= TREE_TARGET_OPTION (existing_target);
19171 
19172       if (existing_options)
19173 	cl_target_option_restore (&global_options, &global_options_set,
19174 				  existing_options);
19175     }
19176   else
19177     cl_target_option_restore (&global_options, &global_options_set,
19178 			      TREE_TARGET_OPTION (target_option_current_node));
19179 
19180   ret = aarch64_process_target_attr (args);
19181 
19182   /* Set up any additional state.  */
19183   if (ret)
19184     {
19185       aarch64_override_options_internal (&global_options);
19186       /* Initialize SIMD builtins if we haven't already.
19187 	 Set current_target_pragma to NULL for the duration so that
19188 	 the builtin initialization code doesn't try to tag the functions
19189 	 being built with the attributes specified by any current pragma, thus
19190 	 going into an infinite recursion.  */
19191       if (TARGET_SIMD)
19192 	{
19193 	  tree saved_current_target_pragma = current_target_pragma;
19194 	  current_target_pragma = NULL;
19195 	  aarch64_init_simd_builtins ();
19196 	  current_target_pragma = saved_current_target_pragma;
19197 	}
19198       new_target = build_target_option_node (&global_options,
19199 					     &global_options_set);
19200     }
19201   else
19202     new_target = NULL;
19203 
19204   new_optimize = build_optimization_node (&global_options,
19205 					  &global_options_set);
19206 
19207   if (fndecl && ret)
19208     {
19209       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19210 
19211       if (old_optimize != new_optimize)
19212 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19213     }
19214 
19215   cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19216 
19217   if (old_optimize != new_optimize)
19218     cl_optimization_restore (&global_options, &global_options_set,
19219 			     TREE_OPTIMIZATION (old_optimize));
19220   return ret;
19221 }
19222 
19223 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
19224    tri-bool options (yes, no, don't care) and the default value is
19225    DEF, determine whether to reject inlining.  */
19226 
19227 static bool
19228 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19229 				     int dont_care, int def)
19230 {
19231   /* If the callee doesn't care, always allow inlining.  */
19232   if (callee == dont_care)
19233     return true;
19234 
19235   /* If the caller doesn't care, always allow inlining.  */
19236   if (caller == dont_care)
19237     return true;
19238 
19239   /* Otherwise, allow inlining if either the callee and caller values
19240      agree, or if the callee is using the default value.  */
19241   return (callee == caller || callee == def);
19242 }
19243 
19244 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
19245    to inline CALLEE into CALLER based on target-specific info.
19246    Make sure that the caller and callee have compatible architectural
19247    features.  Then go through the other possible target attributes
19248    and see if they can block inlining.  Try not to reject always_inline
19249    callees unless they are incompatible architecturally.  */
19250 
19251 static bool
19252 aarch64_can_inline_p (tree caller, tree callee)
19253 {
19254   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19255   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19256 
19257   struct cl_target_option *caller_opts
19258 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
19259 					   : target_option_default_node);
19260 
19261   struct cl_target_option *callee_opts
19262 	= TREE_TARGET_OPTION (callee_tree ? callee_tree
19263 					   : target_option_default_node);
19264 
19265   /* Callee's ISA flags should be a subset of the caller's.  */
19266   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19267        != callee_opts->x_aarch64_isa_flags)
19268     return false;
19269 
19270   /* Allow non-strict aligned functions inlining into strict
19271      aligned ones.  */
19272   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19273        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19274       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19275 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19276     return false;
19277 
19278   bool always_inline = lookup_attribute ("always_inline",
19279 					  DECL_ATTRIBUTES (callee));
19280 
19281   /* If the architectural features match up and the callee is always_inline
19282      then the other attributes don't matter.  */
19283   if (always_inline)
19284     return true;
19285 
19286   if (caller_opts->x_aarch64_cmodel_var
19287       != callee_opts->x_aarch64_cmodel_var)
19288     return false;
19289 
19290   if (caller_opts->x_aarch64_tls_dialect
19291       != callee_opts->x_aarch64_tls_dialect)
19292     return false;
19293 
19294   /* Honour explicit requests to workaround errata.  */
19295   if (!aarch64_tribools_ok_for_inlining_p (
19296 	  caller_opts->x_aarch64_fix_a53_err835769,
19297 	  callee_opts->x_aarch64_fix_a53_err835769,
19298 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19299     return false;
19300 
19301   if (!aarch64_tribools_ok_for_inlining_p (
19302 	  caller_opts->x_aarch64_fix_a53_err843419,
19303 	  callee_opts->x_aarch64_fix_a53_err843419,
19304 	  2, TARGET_FIX_ERR_A53_843419))
19305     return false;
19306 
19307   /* If the user explicitly specified -momit-leaf-frame-pointer for the
19308      caller and callee and they don't match up, reject inlining.  */
19309   if (!aarch64_tribools_ok_for_inlining_p (
19310 	  caller_opts->x_flag_omit_leaf_frame_pointer,
19311 	  callee_opts->x_flag_omit_leaf_frame_pointer,
19312 	  2, 1))
19313     return false;
19314 
19315   /* If the callee has specific tuning overrides, respect them.  */
19316   if (callee_opts->x_aarch64_override_tune_string != NULL
19317       && caller_opts->x_aarch64_override_tune_string == NULL)
19318     return false;
19319 
19320   /* If the user specified tuning override strings for the
19321      caller and callee and they don't match up, reject inlining.
19322      We just do a string compare here, we don't analyze the meaning
19323      of the string, as it would be too costly for little gain.  */
19324   if (callee_opts->x_aarch64_override_tune_string
19325       && caller_opts->x_aarch64_override_tune_string
19326       && (strcmp (callee_opts->x_aarch64_override_tune_string,
19327 		  caller_opts->x_aarch64_override_tune_string) != 0))
19328     return false;
19329 
19330   return true;
19331 }
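
/* To illustrate the ISA-subset rule above: a callee declared with, say,
   __attribute__ ((target ("+crc"))) can only be inlined into callers that
   also have the CRC bit set, whereas a callee using the plain default
   flags may be inlined into a "+crc" caller, as far as the ISA check is
   concerned.  */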
19332 
19333 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
19334    been already.  */
19335 
19336 unsigned int
19337 aarch64_tlsdesc_abi_id ()
19338 {
19339   predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19340   if (!tlsdesc_abi.initialized_p ())
19341     {
19342       HARD_REG_SET full_reg_clobbers;
19343       CLEAR_HARD_REG_SET (full_reg_clobbers);
19344       SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19345       SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19346       for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19347 	SET_HARD_REG_BIT (full_reg_clobbers, regno);
19348       tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19349     }
19350   return tlsdesc_abi.id ();
19351 }
19352 
19353 /* Return true if SYMBOL_REF X binds locally.  */
19354 
19355 static bool
19356 aarch64_symbol_binds_local_p (const_rtx x)
19357 {
19358   return (SYMBOL_REF_DECL (x)
19359 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19360 	  : SYMBOL_REF_LOCAL_P (x));
19361 }
19362 
19363 /* Return true if SYMBOL_REF X is thread local.  */
19364 static bool
19365 aarch64_tls_symbol_p (rtx x)
19366 {
19367   if (! TARGET_HAVE_TLS)
19368     return false;
19369 
19370   x = strip_salt (x);
19371   if (!SYMBOL_REF_P (x))
19372     return false;
19373 
19374   return SYMBOL_REF_TLS_MODEL (x) != 0;
19375 }
19376 
19377 /* Classify a TLS symbol into one of the TLS kinds.  */
19378 enum aarch64_symbol_type
19379 aarch64_classify_tls_symbol (rtx x)
19380 {
19381   enum tls_model tls_kind = tls_symbolic_operand_type (x);
19382 
19383   switch (tls_kind)
19384     {
19385     case TLS_MODEL_GLOBAL_DYNAMIC:
19386     case TLS_MODEL_LOCAL_DYNAMIC:
19387       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19388 
19389     case TLS_MODEL_INITIAL_EXEC:
19390       switch (aarch64_cmodel)
19391 	{
19392 	case AARCH64_CMODEL_TINY:
19393 	case AARCH64_CMODEL_TINY_PIC:
19394 	  return SYMBOL_TINY_TLSIE;
19395 	default:
19396 	  return SYMBOL_SMALL_TLSIE;
19397 	}
19398 
19399     case TLS_MODEL_LOCAL_EXEC:
19400       if (aarch64_tls_size == 12)
19401 	return SYMBOL_TLSLE12;
19402       else if (aarch64_tls_size == 24)
19403 	return SYMBOL_TLSLE24;
19404       else if (aarch64_tls_size == 32)
19405 	return SYMBOL_TLSLE32;
19406       else if (aarch64_tls_size == 48)
19407 	return SYMBOL_TLSLE48;
19408       else
19409 	gcc_unreachable ();
19410 
19411     case TLS_MODEL_EMULATED:
19412     case TLS_MODEL_NONE:
19413       return SYMBOL_FORCE_TO_MEM;
19414 
19415     default:
19416       gcc_unreachable ();
19417     }
19418 }
19419 
19420 /* Return the correct method for accessing X + OFFSET, where X is either
19421    a SYMBOL_REF or LABEL_REF.  */
19422 
19423 enum aarch64_symbol_type
19424 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19425 {
19426   x = strip_salt (x);
19427 
19428   if (LABEL_REF_P (x))
19429     {
19430       switch (aarch64_cmodel)
19431 	{
19432 	case AARCH64_CMODEL_LARGE:
19433 	  return SYMBOL_FORCE_TO_MEM;
19434 
19435 	case AARCH64_CMODEL_TINY_PIC:
19436 	case AARCH64_CMODEL_TINY:
19437 	  return SYMBOL_TINY_ABSOLUTE;
19438 
19439 	case AARCH64_CMODEL_SMALL_SPIC:
19440 	case AARCH64_CMODEL_SMALL_PIC:
19441 	case AARCH64_CMODEL_SMALL:
19442 	  return SYMBOL_SMALL_ABSOLUTE;
19443 
19444 	default:
19445 	  gcc_unreachable ();
19446 	}
19447     }
19448 
19449   if (SYMBOL_REF_P (x))
19450     {
19451       if (aarch64_tls_symbol_p (x))
19452 	return aarch64_classify_tls_symbol (x);
19453 
19454       switch (aarch64_cmodel)
19455 	{
19456 	case AARCH64_CMODEL_TINY_PIC:
19457 	case AARCH64_CMODEL_TINY:
19458 	  /* With -fPIC non-local symbols use the GOT.  For orthogonality
19459 	     always use the GOT for extern weak symbols.  */
19460 	  if ((flag_pic || SYMBOL_REF_WEAK (x))
19461 	      && !aarch64_symbol_binds_local_p (x))
19462 	    return SYMBOL_TINY_GOT;
19463 
19464 	  /* When we retrieve symbol + offset address, we have to make sure
19465 	     the offset does not cause overflow of the final address.  But
19466 	     we have no way of knowing the address of symbol at compile time
19467 	     so we can't accurately say if the distance between the PC and
19468 	     symbol + offset is outside the addressable range of +/-1MB in the
19469 	     TINY code model.  So we limit the maximum offset to +/-64KB and
19470 	     assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19471 	     If offset_within_block_p is true we allow larger offsets.  */
19472 	  if (!(IN_RANGE (offset, -0x10000, 0x10000)
19473 		|| offset_within_block_p (x, offset)))
19474 	    return SYMBOL_FORCE_TO_MEM;
19475 
19476 	  return SYMBOL_TINY_ABSOLUTE;
19477 
19478 
19479 	case AARCH64_CMODEL_SMALL_SPIC:
19480 	case AARCH64_CMODEL_SMALL_PIC:
19481 	case AARCH64_CMODEL_SMALL:
19482 	  if ((flag_pic || SYMBOL_REF_WEAK (x))
19483 	      && !aarch64_symbol_binds_local_p (x))
19484 	    return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19485 		    ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19486 
19487 	  /* Same reasoning as the tiny code model, but the offset cap here is
19488 	     1MB, allowing +/-3.9GB for the offset to the symbol.  */
19489 	  if (!(IN_RANGE (offset, -0x100000, 0x100000)
19490 		|| offset_within_block_p (x, offset)))
19491 	    return SYMBOL_FORCE_TO_MEM;
19492 
19493 	  return SYMBOL_SMALL_ABSOLUTE;
19494 
19495 	case AARCH64_CMODEL_LARGE:
19496 	  /* This is alright even in PIC code as the constant
19497 	     pool reference is always PC relative and within
19498 	     the same translation unit.  */
19499 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19500 	    return SYMBOL_SMALL_ABSOLUTE;
19501 	  else
19502 	    return SYMBOL_FORCE_TO_MEM;
19503 
19504 	default:
19505 	  gcc_unreachable ();
19506 	}
19507     }
19508 
19509   /* By default push everything into the constant pool.  */
19510   return SYMBOL_FORCE_TO_MEM;
19511 }
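
/* Worked example for the small code model (assuming the symbol binds
   locally and so does not go through the GOT): sym + 0x80000 (512KB) is
   within the +/-1MB cap and is classified SYMBOL_SMALL_ABSOLUTE, whereas
   sym + 0x200000 (2MB) becomes SYMBOL_FORCE_TO_MEM unless
   offset_within_block_p shows the address still falls inside sym's own
   block.  */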
19512 
19513 bool
19514 aarch64_constant_address_p (rtx x)
19515 {
19516   return (CONSTANT_P (x) && memory_address_p (DImode, x));
19517 }
19518 
19519 bool
19520 aarch64_legitimate_pic_operand_p (rtx x)
19521 {
19522   poly_int64 offset;
19523   x = strip_offset_and_salt (x, &offset);
19524   if (SYMBOL_REF_P (x))
19525     return false;
19526 
19527   return true;
19528 }
19529 
19530 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
19531    that should be rematerialized rather than spilled.  */
19532 
19533 static bool
19534 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19535 {
19536   /* Support CSE and rematerialization of common constants.  */
19537   if (CONST_INT_P (x)
19538       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
19539     return true;
19540 
19541   /* Only accept variable-length vector constants if they can be
19542      handled directly.
19543 
19544      ??? It would be possible (but complex) to handle rematerialization
19545      of other constants via secondary reloads.  */
19546   if (!GET_MODE_SIZE (mode).is_constant ())
19547     return aarch64_simd_valid_immediate (x, NULL);
19548 
19549   /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19550      least be forced to memory and loaded from there.  */
19551   if (CONST_VECTOR_P (x))
19552     return !targetm.cannot_force_const_mem (mode, x);
19553 
19554   /* Do not allow vector struct mode constants for Advanced SIMD.
19555      We could support 0 and -1 easily, but they need support in
19556      aarch64-simd.md.  */
19557   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19558   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19559     return false;
19560 
19561   if (GET_CODE (x) == HIGH)
19562     x = XEXP (x, 0);
19563 
19564   /* Accept polynomial constants that can be calculated by using the
19565      destination of a move as the sole temporary.  Constants that
19566      require a second temporary cannot be rematerialized (they can't be
19567      forced to memory and also aren't legitimate constants).  */
19568   poly_int64 offset;
19569   if (poly_int_rtx_p (x, &offset))
19570     return aarch64_offset_temporaries (false, offset) <= 1;
19571 
19572   /* If an offset is being added to something else, we need to allow the
19573      base to be moved into the destination register, meaning that there
19574      are no free temporaries for the offset.  */
19575   x = strip_offset_and_salt (x, &offset);
19576   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19577     return false;
19578 
19579   /* Do not allow const (plus (anchor_symbol, const_int)).  */
19580   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19581     return false;
19582 
19583   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
19584      so spilling them is better than rematerialization.  */
19585   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19586     return true;
19587 
19588   /* Label references are always constant.  */
19589   if (LABEL_REF_P (x))
19590     return true;
19591 
19592   return false;
19593 }
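
/* For example: integer constants and floating-point CONST_DOUBLEs are
   rematerialized rather than spilled, non-TLS SYMBOL_REFs and LABEL_REFs
   are treated as constants, but TLS SYMBOL_REFs and Advanced SIMD
   structure-mode vector constants are rejected here.  */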
19594 
19595 rtx
19596 aarch64_load_tp (rtx target)
19597 {
19598   if (!target
19599       || GET_MODE (target) != Pmode
19600       || !register_operand (target, Pmode))
19601     target = gen_reg_rtx (Pmode);
19602 
19603   /* Can return in any reg.  */
19604   emit_insn (gen_aarch64_load_tp_hard (target));
19605   return target;
19606 }
19607 
19608 /* On AAPCS systems, this is the "struct __va_list".  */
19609 static GTY(()) tree va_list_type;
19610 
19611 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19612    Return the type to use as __builtin_va_list.
19613 
19614    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19615 
19616    struct __va_list
19617    {
19618      void *__stack;
19619      void *__gr_top;
19620      void *__vr_top;
19621      int   __gr_offs;
19622      int   __vr_offs;
19623    };  */
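
/* Illustrative sketch (not part of the AAPCS64 text): for a callee such as
   int f (int n, ...), compiled with the default parameters and TARGET_FLOAT,
   va_start leaves the fields roughly as follows:

     __stack   -> first anonymous argument passed on the stack
     __gr_top  -> one past the end of the saved X1-X7 block
     __vr_top  -> one past the end of the saved Q0-Q7 block
     __gr_offs = -(8 - 1) * 8  = -56   (7 unused GP argument registers)
     __vr_offs = -8 * 16       = -128  (8 unused FP/SIMD argument registers)

   va_arg then walks the two offsets towards zero and falls back to
   __stack once they become non-negative.  */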
19624 
19625 static tree
19626 aarch64_build_builtin_va_list (void)
19627 {
19628   tree va_list_name;
19629   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19630 
19631   /* Create the type.  */
19632   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19633   /* Give it the required name.  */
19634   va_list_name = build_decl (BUILTINS_LOCATION,
19635 			     TYPE_DECL,
19636 			     get_identifier ("__va_list"),
19637 			     va_list_type);
19638   DECL_ARTIFICIAL (va_list_name) = 1;
19639   TYPE_NAME (va_list_type) = va_list_name;
19640   TYPE_STUB_DECL (va_list_type) = va_list_name;
19641 
19642   /* Create the fields.  */
19643   f_stack = build_decl (BUILTINS_LOCATION,
19644 			FIELD_DECL, get_identifier ("__stack"),
19645 			ptr_type_node);
19646   f_grtop = build_decl (BUILTINS_LOCATION,
19647 			FIELD_DECL, get_identifier ("__gr_top"),
19648 			ptr_type_node);
19649   f_vrtop = build_decl (BUILTINS_LOCATION,
19650 			FIELD_DECL, get_identifier ("__vr_top"),
19651 			ptr_type_node);
19652   f_groff = build_decl (BUILTINS_LOCATION,
19653 			FIELD_DECL, get_identifier ("__gr_offs"),
19654 			integer_type_node);
19655   f_vroff = build_decl (BUILTINS_LOCATION,
19656 			FIELD_DECL, get_identifier ("__vr_offs"),
19657 			integer_type_node);
19658 
19659   /* Tell tree-stdarg pass about our internal offset fields.
19660      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19661      purposes, to identify whether the code is updating the va_list internal
19662      offset fields in an irregular way.  */
19663   va_list_gpr_counter_field = f_groff;
19664   va_list_fpr_counter_field = f_vroff;
19665 
19666   DECL_ARTIFICIAL (f_stack) = 1;
19667   DECL_ARTIFICIAL (f_grtop) = 1;
19668   DECL_ARTIFICIAL (f_vrtop) = 1;
19669   DECL_ARTIFICIAL (f_groff) = 1;
19670   DECL_ARTIFICIAL (f_vroff) = 1;
19671 
19672   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19673   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19674   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19675   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19676   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19677 
19678   TYPE_FIELDS (va_list_type) = f_stack;
19679   DECL_CHAIN (f_stack) = f_grtop;
19680   DECL_CHAIN (f_grtop) = f_vrtop;
19681   DECL_CHAIN (f_vrtop) = f_groff;
19682   DECL_CHAIN (f_groff) = f_vroff;
19683 
19684   /* Compute its layout.  */
19685   layout_type (va_list_type);
19686 
19687   return va_list_type;
19688 }
19689 
19690 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
19691 static void
19692 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19693 {
19694   const CUMULATIVE_ARGS *cum;
19695   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19696   tree stack, grtop, vrtop, groff, vroff;
19697   tree t;
19698   int gr_save_area_size = cfun->va_list_gpr_size;
19699   int vr_save_area_size = cfun->va_list_fpr_size;
19700   int vr_offset;
19701 
19702   cum = &crtl->args.info;
19703   if (cfun->va_list_gpr_size)
19704     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19705 			     cfun->va_list_gpr_size);
19706   if (cfun->va_list_fpr_size)
19707     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19708 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
19709 
19710   if (!TARGET_FLOAT)
19711     {
19712       gcc_assert (cum->aapcs_nvrn == 0);
19713       vr_save_area_size = 0;
19714     }
19715 
19716   f_stack = TYPE_FIELDS (va_list_type_node);
19717   f_grtop = DECL_CHAIN (f_stack);
19718   f_vrtop = DECL_CHAIN (f_grtop);
19719   f_groff = DECL_CHAIN (f_vrtop);
19720   f_vroff = DECL_CHAIN (f_groff);
19721 
19722   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19723 		  NULL_TREE);
19724   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19725 		  NULL_TREE);
19726   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19727 		  NULL_TREE);
19728   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19729 		  NULL_TREE);
19730   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19731 		  NULL_TREE);
19732 
19733   /* Emit code to initialize STACK, which points to the next varargs stack
19734      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
19735      by named arguments.  STACK is 8-byte aligned.  */
19736   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19737   if (cum->aapcs_stack_size > 0)
19738     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19739   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19740   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19741 
19742   /* Emit code to initialize GRTOP, the top of the GR save area.
19743      virtual_incoming_args_rtx should have been 16 byte aligned.  */
19744   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19745   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19746   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19747 
19748   /* Emit code to initialize VRTOP, the top of the VR save area.
19749      This address is gr_save_area_bytes below GRTOP, rounded
19750      down to the next 16-byte boundary.  */
19751   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19752   vr_offset = ROUND_UP (gr_save_area_size,
19753 			STACK_BOUNDARY / BITS_PER_UNIT);
19754 
19755   if (vr_offset)
19756     t = fold_build_pointer_plus_hwi (t, -vr_offset);
19757   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19758   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19759 
19760   /* Emit code to initialize GROFF, the offset from GRTOP of the
19761      next GPR argument.  */
19762   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19763 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19764   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19765 
19766   /* Likewise emit code to initialize VROFF, the offset from FTOP
19767      of the next VR argument.  */
19768   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19769 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19770   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19771 }
19772 
19773 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
19774 
19775 static tree
19776 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19777 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
19778 {
19779   tree addr;
19780   bool indirect_p;
19781   bool is_ha;		/* is HFA or HVA.  */
19782   bool dw_align;	/* double-word align.  */
19783   machine_mode ag_mode = VOIDmode;
19784   int nregs;
19785   machine_mode mode;
19786 
19787   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19788   tree stack, f_top, f_off, off, arg, roundup, on_stack;
19789   HOST_WIDE_INT size, rsize, adjust, align;
19790   tree t, u, cond1, cond2;
19791 
19792   indirect_p = pass_va_arg_by_reference (type);
19793   if (indirect_p)
19794     type = build_pointer_type (type);
19795 
19796   mode = TYPE_MODE (type);
19797 
19798   f_stack = TYPE_FIELDS (va_list_type_node);
19799   f_grtop = DECL_CHAIN (f_stack);
19800   f_vrtop = DECL_CHAIN (f_grtop);
19801   f_groff = DECL_CHAIN (f_vrtop);
19802   f_vroff = DECL_CHAIN (f_groff);
19803 
19804   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19805 		  f_stack, NULL_TREE);
19806   size = int_size_in_bytes (type);
19807 
19808   unsigned int abi_break;
19809   align
19810     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
19811 
19812   dw_align = false;
19813   adjust = 0;
19814   if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19815 					       &is_ha, false))
19816     {
19817       /* No frontends can create types with variable-sized modes, so we
19818 	 shouldn't be asked to pass or return them.  */
19819       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19820 
19821       /* TYPE passed in fp/simd registers.  */
19822       if (!TARGET_FLOAT)
19823 	aarch64_err_no_fpadvsimd (mode);
19824 
19825       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19826 		      unshare_expr (valist), f_vrtop, NULL_TREE);
19827       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19828 		      unshare_expr (valist), f_vroff, NULL_TREE);
19829 
19830       rsize = nregs * UNITS_PER_VREG;
19831 
19832       if (is_ha)
19833 	{
19834 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19835 	    adjust = UNITS_PER_VREG - ag_size;
19836 	}
19837       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19838 	       && size < UNITS_PER_VREG)
19839 	{
19840 	  adjust = UNITS_PER_VREG - size;
19841 	}
19842     }
19843   else
19844     {
19845       /* TYPE passed in general registers.  */
19846       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19847 		      unshare_expr (valist), f_grtop, NULL_TREE);
19848       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19849 		      unshare_expr (valist), f_groff, NULL_TREE);
19850       rsize = ROUND_UP (size, UNITS_PER_WORD);
19851       nregs = rsize / UNITS_PER_WORD;
19852 
19853       if (align > 8)
19854 	{
19855 	  if (abi_break && warn_psabi)
19856 	    inform (input_location, "parameter passing for argument of type "
19857 		    "%qT changed in GCC 9.1", type);
19858 	  dw_align = true;
19859 	}
19860 
19861       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19862 	  && size < UNITS_PER_WORD)
19863 	{
19864 	  adjust = UNITS_PER_WORD - size;
19865 	}
19866     }
19867 
19868   /* Get a local temporary for the field value.  */
19869   off = get_initialized_tmp_var (f_off, pre_p, NULL);
19870 
19871   /* Emit code to branch if off >= 0.  */
19872   t = build2 (GE_EXPR, boolean_type_node, off,
19873 	      build_int_cst (TREE_TYPE (off), 0));
19874   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19875 
19876   if (dw_align)
19877     {
19878       /* Emit: offs = (offs + 15) & -16.  */
19879       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19880 		  build_int_cst (TREE_TYPE (off), 15));
19881       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19882 		  build_int_cst (TREE_TYPE (off), -16));
19883       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19884     }
19885   else
19886     roundup = NULL;
19887 
19888   /* Update ap.__[g|v]r_offs  */
19889   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19890 	      build_int_cst (TREE_TYPE (off), rsize));
19891   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19892 
19893   /* String up.  */
19894   if (roundup)
19895     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19896 
19897   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
19898   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19899 	      build_int_cst (TREE_TYPE (f_off), 0));
19900   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19901 
19902   /* String up: make sure the assignment happens before the use.  */
19903   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19904   COND_EXPR_ELSE (cond1) = t;
19905 
19906   /* Prepare the trees handling the argument that is passed on the stack;
19907      the top-level node will be stored in ON_STACK.  */
19908   arg = get_initialized_tmp_var (stack, pre_p, NULL);
19909   if (align > 8)
19910     {
19911       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
19912       t = fold_build_pointer_plus_hwi (arg, 15);
19913       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19914 		  build_int_cst (TREE_TYPE (t), -16));
19915       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19916     }
19917   else
19918     roundup = NULL;
19919   /* Advance ap.__stack  */
19920   t = fold_build_pointer_plus_hwi (arg, size + 7);
19921   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19922 	      build_int_cst (TREE_TYPE (t), -8));
19923   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19924   /* String up roundup and advance.  */
19925   if (roundup)
19926     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19927   /* String up with arg */
19928   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19929   /* Big-endianness related address adjustment.  */
19930   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19931       && size < UNITS_PER_WORD)
19932   {
19933     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19934 		size_int (UNITS_PER_WORD - size));
19935     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19936   }
19937 
19938   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19939   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19940 
19941   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
19942   t = off;
19943   if (adjust)
19944     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19945 		build_int_cst (TREE_TYPE (off), adjust));
19946 
19947   t = fold_convert (sizetype, t);
19948   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19949 
19950   if (is_ha)
19951     {
19952       /* type ha; // treat as "struct {ftype field[n];}"
19953          ... [computing offs]
19954          for (i = 0; i <nregs; ++i, offs += 16)
19955 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19956 	 return ha;  */
19957       int i;
19958       tree tmp_ha, field_t, field_ptr_t;
19959 
19960       /* Declare a local variable.  */
19961       tmp_ha = create_tmp_var_raw (type, "ha");
19962       gimple_add_tmp_var (tmp_ha);
19963 
19964       /* Establish the base type.  */
19965       switch (ag_mode)
19966 	{
19967 	case E_SFmode:
19968 	  field_t = float_type_node;
19969 	  field_ptr_t = float_ptr_type_node;
19970 	  break;
19971 	case E_DFmode:
19972 	  field_t = double_type_node;
19973 	  field_ptr_t = double_ptr_type_node;
19974 	  break;
19975 	case E_TFmode:
19976 	  field_t = long_double_type_node;
19977 	  field_ptr_t = long_double_ptr_type_node;
19978 	  break;
19979 	case E_HFmode:
19980 	  field_t = aarch64_fp16_type_node;
19981 	  field_ptr_t = aarch64_fp16_ptr_type_node;
19982 	  break;
19983 	case E_BFmode:
19984 	  field_t = aarch64_bf16_type_node;
19985 	  field_ptr_t = aarch64_bf16_ptr_type_node;
19986 	  break;
19987 	case E_V2SImode:
19988 	case E_V4SImode:
19989 	    {
19990 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19991 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
19992 	      field_ptr_t = build_pointer_type (field_t);
19993 	    }
19994 	  break;
19995 	default:
19996 	  gcc_assert (0);
19997 	}
19998 
19999       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
20000       TREE_ADDRESSABLE (tmp_ha) = 1;
20001       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
20002       addr = t;
20003       t = fold_convert (field_ptr_t, addr);
20004       t = build2 (MODIFY_EXPR, field_t,
20005 		  build1 (INDIRECT_REF, field_t, tmp_ha),
20006 		  build1 (INDIRECT_REF, field_t, t));
20007 
20008       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
20009       for (i = 1; i < nregs; ++i)
20010 	{
20011 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
20012 	  u = fold_convert (field_ptr_t, addr);
20013 	  u = build2 (MODIFY_EXPR, field_t,
20014 		      build2 (MEM_REF, field_t, tmp_ha,
20015 			      build_int_cst (field_ptr_t,
20016 					     (i *
20017 					      int_size_in_bytes (field_t)))),
20018 		      build1 (INDIRECT_REF, field_t, u));
20019 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
20020 	}
20021 
20022       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
20023       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
20024     }
20025 
20026   COND_EXPR_ELSE (cond2) = t;
20027   addr = fold_convert (build_pointer_type (type), cond1);
20028   addr = build_va_arg_indirect_ref (addr);
20029 
20030   if (indirect_p)
20031     addr = build_va_arg_indirect_ref (addr);
20032 
20033   return addr;
20034 }
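
/* Rough shape of the lowering above, written as C-like pseudocode for a
   non-composite integer argument (names are illustrative only):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                       // register area exhausted
     ap.__gr_offs = off + rsize;            // claim the register slot(s)
     if (ap.__gr_offs > 0)
       goto on_stack;                       // argument straddles the end
     addr = ap.__gr_top + off;              // plus big-endian adjustment
     goto done;
   on_stack:
     addr = ap.__stack;                     // realigned to 16 bytes first
     ap.__stack = (addr + size + 7) & -8;   //   when alignof (type) > 8
   done:
     result = *(TYPE *) addr;

   The HFA/HVA path is analogous but uses __vr_top/__vr_offs and copies each
   element out of the VR save area into a local temporary.  */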
20035 
20036 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
20037 
20038 static void
20039 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
20040 				const function_arg_info &arg,
20041 				int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
20042 {
20043   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
20044   CUMULATIVE_ARGS local_cum;
20045   int gr_saved = cfun->va_list_gpr_size;
20046   int vr_saved = cfun->va_list_fpr_size;
20047 
20048   /* The caller has advanced CUM up to, but not beyond, the last named
20049      argument.  Advance a local copy of CUM past the last "real" named
20050      argument, to find out how many registers are left over.  */
20051   local_cum = *cum;
20052   aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
20053 
20054   /* Find out how many registers we need to save.
20055      Honor tree-stdarg analysis results.  */
20056   if (cfun->va_list_gpr_size)
20057     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
20058 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
20059   if (cfun->va_list_fpr_size)
20060     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20061 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
20062 
20063   if (!TARGET_FLOAT)
20064     {
20065       gcc_assert (local_cum.aapcs_nvrn == 0);
20066       vr_saved = 0;
20067     }
20068 
20069   if (!no_rtl)
20070     {
20071       if (gr_saved > 0)
20072 	{
20073 	  rtx ptr, mem;
20074 
20075 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
20076 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20077 			       - gr_saved * UNITS_PER_WORD);
20078 	  mem = gen_frame_mem (BLKmode, ptr);
20079 	  set_mem_alias_set (mem, get_varargs_alias_set ());
20080 
20081 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20082 			       mem, gr_saved);
20083 	}
20084       if (vr_saved > 0)
20085 	{
20086 	  /* We can't use move_block_from_reg, because it will use
20087 	     the wrong mode, storing D regs only.  */
20088 	  machine_mode mode = TImode;
20089 	  int off, i, vr_start;
20090 
20091 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
20092 	     the first vector register.  The VR save area lies below
20093 	     the GR one, and is aligned to 16 bytes.  */
20094 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20095 			   STACK_BOUNDARY / BITS_PER_UNIT);
20096 	  off -= vr_saved * UNITS_PER_VREG;
20097 
20098 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20099 	  for (i = 0; i < vr_saved; ++i)
20100 	    {
20101 	      rtx ptr, mem;
20102 
20103 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20104 	      mem = gen_frame_mem (mode, ptr);
20105 	      set_mem_alias_set (mem, get_varargs_alias_set ());
20106 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
20107 	      off += UNITS_PER_VREG;
20108 	    }
20109 	}
20110     }
20111 
20112   /* We don't save the size into *PRETEND_SIZE because we want to avoid
20113      any complication of having crtl->args.pretend_args_size changed.  */
20114   cfun->machine->frame.saved_varargs_size
20115     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20116 		 STACK_BOUNDARY / BITS_PER_UNIT)
20117        + vr_saved * UNITS_PER_VREG);
20118 }
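
/* Worked example (illustrative): for int f (int n, ...) with the default
   parameters and TARGET_FLOAT, the local copy of CUM stops after N, so
   gr_saved == 7 and vr_saved == 8.  The code above stores x1-x7 in a
   56-byte block just below the incoming arguments and q0-q7 in a 128-byte
   block below that, giving

     saved_varargs_size = ROUND_UP (7 * 8, 16) + 8 * 16 = 64 + 128 = 192.  */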
20119 
20120 static void
20121 aarch64_conditional_register_usage (void)
20122 {
20123   int i;
20124   if (!TARGET_FLOAT)
20125     {
20126       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20127 	{
20128 	  fixed_regs[i] = 1;
20129 	  call_used_regs[i] = 1;
20130 	}
20131     }
20132   if (!TARGET_SVE)
20133     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20134       {
20135 	fixed_regs[i] = 1;
20136 	call_used_regs[i] = 1;
20137       }
20138 
20139   /* Only allow the FFR and FFRT to be accessed via special patterns.  */
20140   CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20141   CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20142 
20143   /* When tracking speculation, we need a couple of call-clobbered registers
20144      to track the speculation state.  It would be nice to just use
20145      IP0 and IP1, but currently there are numerous places that just
20146      assume these registers are free for other uses (eg pointer
20147      authentication).  */
20148   if (aarch64_track_speculation)
20149     {
20150       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20151       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20152       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20153       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20154     }
20155 }
20156 
20157 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK.  */
20158 
20159 bool
20160 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20161 {
20162   /* For records we're passed a FIELD_DECL, for arrays we're passed
20163      an ARRAY_TYPE.  In both cases we're interested in the TREE_TYPE.  */
20164   const_tree type = TREE_TYPE (field_or_array);
20165 
20166   /* Assign BLKmode to anything that contains multiple SVE predicates.
20167      For structures, the "multiple" case is indicated by MODE being
20168      VOIDmode.  */
20169   unsigned int num_zr, num_pr;
20170   if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20171     {
20172       if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20173 	return !simple_cst_equal (TYPE_SIZE (field_or_array),
20174 				  TYPE_SIZE (type));
20175       return mode == VOIDmode;
20176     }
20177 
20178   return default_member_type_forces_blk (field_or_array, mode);
20179 }
20180 
20181 /* Bitmasks that indicate whether earlier versions of GCC would have
20182    taken a different path through the ABI logic.  This should result in
20183    a -Wpsabi warning if the earlier path led to a different ABI decision.
20184 
20185    WARN_PSABI_EMPTY_CXX17_BASE
20186       Indicates that the type includes an artificial empty C++17 base field
20187       that, prior to GCC 10.1, would prevent the type from being treated as
20188       a HFA or HVA.  See PR94383 for details.
20189 
20190    WARN_PSABI_NO_UNIQUE_ADDRESS
20191       Indicates that the type includes an empty [[no_unique_address]] field
20192       that, prior to GCC 10.1, would prevent the type from being treated as
20193       a HFA or HVA.  */
20194 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20195 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20196 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20197 
20198 /* Walk down the type tree of TYPE counting consecutive base elements.
20199    If *MODEP is VOIDmode, then set it to the first valid floating point
20200    type.  If a non-floating point type is found, or if a floating point
20201    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20202    otherwise return the count in the sub-tree.
20203 
20204    The WARN_PSABI_FLAGS argument allows the caller to check whether this
20205    function has changed its behavior relative to earlier versions of GCC.
20206    Normally the argument should be nonnull and point to a zero-initialized
20207    variable.  The function then records whether the ABI decision might
20208    be affected by a known fix to the ABI logic, setting the associated
20209    WARN_PSABI_* bits if so.
20210 
20211    When the argument is instead a null pointer, the function tries to
20212    simulate the behavior of GCC before all such ABI fixes were made.
20213    This is useful to check whether the function returns something
20214    different after the ABI fixes.  */
20215 static int
20216 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20217 			 unsigned int *warn_psabi_flags)
20218 {
20219   machine_mode mode;
20220   HOST_WIDE_INT size;
20221 
20222   if (aarch64_sve::builtin_type_p (type))
20223     return -1;
20224 
20225   switch (TREE_CODE (type))
20226     {
20227     case REAL_TYPE:
20228       mode = TYPE_MODE (type);
20229       if (mode != DFmode && mode != SFmode
20230 	  && mode != TFmode && mode != HFmode)
20231 	return -1;
20232 
20233       if (*modep == VOIDmode)
20234 	*modep = mode;
20235 
20236       if (*modep == mode)
20237 	return 1;
20238 
20239       break;
20240 
20241     case COMPLEX_TYPE:
20242       mode = TYPE_MODE (TREE_TYPE (type));
20243       if (mode != DFmode && mode != SFmode
20244 	  && mode != TFmode && mode != HFmode)
20245 	return -1;
20246 
20247       if (*modep == VOIDmode)
20248 	*modep = mode;
20249 
20250       if (*modep == mode)
20251 	return 2;
20252 
20253       break;
20254 
20255     case VECTOR_TYPE:
20256       /* Use V2SImode and V4SImode as representatives of all 64-bit
20257 	 and 128-bit vector types.  */
20258       size = int_size_in_bytes (type);
20259       switch (size)
20260 	{
20261 	case 8:
20262 	  mode = V2SImode;
20263 	  break;
20264 	case 16:
20265 	  mode = V4SImode;
20266 	  break;
20267 	default:
20268 	  return -1;
20269 	}
20270 
20271       if (*modep == VOIDmode)
20272 	*modep = mode;
20273 
20274       /* Vector modes are considered to be opaque: two vectors are
20275 	 equivalent for the purposes of being homogeneous aggregates
20276 	 if they are the same size.  */
20277       if (*modep == mode)
20278 	return 1;
20279 
20280       break;
20281 
20282     case ARRAY_TYPE:
20283       {
20284 	int count;
20285 	tree index = TYPE_DOMAIN (type);
20286 
20287 	/* Can't handle incomplete types nor sizes that are not
20288 	   fixed.  */
20289 	if (!COMPLETE_TYPE_P (type)
20290 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20291 	  return -1;
20292 
20293 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20294 					 warn_psabi_flags);
20295 	if (count == -1
20296 	    || !index
20297 	    || !TYPE_MAX_VALUE (index)
20298 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20299 	    || !TYPE_MIN_VALUE (index)
20300 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20301 	    || count < 0)
20302 	  return -1;
20303 
20304 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20305 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20306 
20307 	/* There must be no padding.  */
20308 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20309 		      count * GET_MODE_BITSIZE (*modep)))
20310 	  return -1;
20311 
20312 	return count;
20313       }
20314 
20315     case RECORD_TYPE:
20316       {
20317 	int count = 0;
20318 	int sub_count;
20319 	tree field;
20320 
20321 	/* Can't handle incomplete types nor sizes that are not
20322 	   fixed.  */
20323 	if (!COMPLETE_TYPE_P (type)
20324 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20325 	  return -1;
20326 
20327 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20328 	  {
20329 	    if (TREE_CODE (field) != FIELD_DECL)
20330 	      continue;
20331 
20332 	    if (DECL_FIELD_ABI_IGNORED (field))
20333 	      {
20334 		/* See whether this is something that earlier versions of
20335 		   GCC failed to ignore.  */
20336 		unsigned int flag;
20337 		if (lookup_attribute ("no_unique_address",
20338 				      DECL_ATTRIBUTES (field)))
20339 		  flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20340 		else if (cxx17_empty_base_field_p (field))
20341 		  flag = WARN_PSABI_EMPTY_CXX17_BASE;
20342 		else
20343 		  /* No compatibility problem.  */
20344 		  continue;
20345 
20346 		/* Simulate the old behavior when WARN_PSABI_FLAGS is null.  */
20347 		if (warn_psabi_flags)
20348 		  {
20349 		    *warn_psabi_flags |= flag;
20350 		    continue;
20351 		  }
20352 	      }
20353 	    /* A zero-width bitfield may affect layout in some
20354 	       circumstances, but adds no members.  The determination
20355 	       of whether or not a type is an HFA is performed after
20356 	       layout is complete, so if the type still looks like an
20357 	       HFA afterwards, it is still classed as one.  This is
20358 	       potentially an ABI break for the hard-float ABI.  */
20359 	    else if (DECL_BIT_FIELD (field)
20360 		     && integer_zerop (DECL_SIZE (field)))
20361 	      {
20362 		/* Prior to GCC-12 these fields were stripped early,
20363 		   hiding them from the back-end entirely and
20364 		   resulting in the correct behaviour for argument
20365 		   passing.  Simulate that old behaviour without
20366 		   generating a warning.  */
20367 		if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20368 		  continue;
20369 		if (warn_psabi_flags)
20370 		  {
20371 		    *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20372 		    continue;
20373 		  }
20374 	      }
20375 
20376 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20377 						 warn_psabi_flags);
20378 	    if (sub_count < 0)
20379 	      return -1;
20380 	    count += sub_count;
20381 	  }
20382 
20383 	/* There must be no padding.  */
20384 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20385 		      count * GET_MODE_BITSIZE (*modep)))
20386 	  return -1;
20387 
20388 	return count;
20389       }
20390 
20391     case UNION_TYPE:
20392     case QUAL_UNION_TYPE:
20393       {
20394 	/* These aren't very interesting except in a degenerate case.  */
20395 	int count = 0;
20396 	int sub_count;
20397 	tree field;
20398 
20399 	/* Can't handle incomplete types nor sizes that are not
20400 	   fixed.  */
20401 	if (!COMPLETE_TYPE_P (type)
20402 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20403 	  return -1;
20404 
20405 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20406 	  {
20407 	    if (TREE_CODE (field) != FIELD_DECL)
20408 	      continue;
20409 
20410 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20411 						 warn_psabi_flags);
20412 	    if (sub_count < 0)
20413 	      return -1;
20414 	    count = count > sub_count ? count : sub_count;
20415 	  }
20416 
20417 	/* There must be no padding.  */
20418 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20419 		      count * GET_MODE_BITSIZE (*modep)))
20420 	  return -1;
20421 
20422 	return count;
20423       }
20424 
20425     default:
20426       break;
20427     }
20428 
20429   return -1;
20430 }
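
/* Examples (illustrative): for struct { float x, y, z; } the walk above
   returns 3 with *MODEP set to SFmode; for float[4] it returns 4.  Mixing
   element types, as in struct { float x; double y; }, makes the type
   ineligible (-1), and aggregates with more than HA_MAX_NUM_FLDS elements
   are rejected by the caller.  */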
20431 
20432 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20433    type as described in AAPCS64 \S 4.1.2.
20434 
20435    See the comment above aarch64_composite_type_p for the notes on MODE.  */
20436 
20437 static bool
20438 aarch64_short_vector_p (const_tree type,
20439 			machine_mode mode)
20440 {
20441   poly_int64 size = -1;
20442 
20443   if (type && TREE_CODE (type) == VECTOR_TYPE)
20444     {
20445       if (aarch64_sve::builtin_type_p (type))
20446 	return false;
20447       size = int_size_in_bytes (type);
20448     }
20449   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20450 	   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20451     {
20452       /* The containing "else if" is too loose: it means that we look at TYPE
20453 	 if the type is a vector type (good), but that we otherwise ignore TYPE
20454 	 and look only at the mode.  This is wrong because the type describes
20455 	 the language-level information whereas the mode is purely an internal
20456 	 GCC concept.  We can therefore reach here for types that are not
20457 	 vectors in the AAPCS64 sense.
20458 
20459 	 We can't "fix" that for the traditional Advanced SIMD vector modes
20460 	 without breaking backwards compatibility.  However, there's no such
20461 	 baggage for the structure modes, which were introduced in GCC 12.  */
20462       if (aarch64_advsimd_struct_mode_p (mode))
20463 	return false;
20464 
20465       /* For similar reasons, rely only on the type, not the mode, when
20466 	 processing SVE types.  */
20467       if (type && aarch64_some_values_include_pst_objects_p (type))
20468 	/* Leave later code to report an error if SVE is disabled.  */
20469 	gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20470       else
20471 	size = GET_MODE_SIZE (mode);
20472     }
20473   if (known_eq (size, 8) || known_eq (size, 16))
20474     {
20475       /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20476 	 they are being treated as scalable AAPCS64 types.  */
20477       gcc_assert (!aarch64_sve_mode_p (mode)
20478 		  && !aarch64_advsimd_struct_mode_p (mode));
20479       return true;
20480     }
20481   return false;
20482 }
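
/* For example (illustrative): the Advanced SIMD types int32x2_t (V2SImode,
   8 bytes) and float32x4_t (V4SFmode, 16 bytes) are short vectors in the
   AAPCS64 sense, whereas SVE types such as svint32_t and the GCC 12
   Advanced SIMD structure modes (e.g. V2x16QImode) are not.  */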
20483 
20484 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20485    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
20486    array types.  The C99 floating-point complex types are also considered
20487    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
20488    types, which are GCC extensions and out of the scope of AAPCS64, are
20489    treated as composite types here as well.
20490 
20491    Note that MODE itself is not sufficient in determining whether a type
20492    is such a composite type or not.  This is because
20493    stor-layout.cc:compute_record_mode may have already changed the MODE
20494    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
20495    structure with only one field may have its MODE set to the mode of the
20496    field.  Also an integer mode whose size matches the size of the
20497    RECORD_TYPE type may be used to substitute the original mode
20498    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
20499    solely relied on.  */
20500 
20501 static bool
20502 aarch64_composite_type_p (const_tree type,
20503 			  machine_mode mode)
20504 {
20505   if (aarch64_short_vector_p (type, mode))
20506     return false;
20507 
20508   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20509     return true;
20510 
20511   if (mode == BLKmode
20512       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20513       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20514     return true;
20515 
20516   return false;
20517 }
20518 
20519 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20520    shall be passed or returned in simd/fp register(s) (providing these
20521    parameter passing registers are available).
20522 
20523    Upon successful return, *COUNT returns the number of needed registers,
20524    *BASE_MODE returns the mode of the individual register and when IS_HA
20525    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20526    floating-point aggregate or a homogeneous short-vector aggregate.
20527 
20528    SILENT_P is true if the function should refrain from reporting any
20529    diagnostics.  This should only be used if the caller is certain that
20530    any ABI decisions would eventually come through this function with
20531    SILENT_P set to false.  */
20532 
20533 static bool
20534 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20535 					 const_tree type,
20536 					 machine_mode *base_mode,
20537 					 int *count,
20538 					 bool *is_ha,
20539 					 bool silent_p)
20540 {
20541   if (is_ha != NULL) *is_ha = false;
20542 
20543   machine_mode new_mode = VOIDmode;
20544   bool composite_p = aarch64_composite_type_p (type, mode);
20545 
20546   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
20547       || aarch64_short_vector_p (type, mode))
20548     {
20549       *count = 1;
20550       new_mode = mode;
20551     }
20552   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20553     {
20554       if (is_ha != NULL) *is_ha = true;
20555       *count = 2;
20556       new_mode = GET_MODE_INNER (mode);
20557     }
20558   else if (type && composite_p)
20559     {
20560       unsigned int warn_psabi_flags = 0;
20561       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20562 					      &warn_psabi_flags);
20563       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20564 	{
20565 	  static unsigned last_reported_type_uid;
20566 	  unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20567 	  int alt;
20568 	  if (!silent_p
20569 	      && warn_psabi
20570 	      && warn_psabi_flags
20571 	      && uid != last_reported_type_uid
20572 	      && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20573 		  != ag_count))
20574 	    {
20575 	      const char *url10
20576 		= CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20577 	      const char *url12
20578 		= CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20579 	      gcc_assert (alt == -1);
20580 	      last_reported_type_uid = uid;
20581 	      /* Use TYPE_MAIN_VARIANT to strip any redundant const
20582 		 qualification.  */
20583 	      if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20584 		inform (input_location, "parameter passing for argument of "
20585 			"type %qT with %<[[no_unique_address]]%> members "
20586 			"changed %{in GCC 10.1%}",
20587 			TYPE_MAIN_VARIANT (type), url10);
20588 	      else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20589 		inform (input_location, "parameter passing for argument of "
20590 			"type %qT when C++17 is enabled changed to match "
20591 			"C++14 %{in GCC 10.1%}",
20592 			TYPE_MAIN_VARIANT (type), url10);
20593 	      else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20594 		inform (input_location, "parameter passing for argument of "
20595 			"type %qT changed %{in GCC 12.1%}",
20596 			TYPE_MAIN_VARIANT (type), url12);
20597 	    }
20598 
20599 	  if (is_ha != NULL) *is_ha = true;
20600 	  *count = ag_count;
20601 	}
20602       else
20603 	return false;
20604     }
20605   else
20606     return false;
20607 
20608   gcc_assert (!aarch64_sve_mode_p (new_mode));
20609   *base_mode = new_mode;
20610   return true;
20611 }
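
/* Examples (illustrative):

     double                        -> *count = 1, *base_mode = DFmode
     _Complex float                -> *count = 2, *base_mode = SFmode, *is_ha = true
     struct { float a, b, c, d; }  -> *count = 4, *base_mode = SFmode, *is_ha = true

   A struct of five floats (more than HA_MAX_NUM_FLDS) or one that mixes
   element types is rejected here and handled by the integer/stack rules.  */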
20612 
20613 /* Implement TARGET_STRUCT_VALUE_RTX.  */
20614 
20615 static rtx
20616 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20617 			  int incoming ATTRIBUTE_UNUSED)
20618 {
20619   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20620 }
20621 
20622 /* Implements target hook vector_mode_supported_p.  */
20623 static bool
20624 aarch64_vector_mode_supported_p (machine_mode mode)
20625 {
20626   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20627   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20628 }
20629 
20630 /* Return the full-width SVE vector mode for element mode MODE, if one
20631    exists.  */
20632 opt_machine_mode
20633 aarch64_full_sve_mode (scalar_mode mode)
20634 {
20635   switch (mode)
20636     {
20637     case E_DFmode:
20638       return VNx2DFmode;
20639     case E_SFmode:
20640       return VNx4SFmode;
20641     case E_HFmode:
20642       return VNx8HFmode;
20643     case E_BFmode:
20644       return VNx8BFmode;
20645     case E_DImode:
20646       return VNx2DImode;
20647     case E_SImode:
20648       return VNx4SImode;
20649     case E_HImode:
20650       return VNx8HImode;
20651     case E_QImode:
20652       return VNx16QImode;
20653     default:
20654       return opt_machine_mode ();
20655     }
20656 }
20657 
20658 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20659    if it exists.  */
20660 opt_machine_mode
20661 aarch64_vq_mode (scalar_mode mode)
20662 {
20663   switch (mode)
20664     {
20665     case E_DFmode:
20666       return V2DFmode;
20667     case E_SFmode:
20668       return V4SFmode;
20669     case E_HFmode:
20670       return V8HFmode;
20671     case E_BFmode:
20672       return V8BFmode;
20673     case E_SImode:
20674       return V4SImode;
20675     case E_HImode:
20676       return V8HImode;
20677     case E_QImode:
20678       return V16QImode;
20679     case E_DImode:
20680       return V2DImode;
20681     default:
20682       return opt_machine_mode ();
20683     }
20684 }
20685 
20686 /* Return appropriate SIMD container
20687    for MODE within a vector of WIDTH bits.  */
20688 static machine_mode
20689 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20690 {
20691   if (TARGET_SVE
20692       && maybe_ne (width, 128)
20693       && known_eq (width, BITS_PER_SVE_VECTOR))
20694     return aarch64_full_sve_mode (mode).else_mode (word_mode);
20695 
20696   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20697   if (TARGET_SIMD)
20698     {
20699       if (known_eq (width, 128))
20700 	return aarch64_vq_mode (mode).else_mode (word_mode);
20701       else
20702 	switch (mode)
20703 	  {
20704 	  case E_SFmode:
20705 	    return V2SFmode;
20706 	  case E_HFmode:
20707 	    return V4HFmode;
20708 	  case E_BFmode:
20709 	    return V4BFmode;
20710 	  case E_SImode:
20711 	    return V2SImode;
20712 	  case E_HImode:
20713 	    return V4HImode;
20714 	  case E_QImode:
20715 	    return V8QImode;
20716 	  default:
20717 	    break;
20718 	  }
20719     }
20720   return word_mode;
20721 }
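
/* For example (illustrative): with WIDTH == 128, SImode maps to V4SImode
   and HFmode to V8HFmode; with WIDTH == 64, SImode maps to V2SImode.  When
   SVE is enabled and WIDTH equals BITS_PER_SVE_VECTOR (and is not known to
   be 128), SImode maps to VNx4SImode instead.  Unsupported combinations
   fall back to word_mode.  */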
20722 
20723 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20724    and return whether the SVE mode should be preferred over the
20725    Advanced SIMD one in aarch64_autovectorize_vector_modes.  */
20726 static bool
20727 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20728 {
20729   /* Take into account the aarch64-autovec-preference param if non-zero.  */
20730   bool only_asimd_p = aarch64_autovec_preference == 1;
20731   bool only_sve_p = aarch64_autovec_preference == 2;
20732 
20733   if (only_asimd_p)
20734     return false;
20735   if (only_sve_p)
20736     return true;
20737 
20738   /* The preference in case of a tie in costs.  */
20739   bool prefer_asimd = aarch64_autovec_preference == 3;
20740   bool prefer_sve = aarch64_autovec_preference == 4;
20741 
20742   poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20743   poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20744   /* If the CPU information does not have an SVE width registered use the
20745      generic poly_int comparison that prefers SVE.  If a preference is
20746      explicitly requested avoid this path.  */
20747   if (aarch64_tune_params.sve_width == SVE_SCALABLE
20748       && !prefer_asimd
20749       && !prefer_sve)
20750     return maybe_gt (nunits_sve, nunits_asimd);
20751 
20752   /* Otherwise estimate the runtime width of the modes involved.  */
20753   HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20754   HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20755 
20756   /* Preferring SVE means picking it first unless the Advanced SIMD mode
20757      is clearly wider.  */
20758   if (prefer_sve)
20759     return est_sve >= est_asimd;
20760   /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20761      is clearly wider.  */
20762   if (prefer_asimd)
20763     return est_sve > est_asimd;
20764 
20765   /* In the default case prefer Advanced SIMD over SVE in case of a tie.  */
20766   return est_sve > est_asimd;
20767 }
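
/* Worked example (illustrative): when tuning for a 256-bit SVE
   implementation with the default preference, comparing VNx4SImode against
   V4SImode gives estimated element counts of 8 and 4 respectively, so
   est_sve > est_asimd and the SVE mode is preferred.  When tuning for
   128-bit SVE the estimates tie at 4, so the Advanced SIMD mode wins.  */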
20768 
20769 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
20770 static machine_mode
20771 aarch64_preferred_simd_mode (scalar_mode mode)
20772 {
20773   /* Take into account explicit auto-vectorization ISA preferences through
20774      aarch64_cmp_autovec_modes.  */
20775   if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20776     return aarch64_full_sve_mode (mode).else_mode (word_mode);
20777   if (TARGET_SIMD)
20778     return aarch64_vq_mode (mode).else_mode (word_mode);
20779   return word_mode;
20780 }
20781 
20782 /* Return a list of possible vector sizes for the vectorizer
20783    to iterate over.  */
20784 static unsigned int
20785 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20786 {
20787   static const machine_mode sve_modes[] = {
20788     /* Try using full vectors for all element types.  */
20789     VNx16QImode,
20790 
20791     /* Try using 16-bit containers for 8-bit elements and full vectors
20792        for wider elements.  */
20793     VNx8QImode,
20794 
20795     /* Try using 32-bit containers for 8-bit and 16-bit elements and
20796        full vectors for wider elements.  */
20797     VNx4QImode,
20798 
20799     /* Try using 64-bit containers for all element types.  */
20800     VNx2QImode
20801   };
20802 
20803   static const machine_mode advsimd_modes[] = {
20804     /* Try using 128-bit vectors for all element types.  */
20805     V16QImode,
20806 
20807     /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20808        for wider elements.  */
20809     V8QImode,
20810 
20811     /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20812        for wider elements.
20813 
20814        TODO: We could support a limited form of V4QImode too, so that
20815        we use 32-bit vectors for 8-bit elements.  */
20816     V4HImode,
20817 
20818     /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20819        for 64-bit elements.
20820 
20821        TODO: We could similarly support limited forms of V2QImode and V2HImode
20822        for this case.  */
20823     V2SImode
20824   };
20825 
20826   /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20827      This is because:
20828 
20829      - If we can't use N-byte Advanced SIMD vectors then the placement
20830        doesn't matter; we'll just continue as though the Advanced SIMD
20831        entry didn't exist.
20832 
20833      - If an SVE main loop with N bytes ends up being cheaper than an
20834        Advanced SIMD main loop with N bytes then by default we'll replace
20835        the Advanced SIMD version with the SVE one.
20836 
20837      - If an Advanced SIMD main loop with N bytes ends up being cheaper
20838        than an SVE main loop with N bytes then by default we'll try to
20839        use the SVE loop to vectorize the epilogue instead.  */
20840 
20841   bool only_asimd_p = aarch64_autovec_preference == 1;
20842   bool only_sve_p = aarch64_autovec_preference == 2;
20843 
20844   unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20845   unsigned int advsimd_i = 0;
20846 
20847   while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20848     {
20849       if (sve_i < ARRAY_SIZE (sve_modes)
20850 	  && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20851 					advsimd_modes[advsimd_i]))
20852 	modes->safe_push (sve_modes[sve_i++]);
20853       else
20854 	modes->safe_push (advsimd_modes[advsimd_i++]);
20855     }
20856   while (sve_i < ARRAY_SIZE (sve_modes))
20857     modes->safe_push (sve_modes[sve_i++]);
20858 
20859   unsigned int flags = 0;
20860   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20861      can compare SVE against Advanced SIMD and so that we can compare
20862      multiple SVE vectorization approaches against each other.  There's
20863      not really any point doing this for Advanced SIMD only, since the
20864      first mode that works should always be the best.  */
20865   if (TARGET_SVE && aarch64_sve_compare_costs)
20866     flags |= VECT_COMPARE_COSTS;
20867   return flags;
20868 }
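
/* Illustrative outcome (a sketch, assuming a target tuned for 128-bit SVE
   vectors and the default preference): the merge loop above produces
   roughly

     V16QImode, VNx16QImode, V8QImode, VNx8QImode,
     V4HImode,  VNx4QImode,  V2SImode, VNx2QImode

   i.e. each Advanced SIMD entry is tried before the SVE entry with the
   same estimated element count, as described in the big comment above.  */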
20869 
20870 /* Implement TARGET_MANGLE_TYPE.  */
20871 
20872 static const char *
20873 aarch64_mangle_type (const_tree type)
20874 {
20875   /* The AArch64 ABI documents say that "__va_list" has to be
20876      mangled as if it is in the "std" namespace.  */
20877   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20878     return "St9__va_list";
20879 
20880   /* Half-precision floating point types.  */
20881   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20882     {
20883       if (TYPE_MODE (type) == BFmode)
20884 	return "u6__bf16";
20885       else
20886 	return "Dh";
20887     }
20888 
20889   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
20890      builtin types.  */
20891   if (TYPE_NAME (type) != NULL)
20892     {
20893       const char *res;
20894       if ((res = aarch64_general_mangle_builtin_type (type))
20895 	  || (res = aarch64_sve::mangle_builtin_type (type)))
20896 	return res;
20897     }
20898 
20899   /* Use the default mangling.  */
20900   return NULL;
20901 }
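
/* For example (illustrative): a C++ function void f (__fp16, __bf16)
   mangles as _Z1fDhu6__bf16, and void g (va_list) mangles as
   _Z1gSt9__va_list, which demanglers print as g(std::__va_list).  */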
20902 
20903 /* Implement TARGET_VERIFY_TYPE_CONTEXT.  */
20904 
20905 static bool
20906 aarch64_verify_type_context (location_t loc, type_context_kind context,
20907 			     const_tree type, bool silent_p)
20908 {
20909   return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20910 }
20911 
20912 /* Find the first rtx_insn before insn that will generate an assembly
20913    instruction.  */
20914 
20915 static rtx_insn *
20916 aarch64_prev_real_insn (rtx_insn *insn)
20917 {
20918   if (!insn)
20919     return NULL;
20920 
20921   do
20922     {
20923       insn = prev_real_insn (insn);
20924     }
20925   while (insn && recog_memoized (insn) < 0);
20926 
20927   return insn;
20928 }
20929 
20930 static bool
20931 is_madd_op (enum attr_type t1)
20932 {
20933   unsigned int i;
20934   /* A number of these may be AArch32 only.  */
20935   enum attr_type mlatypes[] = {
20936     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20937     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20938     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20939   };
20940 
20941   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
20942     {
20943       if (t1 == mlatypes[i])
20944 	return true;
20945     }
20946 
20947   return false;
20948 }
20949 
20950 /* Check if there is a register dependency between a load and the insn
20951    for which we hold recog_data.  */
20952 
20953 static bool
20954 dep_between_memop_and_curr (rtx memop)
20955 {
20956   rtx load_reg;
20957   int opno;
20958 
20959   gcc_assert (GET_CODE (memop) == SET);
20960 
20961   if (!REG_P (SET_DEST (memop)))
20962     return false;
20963 
20964   load_reg = SET_DEST (memop);
20965   for (opno = 1; opno < recog_data.n_operands; opno++)
20966     {
20967       rtx operand = recog_data.operand[opno];
20968       if (REG_P (operand)
20969           && reg_overlap_mentioned_p (load_reg, operand))
20970         return true;
20971 
20972     }
20973   return false;
20974 }
20975 
20976 
20977 /* When working around the Cortex-A53 erratum 835769,
20978    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20979    instruction and has a preceding memory instruction such that a NOP
20980    should be inserted between them.  */
20981 
20982 bool
20983 aarch64_madd_needs_nop (rtx_insn* insn)
20984 {
20985   enum attr_type attr_type;
20986   rtx_insn *prev;
20987   rtx body;
20988 
20989   if (!TARGET_FIX_ERR_A53_835769)
20990     return false;
20991 
20992   if (!INSN_P (insn) || recog_memoized (insn) < 0)
20993     return false;
20994 
20995   attr_type = get_attr_type (insn);
20996   if (!is_madd_op (attr_type))
20997     return false;
20998 
20999   prev = aarch64_prev_real_insn (insn);
21000   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
21001      Restore recog state to INSN to avoid state corruption.  */
21002   extract_constrain_insn_cached (insn);
21003 
21004   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
21005     return false;
21006 
21007   body = single_set (prev);
21008 
21009   /* If the previous insn is a memory op and there is no dependency between
21010      it and the DImode madd, emit a NOP between them.  If body is NULL then we
21011      have a complex memory operation, probably a load/store pair.
21012      Be conservative for now and emit a NOP.  */
21013   if (GET_MODE (recog_data.operand[0]) == DImode
21014       && (!body || !dep_between_memop_and_curr (body)))
21015     return true;
21016 
21017   return false;
21018 
21019 }
21020 
21021 
21022 /* Implement FINAL_PRESCAN_INSN.  */
21023 
21024 void
21025 aarch64_final_prescan_insn (rtx_insn *insn)
21026 {
21027   if (aarch64_madd_needs_nop (insn))
21028     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
21029 }
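
/* Illustrative effect of the erratum 835769 workaround: for a 64-bit
   multiply-accumulate that directly follows a memory operation, the
   generated assembly becomes something like

     ldr  x1, [x2]
     nop  // between mem op and mult-accumulate
     madd x0, x3, x4, x5

   so that the two instructions never issue back-to-back.  */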
21030 
21031 
21032 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
21033    instruction.  */
21034 
21035 bool
21036 aarch64_sve_index_immediate_p (rtx base_or_step)
21037 {
21038   return (CONST_INT_P (base_or_step)
21039 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
21040 }
21041 
21042 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
21043    when applied to mode MODE.  Negate X first if NEGATE_P is true.  */
21044 
21045 bool
21046 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
21047 {
21048   rtx elt = unwrap_const_vec_duplicate (x);
21049   if (!CONST_INT_P (elt))
21050     return false;
21051 
21052   HOST_WIDE_INT val = INTVAL (elt);
21053   if (negate_p)
21054     val = -val;
21055   val &= GET_MODE_MASK (GET_MODE_INNER (mode));
21056 
21057   if (val & 0xff)
21058     return IN_RANGE (val, 0, 0xff);
21059   return IN_RANGE (val, 0, 0xff00);
21060 }
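
/* Examples (illustrative): 3 and 255 are accepted directly, 0x300 (768)
   is accepted as an 8-bit value shifted left by 8 (ADD ..., #3, LSL #8),
   but 0x101 (257) is rejected because its low byte is nonzero and the
   value does not fit in 8 bits.  */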
21061 
21062 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21063    instructions when applied to mode MODE.  Negate X first if NEGATE_P
21064    is true.  */
21065 
21066 bool
21067 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
21068 {
21069   if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
21070     return false;
21071 
21072   /* After the optional negation, the immediate must be nonnegative.
21073      E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21074      instead of SQADD Zn.B, Zn.B, #129.  */
21075   rtx elt = unwrap_const_vec_duplicate (x);
21076   return negate_p == (INTVAL (elt) < 0);
21077 }
21078 
21079 /* Return true if X is a valid immediate operand for an SVE logical
21080    instruction such as AND.  */
21081 
21082 bool
21083 aarch64_sve_bitmask_immediate_p (rtx x)
21084 {
21085   rtx elt;
21086 
21087   return (const_vec_duplicate_p (x, &elt)
21088 	  && CONST_INT_P (elt)
21089 	  && aarch64_bitmask_imm (INTVAL (elt),
21090 				  GET_MODE_INNER (GET_MODE (x))));
21091 }
21092 
21093 /* Return true if X is a valid immediate for the SVE DUP and CPY
21094    instructions.  */
21095 
21096 bool
21097 aarch64_sve_dup_immediate_p (rtx x)
21098 {
21099   x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21100   if (!CONST_INT_P (x))
21101     return false;
21102 
21103   HOST_WIDE_INT val = INTVAL (x);
21104   if (val & 0xff)
21105     return IN_RANGE (val, -0x80, 0x7f);
21106   return IN_RANGE (val, -0x8000, 0x7f00);
21107 }
21108 
21109 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21110    SIGNED_P says whether the operand is signed rather than unsigned.  */
21111 
21112 bool
21113 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21114 {
21115   x = unwrap_const_vec_duplicate (x);
21116   return (CONST_INT_P (x)
21117 	  && (signed_p
21118 	      ? IN_RANGE (INTVAL (x), -16, 15)
21119 	      : IN_RANGE (INTVAL (x), 0, 127)));
21120 }
21121 
21122 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21123    instruction.  Negate X first if NEGATE_P is true.  */
21124 
21125 bool
21126 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21127 {
21128   rtx elt;
21129   REAL_VALUE_TYPE r;
21130 
21131   if (!const_vec_duplicate_p (x, &elt)
21132       || !CONST_DOUBLE_P (elt))
21133     return false;
21134 
21135   r = *CONST_DOUBLE_REAL_VALUE (elt);
21136 
21137   if (negate_p)
21138     r = real_value_negate (&r);
21139 
21140   if (real_equal (&r, &dconst1))
21141     return true;
21142   if (real_equal (&r, &dconsthalf))
21143     return true;
21144   return false;
21145 }
21146 
21147 /* Return true if X is a valid immediate operand for an SVE FMUL
21148    instruction.  */
21149 
21150 bool
21151 aarch64_sve_float_mul_immediate_p (rtx x)
21152 {
21153   rtx elt;
21154 
21155   return (const_vec_duplicate_p (x, &elt)
21156 	  && CONST_DOUBLE_P (elt)
21157 	  && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21158 	      || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21159 }
21160 
21161 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21162    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
21163    is nonnull, use it to describe valid immediates.  */
21164 static bool
21165 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21166 				    simd_immediate_info *info,
21167 				    enum simd_immediate_check which,
21168 				    simd_immediate_info::insn_type insn)
21169 {
21170   /* Try a 4-byte immediate with LSL.  */
21171   for (unsigned int shift = 0; shift < 32; shift += 8)
21172     if ((val32 & (0xff << shift)) == val32)
21173       {
21174 	if (info)
21175 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
21176 				       simd_immediate_info::LSL, shift);
21177 	return true;
21178       }
21179 
21180   /* Try a 2-byte immediate with LSL.  */
21181   unsigned int imm16 = val32 & 0xffff;
21182   if (imm16 == (val32 >> 16))
21183     for (unsigned int shift = 0; shift < 16; shift += 8)
21184       if ((imm16 & (0xff << shift)) == imm16)
21185 	{
21186 	  if (info)
21187 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21188 					 simd_immediate_info::LSL, shift);
21189 	  return true;
21190 	}
21191 
21192   /* Try a 4-byte immediate with MSL, except for cases that MVN
21193      can handle.  */
21194   if (which == AARCH64_CHECK_MOV)
21195     for (unsigned int shift = 8; shift < 24; shift += 8)
21196       {
21197 	unsigned int low = (1 << shift) - 1;
21198 	if (((val32 & (0xff << shift)) | low) == val32)
21199 	  {
21200 	    if (info)
21201 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
21202 					   simd_immediate_info::MSL, shift);
21203 	    return true;
21204 	  }
21205       }
21206 
21207   return false;
21208 }
21209 
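/* Reader's note (illustrative only, not part of the GCC sources): a few
   concrete 32-bit patterns and the cases the loops above catch, assuming
   WHICH allows them:

     0x000000ab -> 4-byte immediate, LSL #0
     0x00ab0000 -> 4-byte immediate, LSL #16
     0x00ab00ab -> 2-byte immediate, LSL #0 (16-bit pattern repeated)
     0x0000abff -> 4-byte immediate, MSL #8 (low bits are all ones)

   The sketch below (made-up name) is the probe used for the first of
   those forms.  */
#if 0
static int
example_advsimd_lsl_shift (unsigned int val32)
{
  for (int shift = 0; shift < 32; shift += 8)
    if ((val32 & (0xffu << shift)) == val32)
      return shift;           /* e.g. 0x00ab0000 -> 16 */
  return -1;                  /* e.g. 0x00ab00cd -> no single-byte form */
}
#endif
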
21210 /* Return true if replicating VAL64 is a valid immediate for the
21211    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
21212    use it to describe valid immediates.  */
21213 static bool
21214 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21215 				 simd_immediate_info *info,
21216 				 enum simd_immediate_check which)
21217 {
21218   unsigned int val32 = val64 & 0xffffffff;
21219   unsigned int val16 = val64 & 0xffff;
21220   unsigned int val8 = val64 & 0xff;
21221 
21222   if (val32 == (val64 >> 32))
21223     {
21224       if ((which & AARCH64_CHECK_ORR) != 0
21225 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21226 						 simd_immediate_info::MOV))
21227 	return true;
21228 
21229       if ((which & AARCH64_CHECK_BIC) != 0
21230 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21231 						 simd_immediate_info::MVN))
21232 	return true;
21233 
21234       /* Try using a replicated byte.  */
21235       if (which == AARCH64_CHECK_MOV
21236 	  && val16 == (val32 >> 16)
21237 	  && val8 == (val16 >> 8))
21238 	{
21239 	  if (info)
21240 	    *info = simd_immediate_info (QImode, val8);
21241 	  return true;
21242 	}
21243     }
21244 
21245   /* Try using a bit-to-bytemask.  */
21246   if (which == AARCH64_CHECK_MOV)
21247     {
21248       unsigned int i;
21249       for (i = 0; i < 64; i += 8)
21250 	{
21251 	  unsigned char byte = (val64 >> i) & 0xff;
21252 	  if (byte != 0 && byte != 0xff)
21253 	    break;
21254 	}
21255       if (i == 64)
21256 	{
21257 	  if (info)
21258 	    *info = simd_immediate_info (DImode, val64);
21259 	  return true;
21260 	}
21261     }
21262   return false;
21263 }
21264 
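/* Reader's note (illustrative only, not part of the GCC sources): the final
   case above accepts the 64-bit "bytemask" immediates in which every byte is
   either 0x00 or 0xff, e.g. 0x00ff00ff0000ffff but not 0x00ff00ff00010000.
   A stand-alone sketch of that test, with a made-up name:  */
#if 0
static int
example_bytemask_p (unsigned long long val64)
{
  for (int i = 0; i < 64; i += 8)
    {
      unsigned int byte = (val64 >> i) & 0xff;
      if (byte != 0 && byte != 0xff)
	return 0;
    }
  return 1;
}
#endif
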
21265 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21266    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
21267 
21268 static bool
21269 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21270 			     simd_immediate_info *info)
21271 {
21272   scalar_int_mode mode = DImode;
21273   unsigned int val32 = val64 & 0xffffffff;
21274   if (val32 == (val64 >> 32))
21275     {
21276       mode = SImode;
21277       unsigned int val16 = val32 & 0xffff;
21278       if (val16 == (val32 >> 16))
21279 	{
21280 	  mode = HImode;
21281 	  unsigned int val8 = val16 & 0xff;
21282 	  if (val8 == (val16 >> 8))
21283 	    mode = QImode;
21284 	}
21285     }
21286   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21287   if (IN_RANGE (val, -0x80, 0x7f))
21288     {
21289       /* DUP with no shift.  */
21290       if (info)
21291 	*info = simd_immediate_info (mode, val);
21292       return true;
21293     }
21294   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21295     {
21296       /* DUP with LSL #8.  */
21297       if (info)
21298 	*info = simd_immediate_info (mode, val);
21299       return true;
21300     }
21301   if (aarch64_bitmask_imm (val64, mode))
21302     {
21303       /* DUPM.  */
21304       if (info)
21305 	*info = simd_immediate_info (mode, val);
21306       return true;
21307     }
21308   return false;
21309 }
21310 
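/* Reader's note (illustrative only, not part of the GCC sources): worked
   examples of the three cases above.  The replicated value is first narrowed
   to the smallest repeating element size, then:

     0x0505050505050505 -> QImode, val 5      -> DUP (no shift)
     0x1200120012001200 -> HImode, val 0x1200 -> DUP with LSL #8
     0x00ff00ff00ff00ff -> HImode, val 0x00ff -> DUPM (bitmask immediate)  */
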
21311 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21312 
21313        (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21314 
21315    where PATTERN is the svpattern as a CONST_INT and where ZERO
21316    is a zero constant of the required PTRUE mode (which can have
21317    fewer elements than X's mode, if zero bits are significant).
21318 
21319    If so, and if INFO is nonnull, describe the immediate in INFO.  */
21320 bool
21321 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21322 {
21323   if (GET_CODE (x) != CONST)
21324     return false;
21325 
21326   x = XEXP (x, 0);
21327   if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21328     return false;
21329 
21330   if (info)
21331     {
21332       aarch64_svpattern pattern
21333 	= (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21334       machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21335       scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21336       *info = simd_immediate_info (int_mode, pattern);
21337     }
21338   return true;
21339 }
21340 
21341 /* Return true if X is a valid SVE predicate.  If INFO is nonnull, use
21342    it to describe valid immediates.  */
21343 
21344 static bool
21345 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21346 {
21347   if (aarch64_sve_ptrue_svpattern_p (x, info))
21348     return true;
21349 
21350   if (x == CONST0_RTX (GET_MODE (x)))
21351     {
21352       if (info)
21353 	*info = simd_immediate_info (DImode, 0);
21354       return true;
21355     }
21356 
21357   /* Analyze the value as a VNx16BImode.  This should be relatively
21358      efficient, since rtx_vector_builder has enough built-in capacity
21359      to store all VLA predicate constants without needing the heap.  */
21360   rtx_vector_builder builder;
21361   if (!aarch64_get_sve_pred_bits (builder, x))
21362     return false;
21363 
21364   unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21365   if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21366     {
21367       machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21368       aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21369       if (pattern != AARCH64_NUM_SVPATTERNS)
21370 	{
21371 	  if (info)
21372 	    {
21373 	      scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21374 	      *info = simd_immediate_info (int_mode, pattern);
21375 	    }
21376 	  return true;
21377 	}
21378     }
21379   return false;
21380 }
21381 
21382 /* Return true if OP is a valid SIMD immediate for the operation
21383    described by WHICH.  If INFO is nonnull, use it to describe valid
21384    immediates.  */
21385 bool
21386 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21387 			      enum simd_immediate_check which)
21388 {
21389   machine_mode mode = GET_MODE (op);
21390   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21391   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21392     return false;
21393 
21394   if (vec_flags & VEC_SVE_PRED)
21395     return aarch64_sve_pred_valid_immediate (op, info);
21396 
21397   scalar_mode elt_mode = GET_MODE_INNER (mode);
21398   rtx base, step;
21399   unsigned int n_elts;
21400   if (CONST_VECTOR_P (op)
21401       && CONST_VECTOR_DUPLICATE_P (op))
21402     n_elts = CONST_VECTOR_NPATTERNS (op);
21403   else if ((vec_flags & VEC_SVE_DATA)
21404 	   && const_vec_series_p (op, &base, &step))
21405     {
21406       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21407       if (!aarch64_sve_index_immediate_p (base)
21408 	  || !aarch64_sve_index_immediate_p (step))
21409 	return false;
21410 
21411       if (info)
21412 	{
21413 	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
21414 	     should yield two integer values per 128-bit block, meaning
21415 	     that we need to treat it in the same way as V2DI and then
21416 	     ignore the upper 32 bits of each element.  */
21417 	  elt_mode = aarch64_sve_container_int_mode (mode);
21418 	  *info = simd_immediate_info (elt_mode, base, step);
21419 	}
21420       return true;
21421     }
21422   else if (CONST_VECTOR_P (op)
21423 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21424     /* N_ELTS set above.  */;
21425   else
21426     return false;
21427 
21428   scalar_float_mode elt_float_mode;
21429   if (n_elts == 1
21430       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21431     {
21432       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21433       if (aarch64_float_const_zero_rtx_p (elt)
21434 	  || aarch64_float_const_representable_p (elt))
21435 	{
21436 	  if (info)
21437 	    *info = simd_immediate_info (elt_float_mode, elt);
21438 	  return true;
21439 	}
21440     }
21441 
21442   /* If all elements in an SVE vector have the same value, we have a free
21443      choice between using the element mode and using the container mode.
21444      Using the element mode means that unused parts of the vector are
21445      duplicates of the used elements, while using the container mode means
21446      that the unused parts are an extension of the used elements.  Using the
21447      element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21448      for its container mode VNx4SI while 0x00000101 isn't.
21449 
21450      If not all elements in an SVE vector have the same value, we need the
21451      transition from one element to the next to occur at container boundaries.
21452      E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21453      in the same way as a VNx4SI containing { 1, 2, 3, 4 }.  */
21454   scalar_int_mode elt_int_mode;
21455   if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21456     elt_int_mode = aarch64_sve_container_int_mode (mode);
21457   else
21458     elt_int_mode = int_mode_for_mode (elt_mode).require ();
21459 
21460   unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21461   if (elt_size > 8)
21462     return false;
21463 
21464   /* Expand the vector constant out into a byte vector, with the least
21465      significant byte of the register first.  */
21466   auto_vec<unsigned char, 16> bytes;
21467   bytes.reserve (n_elts * elt_size);
21468   for (unsigned int i = 0; i < n_elts; i++)
21469     {
21470       /* The vector is provided in gcc endian-neutral fashion.
21471 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
21472 	 register in reverse order.  */
21473       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21474       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21475 
21476       if (elt_mode != elt_int_mode)
21477 	elt = gen_lowpart (elt_int_mode, elt);
21478 
21479       if (!CONST_INT_P (elt))
21480 	return false;
21481 
21482       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21483       for (unsigned int byte = 0; byte < elt_size; byte++)
21484 	{
21485 	  bytes.quick_push (elt_val & 0xff);
21486 	  elt_val >>= BITS_PER_UNIT;
21487 	}
21488     }
21489 
21490   /* The immediate must repeat every eight bytes.  */
21491   unsigned int nbytes = bytes.length ();
21492   for (unsigned i = 8; i < nbytes; ++i)
21493     if (bytes[i] != bytes[i - 8])
21494       return false;
21495 
21496   /* Get the repeating 8-byte value as an integer.  No endian correction
21497      is needed here because bytes is already in lsb-first order.  */
21498   unsigned HOST_WIDE_INT val64 = 0;
21499   for (unsigned int i = 0; i < 8; i++)
21500     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21501 	      << (i * BITS_PER_UNIT));
21502 
21503   if (vec_flags & VEC_SVE_DATA)
21504     return aarch64_sve_valid_immediate (val64, info);
21505   else
21506     return aarch64_advsimd_valid_immediate (val64, info, which);
21507 }
21508 
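/* Reader's note (illustrative only, not part of the GCC sources): the
   byte-expansion step above can be pictured on plain arrays.  The sketch
   below (made-up name; assumes the total byte count is at most 16 and that
   the element values are already available as integers) folds a constant
   into the repeating 64-bit value that the SVE/Advanced SIMD checks then
   classify.  */
#if 0
static int
example_repeating_val64 (const unsigned long long *elts, int n_elts,
			 int elt_size, unsigned long long *out)
{
  unsigned char bytes[16];
  int nbytes = n_elts * elt_size;

  /* Least significant byte of the register first.  */
  for (int i = 0; i < n_elts; i++)
    for (int b = 0; b < elt_size; b++)
      bytes[i * elt_size + b] = (elts[i] >> (8 * b)) & 0xff;

  /* The immediate must repeat every eight bytes.  */
  for (int i = 8; i < nbytes; i++)
    if (bytes[i] != bytes[i - 8])
      return 0;

  unsigned long long val64 = 0;
  for (int i = 0; i < 8; i++)
    val64 |= (unsigned long long) bytes[i % nbytes] << (8 * i);
  *out = val64;
  return 1;
}
#endif
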
21509 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21510    has a step in the range of INDEX.  Return the index expression if so,
21511    otherwise return null.  */
21512 rtx
21513 aarch64_check_zero_based_sve_index_immediate (rtx x)
21514 {
21515   rtx base, step;
21516   if (const_vec_series_p (x, &base, &step)
21517       && base == const0_rtx
21518       && aarch64_sve_index_immediate_p (step))
21519     return step;
21520   return NULL_RTX;
21521 }
21522 
21523 /* Check whether immediate shift constants are within range.  */
21524 bool
21525 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21526 {
21527   x = unwrap_const_vec_duplicate (x);
21528   if (!CONST_INT_P (x))
21529     return false;
21530   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21531   if (left)
21532     return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21533   else
21534     return IN_RANGE (INTVAL (x), 1, bit_width);
21535 }
21536 
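/* Reader's note (illustrative only, not part of the GCC sources): for
   example, with 32-bit elements a left-shift immediate must be in 0..31,
   while a right-shift immediate must be in 1..32, since a right shift by
   zero is not encodable.  */
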
21537 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21538    operation of width WIDTH at bit position POS.  */
21539 
21540 rtx
21541 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21542 {
21543   gcc_assert (CONST_INT_P (width));
21544   gcc_assert (CONST_INT_P (pos));
21545 
21546   unsigned HOST_WIDE_INT mask
21547     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21548   return GEN_INT (mask << UINTVAL (pos));
21549 }
21550 
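/* Reader's note (illustrative only, not part of the GCC sources): for
   WIDTH = 8 and POS = 4 the function above yields
   ((1 << 8) - 1) << 4 = 0xff0, i.e. the mask selecting bits 4..11.  */
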
21551 bool
21552 aarch64_mov_operand_p (rtx x, machine_mode mode)
21553 {
21554   if (GET_CODE (x) == HIGH
21555       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21556     return true;
21557 
21558   if (CONST_INT_P (x))
21559     return true;
21560 
21561   if (VECTOR_MODE_P (GET_MODE (x)))
21562     {
21563       /* Require predicate constants to be VNx16BI before RA, so that we
21564 	 force everything to have a canonical form.  */
21565       if (!lra_in_progress
21566 	  && !reload_completed
21567 	  && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21568 	  && GET_MODE (x) != VNx16BImode)
21569 	return false;
21570 
21571       return aarch64_simd_valid_immediate (x, NULL);
21572     }
21573 
21574   /* Remove UNSPEC_SALT_ADDR before checking symbol reference.  */
21575   x = strip_salt (x);
21576 
21577   /* GOT accesses are valid moves.  */
21578   if (SYMBOL_REF_P (x)
21579       && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21580     return true;
21581 
21582   if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21583     return true;
21584 
21585   if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21586     return true;
21587 
21588   return aarch64_classify_symbolic_expression (x)
21589     == SYMBOL_TINY_ABSOLUTE;
21590 }
21591 
21592 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21593    the constant creation.  */
21594 
21595 rtx
21596 aarch64_gen_shareable_zero (machine_mode mode)
21597 {
21598   machine_mode zmode = V4SImode;
21599   rtx tmp = gen_reg_rtx (zmode);
21600   emit_move_insn (tmp, CONST0_RTX (zmode));
21601   return lowpart_subreg (mode, tmp, zmode);
21602 }
21603 
21604 /* Return a const_int vector of VAL.  */
21605 rtx
21606 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21607 {
21608   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21609   return gen_const_vec_duplicate (mode, c);
21610 }
21611 
21612 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
21613 
21614 bool
21615 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21616 {
21617   machine_mode vmode;
21618 
21619   vmode = aarch64_simd_container_mode (mode, 64);
21620   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21621   return aarch64_simd_valid_immediate (op_v, NULL);
21622 }
21623 
21624 /* Construct and return a PARALLEL RTX vector with elements numbering the
21625    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21626    the vector - from the perspective of the architecture.  This does not
21627    line up with GCC's perspective on lane numbers, so we end up with
21628    different masks depending on our target endian-ness.  The diagram
21629    below may help.  We must draw the distinction when building masks
21630    which select one half of the vector.  An instruction selecting
21631    architectural low-lanes for a big-endian target must be described using
21632    a mask selecting GCC high-lanes.
21633 
21634                  Big-Endian             Little-Endian
21635 
21636 GCC             0   1   2   3           3   2   1   0
21637               | x | x | x | x |       | x | x | x | x |
21638 Architecture    3   2   1   0           3   2   1   0
21639 
21640 Low Mask:         { 2, 3 }                { 0, 1 }
21641 High Mask:        { 0, 1 }                { 2, 3 }
21642 
21643    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
21644 
21645 rtx
21646 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21647 {
21648   rtvec v = rtvec_alloc (nunits / 2);
21649   int high_base = nunits / 2;
21650   int low_base = 0;
21651   int base;
21652   rtx t1;
21653   int i;
21654 
21655   if (BYTES_BIG_ENDIAN)
21656     base = high ? low_base : high_base;
21657   else
21658     base = high ? high_base : low_base;
21659 
21660   for (i = 0; i < nunits / 2; i++)
21661     RTVEC_ELT (v, i) = GEN_INT (base + i);
21662 
21663   t1 = gen_rtx_PARALLEL (mode, v);
21664   return t1;
21665 }
21666 
21667 /* Check OP for validity as a PARALLEL RTX vector with elements
21668    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21669    from the perspective of the architecture.  See the diagram above
21670    aarch64_simd_vect_par_cnst_half for more details.  */
21671 
21672 bool
21673 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21674 				       bool high)
21675 {
21676   int nelts;
21677   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21678     return false;
21679 
21680   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21681   HOST_WIDE_INT count_op = XVECLEN (op, 0);
21682   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21683   int i = 0;
21684 
21685   if (count_op != count_ideal)
21686     return false;
21687 
21688   for (i = 0; i < count_ideal; i++)
21689     {
21690       rtx elt_op = XVECEXP (op, 0, i);
21691       rtx elt_ideal = XVECEXP (ideal, 0, i);
21692 
21693       if (!CONST_INT_P (elt_op)
21694 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
21695 	return false;
21696     }
21697   return true;
21698 }
21699 
21700 /* Return a PARALLEL containing NELTS elements, with element I equal
21701    to BASE + I * STEP.  */
21702 
21703 rtx
21704 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21705 {
21706   rtvec vec = rtvec_alloc (nelts);
21707   for (unsigned int i = 0; i < nelts; ++i)
21708     RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21709   return gen_rtx_PARALLEL (VOIDmode, vec);
21710 }
21711 
21712 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21713    series with step STEP.  */
21714 
21715 bool
21716 aarch64_stepped_int_parallel_p (rtx op, int step)
21717 {
21718   if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21719     return false;
21720 
21721   unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21722   for (int i = 1; i < XVECLEN (op, 0); ++i)
21723     if (!CONST_INT_P (XVECEXP (op, 0, i))
21724 	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21725       return false;
21726 
21727   return true;
21728 }
21729 
21730 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
21731    HIGH (exclusive).  */
21732 void
21733 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21734 			  const_tree exp)
21735 {
21736   HOST_WIDE_INT lane;
21737   gcc_assert (CONST_INT_P (operand));
21738   lane = INTVAL (operand);
21739 
21740   if (lane < low || lane >= high)
21741   {
21742     if (exp)
21743       error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21744 		lane, low, high - 1);
21745     else
21746       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21747   }
21748 }
21749 
21750 /* Perform endian correction on lane number N, which indexes a vector
21751    of mode MODE, and return the result as an SImode rtx.  */
21752 
21753 rtx
21754 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21755 {
21756   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
21757 }
21758 
21759 /* Return TRUE if OP is a valid vector addressing mode.  */
21760 
21761 bool
21762 aarch64_simd_mem_operand_p (rtx op)
21763 {
21764   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21765 			|| REG_P (XEXP (op, 0)));
21766 }
21767 
21768 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
21769 
21770 bool
21771 aarch64_sve_ld1r_operand_p (rtx op)
21772 {
21773   struct aarch64_address_info addr;
21774   scalar_mode mode;
21775 
21776   return (MEM_P (op)
21777 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
21778 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21779 	  && addr.type == ADDRESS_REG_IMM
21780 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21781 }
21782 
21783 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21784    where the size of the read data is specified by `mode` and the size of the
21785    vector elements is specified by `elem_mode`.  */
21786 bool
21787 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21788 				   scalar_mode elem_mode)
21789 {
21790   struct aarch64_address_info addr;
21791   if (!MEM_P (op)
21792       || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21793     return false;
21794 
21795   if (addr.type == ADDRESS_REG_IMM)
21796     return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21797 
21798   if (addr.type == ADDRESS_REG_REG)
21799     return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21800 
21801   return false;
21802 }
21803 
21804 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
21805 bool
21806 aarch64_sve_ld1rq_operand_p (rtx op)
21807 {
21808   return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21809 					    GET_MODE_INNER (GET_MODE (op)));
21810 }
21811 
21812 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21813    accessing a vector where the element size is specified by `elem_mode`.  */
21814 bool
21815 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21816 {
21817   return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21818 }
21819 
21820 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction.  */
21821 bool
21822 aarch64_sve_ldff1_operand_p (rtx op)
21823 {
21824   if (!MEM_P (op))
21825     return false;
21826 
21827   struct aarch64_address_info addr;
21828   if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21829     return false;
21830 
21831   if (addr.type == ADDRESS_REG_IMM)
21832     return known_eq (addr.const_offset, 0);
21833 
21834   return addr.type == ADDRESS_REG_REG;
21835 }
21836 
21837 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction.  */
21838 bool
21839 aarch64_sve_ldnf1_operand_p (rtx op)
21840 {
21841   struct aarch64_address_info addr;
21842 
21843   return (MEM_P (op)
21844 	  && aarch64_classify_address (&addr, XEXP (op, 0),
21845 				       GET_MODE (op), false)
21846 	  && addr.type == ADDRESS_REG_IMM);
21847 }
21848 
21849 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21850    The conditions for STR are the same.  */
21851 bool
21852 aarch64_sve_ldr_operand_p (rtx op)
21853 {
21854   struct aarch64_address_info addr;
21855 
21856   return (MEM_P (op)
21857 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21858 				       false, ADDR_QUERY_ANY)
21859 	  && addr.type == ADDRESS_REG_IMM);
21860 }
21861 
21862 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21863    addressing memory of mode MODE.  */
21864 bool
21865 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21866 {
21867   struct aarch64_address_info addr;
21868   if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21869     return false;
21870 
21871   if (addr.type == ADDRESS_REG_IMM)
21872     return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21873 
21874   return addr.type == ADDRESS_REG_REG;
21875 }
21876 
21877 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21878    We need to be able to access the individual pieces, so the range
21879    is different from LD[234] and ST[234].  */
21880 bool
21881 aarch64_sve_struct_memory_operand_p (rtx op)
21882 {
21883   if (!MEM_P (op))
21884     return false;
21885 
21886   machine_mode mode = GET_MODE (op);
21887   struct aarch64_address_info addr;
21888   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21889 				 ADDR_QUERY_ANY)
21890       || addr.type != ADDRESS_REG_IMM)
21891     return false;
21892 
21893   poly_int64 first = addr.const_offset;
21894   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21895   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21896 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21897 }
21898 
21899 /* Emit a register copy from operand to operand, taking care not to
21900    early-clobber source registers in the process.
21901 
21902    COUNT is the number of components into which the copy needs to be
21903    decomposed.  */
21904 void
21905 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21906 				unsigned int count)
21907 {
21908   unsigned int i;
21909   int rdest = REGNO (operands[0]);
21910   int rsrc = REGNO (operands[1]);
21911 
21912   if (!reg_overlap_mentioned_p (operands[0], operands[1])
21913       || rdest < rsrc)
21914     for (i = 0; i < count; i++)
21915       emit_move_insn (gen_rtx_REG (mode, rdest + i),
21916 		      gen_rtx_REG (mode, rsrc + i));
21917   else
21918     for (i = 0; i < count; i++)
21919       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21920 		      gen_rtx_REG (mode, rsrc + count - i - 1));
21921 }
21922 
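/* Reader's note (illustrative only, not part of the GCC sources): the copy
   direction matters when the register ranges overlap.  Moving a
   two-register value from {V1, V2} to {V2, V3} must copy V2->V3 before
   V1->V2, otherwise the first copy would clobber the source of the second;
   since rdest > rsrc and the operands overlap, the loop above runs in
   reverse order for exactly that case.  */
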
21923 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21924    one of VSTRUCT modes: OI, CI, or XI.  */
21925 int
21926 aarch64_simd_attr_length_rglist (machine_mode mode)
21927 {
21928   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
21929   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21930 }
21931 
21932 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
21933    alignment of a vector to 128 bits.  SVE predicates have an alignment of
21934    16 bits.  */
21935 static HOST_WIDE_INT
21936 aarch64_simd_vector_alignment (const_tree type)
21937 {
21938   /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21939      be set for non-predicate vectors of booleans.  Modes are the most
21940      direct way we have of identifying real SVE predicate types.  */
21941   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21942     return 16;
21943   widest_int min_size
21944     = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21945   return wi::umin (min_size, 128).to_uhwi ();
21946 }
21947 
21948 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
21949 static poly_uint64
21950 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21951 {
21952   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21953     {
21954       /* If the length of the vector is a fixed power of 2, try to align
21955 	 to that length, otherwise don't try to align at all.  */
21956       HOST_WIDE_INT result;
21957       if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21958 	  || !pow2p_hwi (result))
21959 	result = TYPE_ALIGN (TREE_TYPE (type));
21960       return result;
21961     }
21962   return TYPE_ALIGN (type);
21963 }
21964 
21965 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
21966 static bool
21967 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21968 {
21969   if (is_packed)
21970     return false;
21971 
21972   /* For fixed-length vectors, check that the vectorizer will aim for
21973      full-vector alignment.  This isn't true for generic GCC vectors
21974      that are wider than the ABI maximum of 128 bits.  */
21975   poly_uint64 preferred_alignment =
21976     aarch64_vectorize_preferred_vector_alignment (type);
21977   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21978       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21979 		   preferred_alignment))
21980     return false;
21981 
21982   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
21983   return true;
21984 }
21985 
21986 /* Return true if the vector misalignment factor is supported by the
21987    target.  */
21988 static bool
21989 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21990 					     const_tree type, int misalignment,
21991 					     bool is_packed)
21992 {
21993   if (TARGET_SIMD && STRICT_ALIGNMENT)
21994     {
21995       /* Return if movmisalign pattern is not supported for this mode.  */
21996       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21997         return false;
21998 
21999       /* Misalignment factor is unknown at compile time.  */
22000       if (misalignment == -1)
22001 	return false;
22002     }
22003   return default_builtin_support_vector_misalignment (mode, type, misalignment,
22004 						      is_packed);
22005 }
22006 
22007 /* If VALS is a vector constant that can be loaded into a register
22008    using DUP, generate instructions to do so and return an RTX to
22009    assign to the register.  Otherwise return NULL_RTX.  */
22010 static rtx
22011 aarch64_simd_dup_constant (rtx vals)
22012 {
22013   machine_mode mode = GET_MODE (vals);
22014   machine_mode inner_mode = GET_MODE_INNER (mode);
22015   rtx x;
22016 
22017   if (!const_vec_duplicate_p (vals, &x))
22018     return NULL_RTX;
22019 
22020   /* We can load this constant by using DUP and a constant in a
22021      single ARM register.  This will be cheaper than a vector
22022      load.  */
22023   x = copy_to_mode_reg (inner_mode, x);
22024   return gen_vec_duplicate (mode, x);
22025 }
22026 
22027 
22028 /* Generate code to load VALS, which is a PARALLEL containing only
22029    constants (for vec_init) or CONST_VECTOR, efficiently into a
22030    register.  Returns an RTX to copy into the register, or NULL_RTX
22031    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
22032 static rtx
22033 aarch64_simd_make_constant (rtx vals)
22034 {
22035   machine_mode mode = GET_MODE (vals);
22036   rtx const_dup;
22037   rtx const_vec = NULL_RTX;
22038   int n_const = 0;
22039   int i;
22040 
22041   if (CONST_VECTOR_P (vals))
22042     const_vec = vals;
22043   else if (GET_CODE (vals) == PARALLEL)
22044     {
22045       /* A CONST_VECTOR must contain only CONST_INTs and
22046 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
22047 	 Only store valid constants in a CONST_VECTOR.  */
22048       int n_elts = XVECLEN (vals, 0);
22049       for (i = 0; i < n_elts; ++i)
22050 	{
22051 	  rtx x = XVECEXP (vals, 0, i);
22052 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22053 	    n_const++;
22054 	}
22055       if (n_const == n_elts)
22056 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22057     }
22058   else
22059     gcc_unreachable ();
22060 
22061   if (const_vec != NULL_RTX
22062       && aarch64_simd_valid_immediate (const_vec, NULL))
22063     /* Load using MOVI/MVNI.  */
22064     return const_vec;
22065   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22066     /* Loaded using DUP.  */
22067     return const_dup;
22068   else if (const_vec != NULL_RTX)
22069     /* Load from constant pool. We cannot take advantage of single-cycle
22070        LD1 because we need a PC-relative addressing mode.  */
22071     return const_vec;
22072   else
22073     /* A PARALLEL containing something not valid inside CONST_VECTOR.
22074        We cannot construct an initializer.  */
22075     return NULL_RTX;
22076 }
22077 
22078 /* Expand a vector initialisation sequence, such that TARGET is
22079    initialised to contain VALS.  */
22080 
22081 void
22082 aarch64_expand_vector_init (rtx target, rtx vals)
22083 {
22084   machine_mode mode = GET_MODE (target);
22085   scalar_mode inner_mode = GET_MODE_INNER (mode);
22086   /* The number of vector elements.  */
22087   int n_elts = XVECLEN (vals, 0);
22088   /* The number of vector elements which are not constant.  */
22089   int n_var = 0;
22090   rtx any_const = NULL_RTX;
22091   /* The first element of vals.  */
22092   rtx v0 = XVECEXP (vals, 0, 0);
22093   bool all_same = true;
22094 
22095   /* This is a special vec_init<M><N> where N is not an element mode but a
22096      vector mode with half the elements of M.  We expect to find two entries
22097      of mode N in VALS and we must put their concatenation into TARGET.  */
22098   if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22099     {
22100       machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22101       gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22102 		  && known_eq (GET_MODE_SIZE (mode),
22103 			       2 * GET_MODE_SIZE (narrow_mode)));
22104       emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22105 					 XVECEXP (vals, 0, 0),
22106 					 XVECEXP (vals, 0, 1)));
22107      return;
22108    }
22109 
22110   /* Count the number of variable elements to initialise.  */
22111   for (int i = 0; i < n_elts; ++i)
22112     {
22113       rtx x = XVECEXP (vals, 0, i);
22114       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22115 	++n_var;
22116       else
22117 	any_const = x;
22118 
22119       all_same &= rtx_equal_p (x, v0);
22120     }
22121 
22122   /* No variable elements, hand off to aarch64_simd_make_constant which knows
22123      how best to handle this.  */
22124   if (n_var == 0)
22125     {
22126       rtx constant = aarch64_simd_make_constant (vals);
22127       if (constant != NULL_RTX)
22128 	{
22129 	  emit_move_insn (target, constant);
22130 	  return;
22131 	}
22132     }
22133 
22134   /* Splat a single non-constant element if we can.  */
22135   if (all_same)
22136     {
22137       rtx x = copy_to_mode_reg (inner_mode, v0);
22138       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22139       return;
22140     }
22141 
22142   enum insn_code icode = optab_handler (vec_set_optab, mode);
22143   gcc_assert (icode != CODE_FOR_nothing);
22144 
22145   /* If there are only variable elements, try to optimize
22146      the insertion using dup for the most common element
22147      followed by insertions.  */
22148 
22149   /* The algorithm will fill matches[*][0] with the earliest matching element,
22150      and matches[X][1] with the count of duplicate elements (if X is the
22151      earliest element which has duplicates).  */
22152 
22153   if (n_var == n_elts && n_elts <= 16)
22154     {
22155       int matches[16][2] = {0};
22156       for (int i = 0; i < n_elts; i++)
22157 	{
22158 	  for (int j = 0; j <= i; j++)
22159 	    {
22160 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22161 		{
22162 		  matches[i][0] = j;
22163 		  matches[j][1]++;
22164 		  break;
22165 		}
22166 	    }
22167 	}
22168       int maxelement = 0;
22169       int maxv = 0;
22170       for (int i = 0; i < n_elts; i++)
22171 	if (matches[i][1] > maxv)
22172 	  {
22173 	    maxelement = i;
22174 	    maxv = matches[i][1];
22175 	  }
22176 
22177       /* Create a duplicate of the most common element, unless all elements
22178 	 are equally useless to us, in which case just immediately set the
22179 	 vector register using the first element.  */
22180 
22181       if (maxv == 1)
22182 	{
22183 	  /* For vectors of two 64-bit elements, we can do even better.  */
22184 	  if (n_elts == 2
22185 	      && (inner_mode == E_DImode
22186 		  || inner_mode == E_DFmode))
22187 
22188 	    {
22189 	      rtx x0 = XVECEXP (vals, 0, 0);
22190 	      rtx x1 = XVECEXP (vals, 0, 1);
22191 	      /* Combine can pick up this case, but handling it directly
22192 		 here leaves clearer RTL.
22193 
22194 		 This is load_pair_lanes<mode>, and also gives us a clean-up
22195 		 for store_pair_lanes<mode>.  */
22196 	      if (memory_operand (x0, inner_mode)
22197 		  && memory_operand (x1, inner_mode)
22198 		  && aarch64_mergeable_load_pair_p (mode, x0, x1))
22199 		{
22200 		  rtx t;
22201 		  if (inner_mode == DFmode)
22202 		    t = gen_load_pair_lanesdf (target, x0, x1);
22203 		  else
22204 		    t = gen_load_pair_lanesdi (target, x0, x1);
22205 		  emit_insn (t);
22206 		  return;
22207 		}
22208 	    }
22209 	  /* The subreg-move sequence below will move into lane zero of the
22210 	     vector register.  For big-endian we want that position to hold
22211 	     the last element of VALS.  */
22212 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22213 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22214 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22215 	}
22216       else
22217 	{
22218 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22219 	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22220 	}
22221 
22222       /* Insert the rest.  */
22223       for (int i = 0; i < n_elts; i++)
22224 	{
22225 	  rtx x = XVECEXP (vals, 0, i);
22226 	  if (matches[i][0] == maxelement)
22227 	    continue;
22228 	  x = copy_to_mode_reg (inner_mode, x);
22229 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22230 	}
22231       return;
22232     }
22233 
22234   /* Initialise a vector which is part-variable.  We want to first try
22235      to build those lanes which are constant in the most efficient way we
22236      can.  */
22237   if (n_var != n_elts)
22238     {
22239       rtx copy = copy_rtx (vals);
22240 
22241       /* Load constant part of vector.  We really don't care what goes into the
22242 	 parts we will overwrite, but we're more likely to be able to load the
22243 	 constant efficiently if it has fewer, larger, repeating parts
22244 	 (see aarch64_simd_valid_immediate).  */
22245       for (int i = 0; i < n_elts; i++)
22246 	{
22247 	  rtx x = XVECEXP (vals, 0, i);
22248 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22249 	    continue;
22250 	  rtx subst = any_const;
22251 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
22252 	    {
22253 	      /* Look in the copied vector, as more elements are const.  */
22254 	      rtx test = XVECEXP (copy, 0, i ^ bit);
22255 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22256 		{
22257 		  subst = test;
22258 		  break;
22259 		}
22260 	    }
22261 	  XVECEXP (copy, 0, i) = subst;
22262 	}
22263       aarch64_expand_vector_init (target, copy);
22264     }
22265 
22266   /* Insert the variable lanes directly.  */
22267   for (int i = 0; i < n_elts; i++)
22268     {
22269       rtx x = XVECEXP (vals, 0, i);
22270       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22271 	continue;
22272       x = copy_to_mode_reg (inner_mode, x);
22273       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22274     }
22275 }
22276 
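/* Reader's note (illustrative only, not part of the GCC sources): a worked
   example of the matches[][] bookkeeping above.  For VALS = {a, b, a, a}
   the loop records matches[0] = {0, 3}, matches[1] = {1, 1},
   matches[2] = {0, 0} and matches[3] = {0, 0}, so maxelement is 0 and the
   expansion broadcasts a (a vec_duplicate, typically a DUP) and then
   patches only lane 1 with b via the vec_set pattern.  */
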
22277 /* Emit RTL corresponding to:
22278    insr TARGET, ELEM.  */
22279 
22280 static void
22281 emit_insr (rtx target, rtx elem)
22282 {
22283   machine_mode mode = GET_MODE (target);
22284   scalar_mode elem_mode = GET_MODE_INNER (mode);
22285   elem = force_reg (elem_mode, elem);
22286 
22287   insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22288   gcc_assert (icode != CODE_FOR_nothing);
22289   emit_insn (GEN_FCN (icode) (target, target, elem));
22290 }
22291 
22292 /* Subroutine of aarch64_sve_expand_vector_init for handling
22293    trailing constants.
22294    This function works as follows:
22295    (a) Create a new vector consisting of trailing constants.
22296    (b) Initialize TARGET with the constant vector using emit_move_insn.
22297    (c) Insert remaining elements in TARGET using insr.
22298    NELTS is the total number of elements in the original vector, while
22299    NELTS_REQD is the number of elements that are actually
22300    significant.
22301 
22302    ??? The heuristic used is to do above only if number of constants
22303    ??? The heuristic used is to do the above only if the number of constants
22304    is at least half the total number of elements.  May need fine-tuning.  */
22305 static bool
22306 aarch64_sve_expand_vector_init_handle_trailing_constants
22307  (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22308 {
22309   machine_mode mode = GET_MODE (target);
22310   scalar_mode elem_mode = GET_MODE_INNER (mode);
22311   int n_trailing_constants = 0;
22312 
22313   for (int i = nelts_reqd - 1;
22314        i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22315        i--)
22316     n_trailing_constants++;
22317 
22318   if (n_trailing_constants >= nelts_reqd / 2)
22319     {
22320       /* Try to use the natural pattern of BUILDER to extend the trailing
22321 	 constant elements to a full vector.  Replace any variables in the
22322 	 extra elements with zeros.
22323 
22324 	 ??? It would be better if the builders supported "don't care"
22325 	     elements, with the builder filling in whichever elements
22326 	     give the most compact encoding.  */
22327       rtx_vector_builder v (mode, nelts, 1);
22328       for (int i = 0; i < nelts; i++)
22329 	{
22330 	  rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22331 	  if (!valid_for_const_vector_p (elem_mode, x))
22332 	    x = CONST0_RTX (elem_mode);
22333 	  v.quick_push (x);
22334 	}
22335       rtx const_vec = v.build ();
22336       emit_move_insn (target, const_vec);
22337 
22338       for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22339 	emit_insr (target, builder.elt (i));
22340 
22341       return true;
22342     }
22343 
22344   return false;
22345 }
22346 
22347 /* Subroutine of aarch64_sve_expand_vector_init.
22348    Works as follows:
22349    (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22350    (b) Skip trailing elements from BUILDER, which are the same as
22351        element NELTS_REQD - 1.
22352    (c) Insert earlier elements in reverse order in TARGET using insr.  */
22353 
22354 static void
22355 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22356 					     const rtx_vector_builder &builder,
22357 					     int nelts_reqd)
22358 {
22359   machine_mode mode = GET_MODE (target);
22360   scalar_mode elem_mode = GET_MODE_INNER (mode);
22361 
22362   struct expand_operand ops[2];
22363   enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22364   gcc_assert (icode != CODE_FOR_nothing);
22365 
22366   create_output_operand (&ops[0], target, mode);
22367   create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22368   expand_insn (icode, 2, ops);
22369 
22370   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22371   for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22372     emit_insr (target, builder.elt (i));
22373 }
22374 
22375 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22376    when all trailing elements of builder are same.
22377    This works as follows:
22378    (a) Use expand_insn interface to broadcast last vector element in TARGET.
22379    (b) Insert remaining elements in TARGET using insr.
22380 
22381    ??? The heuristic used is to do above if number of same trailing elements
22382    ??? The heuristic used is to do the above if the number of identical trailing
22383    elements is at least 3/4 of the total number of elements, loosely based on
22384 
22385 static bool
22386 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22387  (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22388 {
22389   int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22390   if (ndups >= (3 * nelts_reqd) / 4)
22391     {
22392       aarch64_sve_expand_vector_init_insert_elems (target, builder,
22393 						   nelts_reqd - ndups + 1);
22394       return true;
22395     }
22396 
22397   return false;
22398 }
22399 
22400 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22401    of elements in BUILDER.
22402 
22403    The function tries to initialize TARGET from BUILDER if it fits one
22404    of the special cases outlined below.
22405 
22406    Failing that, the function divides BUILDER into two sub-vectors:
22407    v_even = even elements of BUILDER;
22408    v_odd = odd elements of BUILDER;
22409 
22410    and recursively calls itself with v_even and v_odd.
22411 
22412    if (recursive call succeeded for v_even or v_odd)
22413      TARGET = zip (v_even, v_odd)
22414 
22415    The function returns true if it managed to build TARGET from BUILDER
22416    with one of the special cases, false otherwise.
22417 
22418    Example: {a, 1, b, 2, c, 3, d, 4}
22419 
22420    The vector gets divided into:
22421    v_even = {a, b, c, d}
22422    v_odd = {1, 2, 3, 4}
22423 
22424    aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22425    initializes tmp2 from the constant vector v_odd using emit_move_insn.
22426 
22427    aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22428    4 elements, so we construct tmp1 from v_even using insr:
22429    tmp1 = dup(d)
22430    insr tmp1, c
22431    insr tmp1, b
22432    insr tmp1, a
22433 
22434    And finally:
22435    TARGET = zip (tmp1, tmp2)
22436    which sets TARGET to {a, 1, b, 2, c, 3, d, 4}.  */
22437 
22438 static bool
22439 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22440 				int nelts, int nelts_reqd)
22441 {
22442   machine_mode mode = GET_MODE (target);
22443 
22444   /* Case 1: Vector contains trailing constants.  */
22445 
22446   if (aarch64_sve_expand_vector_init_handle_trailing_constants
22447        (target, builder, nelts, nelts_reqd))
22448     return true;
22449 
22450   /* Case 2: Vector contains leading constants.  */
22451 
22452   rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22453   for (int i = 0; i < nelts_reqd; i++)
22454     rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22455   rev_builder.finalize ();
22456 
22457   if (aarch64_sve_expand_vector_init_handle_trailing_constants
22458        (target, rev_builder, nelts, nelts_reqd))
22459     {
22460       emit_insn (gen_aarch64_sve_rev (mode, target, target));
22461       return true;
22462     }
22463 
22464   /* Case 3: Vector contains trailing same element.  */
22465 
22466   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22467        (target, builder, nelts_reqd))
22468     return true;
22469 
22470   /* Case 4: Vector contains leading same element.  */
22471 
22472   if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22473        (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22474     {
22475       emit_insn (gen_aarch64_sve_rev (mode, target, target));
22476       return true;
22477     }
22478 
22479   /* Avoid recursing below 4 elements.
22480      ??? The threshold 4 may need fine-tuning.  */
22481 
22482   if (nelts_reqd <= 4)
22483     return false;
22484 
22485   rtx_vector_builder v_even (mode, nelts, 1);
22486   rtx_vector_builder v_odd (mode, nelts, 1);
22487 
22488   for (int i = 0; i < nelts * 2; i += 2)
22489     {
22490       v_even.quick_push (builder.elt (i));
22491       v_odd.quick_push (builder.elt (i + 1));
22492     }
22493 
22494   v_even.finalize ();
22495   v_odd.finalize ();
22496 
22497   rtx tmp1 = gen_reg_rtx (mode);
22498   bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22499 						    nelts, nelts_reqd / 2);
22500 
22501   rtx tmp2 = gen_reg_rtx (mode);
22502   bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22503 						   nelts, nelts_reqd / 2);
22504 
22505   if (!did_even_p && !did_odd_p)
22506     return false;
22507 
22508   /* Initialize v_even and v_odd using INSR if they didn't match any of the
22509      special cases, then zip v_even and v_odd.  */
22510 
22511   if (!did_even_p)
22512     aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22513 
22514   if (!did_odd_p)
22515     aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22516 
22517   rtvec v = gen_rtvec (2, tmp1, tmp2);
22518   emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22519   return true;
22520 }
22521 
22522 /* Initialize register TARGET from the elements in PARALLEL rtx VALS.  */
22523 
22524 void
22525 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22526 {
22527   machine_mode mode = GET_MODE (target);
22528   int nelts = XVECLEN (vals, 0);
22529 
22530   rtx_vector_builder v (mode, nelts, 1);
22531   for (int i = 0; i < nelts; i++)
22532     v.quick_push (XVECEXP (vals, 0, i));
22533   v.finalize ();
22534 
22535   /* If neither sub-vector of v could be initialized specially,
22536      then use INSR to insert all elements from v into TARGET.
22537      ??? This might not be optimal for vectors with large
22538      initializers like 16-element or above.
22539      For nelts < 4, it probably isn't useful to handle specially.  */
22540 
22541   if (nelts < 4
22542       || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22543     aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22544 }
22545 
22546 /* Check whether VALUE is a vector constant in which every element
22547    is either a power of 2 or a negated power of 2.  If so, return
22548    a constant vector of log2s, and flip CODE between PLUS and MINUS
22549    if VALUE contains negated powers of 2.  Return NULL_RTX otherwise.  */
22550 
22551 static rtx
22552 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22553 {
22554   if (!CONST_VECTOR_P (value))
22555     return NULL_RTX;
22556 
22557   rtx_vector_builder builder;
22558   if (!builder.new_unary_operation (GET_MODE (value), value, false))
22559     return NULL_RTX;
22560 
22561   scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22562   /* 1 if the result of the multiplication must be negated,
22563      0 if it mustn't, or -1 if we don't yet care.  */
22564   int negate = -1;
22565   unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22566   for (unsigned int i = 0; i < encoded_nelts; ++i)
22567     {
22568       rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22569       if (!CONST_SCALAR_INT_P (elt))
22570 	return NULL_RTX;
22571       rtx_mode_t val (elt, int_mode);
22572       wide_int pow2 = wi::neg (val);
22573       if (val != pow2)
22574 	{
22575 	  /* It matters whether we negate or not.  Make that choice,
22576 	     and make sure that it's consistent with previous elements.  */
22577 	  if (negate == !wi::neg_p (val))
22578 	    return NULL_RTX;
22579 	  negate = wi::neg_p (val);
22580 	  if (!negate)
22581 	    pow2 = val;
22582 	}
22583       /* POW2 is now the value that we want to be a power of 2.  */
22584       int shift = wi::exact_log2 (pow2);
22585       if (shift < 0)
22586 	return NULL_RTX;
22587       builder.quick_push (gen_int_mode (shift, int_mode));
22588     }
22589   if (negate == -1)
22590     /* PLUS and MINUS are equivalent; canonicalize on PLUS.  */
22591     code = PLUS;
22592   else if (negate == 1)
22593     code = code == PLUS ? MINUS : PLUS;
22594   return builder.build ();
22595 }
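
/* Illustrative examples of the transformation above:

     VALUE = { 4, 4, 4, 4 },   CODE = PLUS
       -> returns { 2, 2, 2, 2 },  CODE stays PLUS
	  (so x * 4 + y can become (x << 2) + y)

     VALUE = { -8, -8, -8, -8 },  CODE = PLUS
       -> returns { 3, 3, 3, 3 },  CODE becomes MINUS
	  (so x * -8 + y can become y - (x << 3))

   Vectors that mix negated and non-negated powers of 2, or that contain
   an element which is not a power of 2, yield NULL_RTX.  */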
22596 
22597 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22598    CODE is PLUS for the former and MINUS for the latter.  OPERANDS is the
22599    operands array, in the same order as for fma_optab.  Return true if
22600    the function emitted all the necessary instructions, false if the caller
22601    should generate the pattern normally with the new OPERANDS array.  */
22602 
22603 bool
22604 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22605 {
22606   machine_mode mode = GET_MODE (operands[0]);
22607   if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22608     {
22609       rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22610 				  NULL_RTX, true, OPTAB_DIRECT);
22611       force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22612 			  operands[3], product, operands[0], true,
22613 			  OPTAB_DIRECT);
22614       return true;
22615     }
22616   operands[2] = force_reg (mode, operands[2]);
22617   return false;
22618 }
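
/* Illustrative example: for op0 = op1 * op2 + op3 (fma_optab order)
   with OPERANDS[2] equal to the constant splat { 4, ..., 4 }, the code
   above emits

     tmp = OPERANDS[1] << 2
     OPERANDS[0] = OPERANDS[3] + tmp

   and returns true, so no multiply-add instruction is needed.
   Otherwise OPERANDS[2] is forced into a register and the caller
   expands the multiply-add pattern as usual.  */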
22619 
22620 /* Likewise, but for a conditional pattern.  */
22621 
22622 bool
22623 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22624 {
22625   machine_mode mode = GET_MODE (operands[0]);
22626   if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22627     {
22628       rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22629 				  NULL_RTX, true, OPTAB_DIRECT);
22630       emit_insn (gen_cond (code, mode, operands[0], operands[1],
22631 			   operands[4], product, operands[5]));
22632       return true;
22633     }
22634   operands[3] = force_reg (mode, operands[3]);
22635   return false;
22636 }
22637 
22638 static unsigned HOST_WIDE_INT
22639 aarch64_shift_truncation_mask (machine_mode mode)
22640 {
22641   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22642     return 0;
22643   return GET_MODE_UNIT_BITSIZE (mode) - 1;
22644 }
22645 
22646 /* Select a format to encode pointers in exception handling data.  */
22647 int
22648 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22649 {
22650    int type;
22651    switch (aarch64_cmodel)
22652      {
22653      case AARCH64_CMODEL_TINY:
22654      case AARCH64_CMODEL_TINY_PIC:
22655      case AARCH64_CMODEL_SMALL:
22656      case AARCH64_CMODEL_SMALL_PIC:
22657      case AARCH64_CMODEL_SMALL_SPIC:
22658        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
22659 	  for everything.  */
22660        type = DW_EH_PE_sdata4;
22661        break;
22662      default:
22663        /* No assumptions here.  8-byte relocs required.  */
22664        type = DW_EH_PE_sdata8;
22665        break;
22666      }
22667    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22668 }
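
/* For example (illustrative): under the tiny and small code models a
   reference to a global symbol in EH data is encoded as
   DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4, and a local one
   as DW_EH_PE_pcrel | DW_EH_PE_sdata4; other code models use
   DW_EH_PE_sdata8 instead.  */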
22669 
22670 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
22671 
22672 static void
22673 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22674 {
22675   if (TREE_CODE (decl) == FUNCTION_DECL)
22676     {
22677       arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22678       if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22679 	{
22680 	  fprintf (stream, "\t.variant_pcs\t");
22681 	  assemble_name (stream, name);
22682 	  fprintf (stream, "\n");
22683 	}
22684     }
22685 }
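
/* For example, for a function "foo" (name purely illustrative) whose
   ABI is ARM_PCS_SIMD or ARM_PCS_SVE, the directive emitted above is

	.variant_pcs	foo

   and nothing is emitted for functions using the base PCS.  */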
22686 
22687 /* The last .arch and .tune assembly strings that we printed.  */
22688 static std::string aarch64_last_printed_arch_string;
22689 static std::string aarch64_last_printed_tune_string;
22690 
22691 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
22692    by the function fndecl.  */
22693 
22694 void
22695 aarch64_declare_function_name (FILE *stream, const char* name,
22696 				tree fndecl)
22697 {
22698   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22699 
22700   struct cl_target_option *targ_options;
22701   if (target_parts)
22702     targ_options = TREE_TARGET_OPTION (target_parts);
22703   else
22704     targ_options = TREE_TARGET_OPTION (target_option_current_node);
22705   gcc_assert (targ_options);
22706 
22707   const struct processor *this_arch
22708     = aarch64_get_arch (targ_options->x_explicit_arch);
22709 
22710   uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
22711   std::string extension
22712     = aarch64_get_extension_string_for_isa_flags (isa_flags,
22713 						  this_arch->flags);
22714   /* Only update the assembler .arch string if it is distinct from the last
22715      such string we printed.  */
22716   std::string to_print = this_arch->name + extension;
22717   if (to_print != aarch64_last_printed_arch_string)
22718     {
22719       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22720       aarch64_last_printed_arch_string = to_print;
22721     }
22722 
22723   /* Print the cpu name we're tuning for in the comments; it might be
22724      useful to readers of the generated asm.  Do it only when it changes
22725      from function to function and verbose assembly is requested.  */
22726   const struct processor *this_tune
22727     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
22728 
22729   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22730     {
22731       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22732 		   this_tune->name);
22733       aarch64_last_printed_tune_string = this_tune->name;
22734     }
22735 
22736   aarch64_asm_output_variant_pcs (stream, fndecl, name);
22737 
22738   /* Don't forget the type directive for ELF.  */
22739   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22740   ASM_OUTPUT_LABEL (stream, name);
22741 
22742   cfun->machine->label_is_assembled = true;
22743 }
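
/* Roughly what the directives above produce for a function "foo" whose
   target options differ from the previously printed ones (names are
   illustrative; the exact arch/extension/tune strings depend on the
   command line and any target attributes):

	.arch armv8.2-a+crc
	// .tune cortex-a75		(only with verbose assembly)
	.variant_pcs	foo		(only for SIMD/SVE PCS functions)
	.type	foo, %function
   foo:
 */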
22744 
22745 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  */
22746 
22747 void
22748 aarch64_print_patchable_function_entry (FILE *file,
22749 					unsigned HOST_WIDE_INT patch_area_size,
22750 					bool record_p)
22751 {
22752   if (!cfun->machine->label_is_assembled)
22753     {
22754       /* Emit the patching area before the entry label, if any.  */
22755       default_print_patchable_function_entry (file, patch_area_size,
22756 					      record_p);
22757       return;
22758     }
22759 
22760   rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22761 			       GEN_INT (record_p));
22762   basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22763 
22764   if (!aarch64_bti_enabled ()
22765       || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22766     {
22767       /* Emit the patchable_area at the beginning of the function.  */
22768       rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22769       INSN_ADDRESSES_NEW (insn, -1);
22770       return;
22771     }
22772 
22773   rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22774   if (!insn
22775       || !INSN_P (insn)
22776       || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22777       || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22778     {
22779       /* Emit a BTI_C.  */
22780       insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
22781     }
22782 
22783   /* Emit the patchable_area after BTI_C.  */
22784   insn = emit_insn_after (pa, insn);
22785   INSN_ADDRESSES_NEW (insn, -1);
22786 }
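
/* Sketch of the resulting function entry when BTI is enabled and the
   function can be reached through an indirect call:

	foo:
		bti	c
		<patchable area (NOPs)>
		...

   Without BTI (or when all calls are known to be direct) the patchable
   area is emitted at the very start of the function instead, so the
   BTI landing pad is never separated from indirect call targets.  */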
22787 
22788 /* Output patchable area.  */
22789 
22790 void
22791 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22792 {
22793   default_print_patchable_function_entry (asm_out_file, patch_area_size,
22794 					  record_p);
22795 }
22796 
22797 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
22798 
22799 void
22800 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22801 {
22802   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22803   const char *value = IDENTIFIER_POINTER (target);
22804   aarch64_asm_output_variant_pcs (stream, decl, name);
22805   ASM_OUTPUT_DEF (stream, name, value);
22806 }
22807 
22808 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
22809    function symbol references.  */
22810 
22811 void
22812 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22813 {
22814   default_elf_asm_output_external (stream, decl, name);
22815   aarch64_asm_output_variant_pcs (stream, decl, name);
22816 }
22817 
22818 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22819    Used to output the .cfi_b_key_frame directive when signing the current
22820    function with the B key.  */
22821 
22822 void
22823 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22824 {
22825   if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22826       && aarch64_ra_sign_key == AARCH64_KEY_B)
22827 	asm_fprintf (f, "\t.cfi_b_key_frame\n");
22828 }
22829 
22830 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
22831 
22832 static void
22833 aarch64_start_file (void)
22834 {
22835   struct cl_target_option *default_options
22836     = TREE_TARGET_OPTION (target_option_default_node);
22837 
22838   const struct processor *default_arch
22839     = aarch64_get_arch (default_options->x_explicit_arch);
22840   uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
22841   std::string extension
22842     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22843 						  default_arch->flags);
22844 
22845    aarch64_last_printed_arch_string = default_arch->name + extension;
22846    aarch64_last_printed_tune_string = "";
22847    asm_fprintf (asm_out_file, "\t.arch %s\n",
22848 		aarch64_last_printed_arch_string.c_str ());
22849 
22850    default_file_start ();
22851 }
22852 
22853 /* Emit load exclusive.  */
22854 
22855 static void
22856 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22857 			     rtx mem, rtx model_rtx)
22858 {
22859   if (mode == TImode)
22860     emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22861 						gen_highpart (DImode, rval),
22862 						mem, model_rtx));
22863   else
22864     emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22865 }
22866 
22867 /* Emit store exclusive.  */
22868 
22869 static void
22870 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22871 			      rtx mem, rtx rval, rtx model_rtx)
22872 {
22873   if (mode == TImode)
22874     emit_insn (gen_aarch64_store_exclusive_pair
22875 	       (bval, mem, operand_subword (rval, 0, 0, TImode),
22876 		operand_subword (rval, 1, 0, TImode), model_rtx));
22877   else
22878     emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22879 }
22880 
22881 /* Mark the previous jump instruction as unlikely.  */
22882 
22883 static void
22884 aarch64_emit_unlikely_jump (rtx insn)
22885 {
22886   rtx_insn *jump = emit_jump_insn (insn);
22887   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22888 }
22889 
22890 /* We store the names of the various atomic helpers in a 5x5 array.
22891    Return the libcall function given MODE, MODEL and NAMES.  */
22892 
22893 rtx
22894 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22895 			const atomic_ool_names *names)
22896 {
22897   memmodel model = memmodel_from_int (INTVAL (model_rtx));
22898   int mode_idx, model_idx;
22899 
22900   switch (mode)
22901     {
22902     case E_QImode:
22903       mode_idx = 0;
22904       break;
22905     case E_HImode:
22906       mode_idx = 1;
22907       break;
22908     case E_SImode:
22909       mode_idx = 2;
22910       break;
22911     case E_DImode:
22912       mode_idx = 3;
22913       break;
22914     case E_TImode:
22915       mode_idx = 4;
22916       break;
22917     default:
22918       gcc_unreachable ();
22919     }
22920 
22921   switch (model)
22922     {
22923     case MEMMODEL_RELAXED:
22924       model_idx = 0;
22925       break;
22926     case MEMMODEL_CONSUME:
22927     case MEMMODEL_ACQUIRE:
22928       model_idx = 1;
22929       break;
22930     case MEMMODEL_RELEASE:
22931       model_idx = 2;
22932       break;
22933     case MEMMODEL_ACQ_REL:
22934     case MEMMODEL_SEQ_CST:
22935       model_idx = 3;
22936       break;
22937     case MEMMODEL_SYNC_ACQUIRE:
22938     case MEMMODEL_SYNC_RELEASE:
22939     case MEMMODEL_SYNC_SEQ_CST:
22940       model_idx = 4;
22941       break;
22942     default:
22943       gcc_unreachable ();
22944     }
22945 
22946   return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22947 				      VISIBILITY_HIDDEN);
22948 }
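
/* For example (names follow the DEF0 macro below): an out-of-line
   compare-and-swap on an SImode location with MEMMODEL_ACQUIRE resolves
   to "__aarch64_cas4_acq", and an LDADD on a DImode location with
   MEMMODEL_SEQ_CST resolves to "__aarch64_ldadd8_acq_rel".  */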
22949 
22950 #define DEF0(B, N) \
22951   { "__aarch64_" #B #N "_relax", \
22952     "__aarch64_" #B #N "_acq", \
22953     "__aarch64_" #B #N "_rel", \
22954     "__aarch64_" #B #N "_acq_rel", \
22955     "__aarch64_" #B #N "_sync" }
22956 
22957 #define DEF4(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22958 		 { NULL, NULL, NULL, NULL }
22959 #define DEF5(B)  DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22960 
22961 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22962 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22963 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22964 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22965 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22966 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22967 
22968 #undef DEF0
22969 #undef DEF4
22970 #undef DEF5
22971 
22972 /* Expand a compare and swap pattern.  */
22973 
22974 void
22975 aarch64_expand_compare_and_swap (rtx operands[])
22976 {
22977   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22978   machine_mode mode, r_mode;
22979 
22980   bval = operands[0];
22981   rval = operands[1];
22982   mem = operands[2];
22983   oldval = operands[3];
22984   newval = operands[4];
22985   is_weak = operands[5];
22986   mod_s = operands[6];
22987   mod_f = operands[7];
22988   mode = GET_MODE (mem);
22989 
22990   /* Normally the succ memory model must be stronger than fail, but in the
22991      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22992      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
22993   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22994       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22995     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22996 
22997   r_mode = mode;
22998   if (mode == QImode || mode == HImode)
22999     {
23000       r_mode = SImode;
23001       rval = gen_reg_rtx (r_mode);
23002     }
23003 
23004   if (TARGET_LSE)
23005     {
23006       /* The CAS insn requires oldval and rval overlap, but we need to
23007 	 have a copy of oldval saved across the operation to tell if
23008 	 the operation is successful.  */
23009       if (reg_overlap_mentioned_p (rval, oldval))
23010         rval = copy_to_mode_reg (r_mode, oldval);
23011       else
23012 	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
23013       if (mode == TImode)
23014 	newval = force_reg (mode, newval);
23015 
23016       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
23017 						   newval, mod_s));
23018       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23019     }
23020   else if (TARGET_OUTLINE_ATOMICS)
23021     {
23022       /* Oldval must satisfy compare afterward.  */
23023       if (!aarch64_plus_operand (oldval, mode))
23024 	oldval = force_reg (mode, oldval);
23025       rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23026       rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23027 				      oldval, mode, newval, mode,
23028 				      XEXP (mem, 0), Pmode);
23029       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23030     }
23031   else
23032     {
23033       /* The oldval predicate varies by mode.  Test it and force to reg.  */
23034       insn_code code = code_for_aarch64_compare_and_swap (mode);
23035       if (!insn_data[code].operand[2].predicate (oldval, mode))
23036 	oldval = force_reg (mode, oldval);
23037 
23038       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23039 				 is_weak, mod_s, mod_f));
23040       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23041     }
23042 
23043   if (r_mode != mode)
23044     rval = gen_lowpart (mode, rval);
23045   emit_move_insn (operands[1], rval);
23046 
23047   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
23048   emit_insn (gen_rtx_SET (bval, x));
23049 }
23050 
23051 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
23052    sequence implementing an atomic operation.  */
23053 
23054 static void
23055 aarch64_emit_post_barrier (enum memmodel model)
23056 {
23057   const enum memmodel base_model = memmodel_base (model);
23058 
23059   if (is_mm_sync (model)
23060       && (base_model == MEMMODEL_ACQUIRE
23061 	  || base_model == MEMMODEL_ACQ_REL
23062 	  || base_model == MEMMODEL_SEQ_CST))
23063     {
23064       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23065     }
23066 }
23067 
23068 /* Split a compare and swap pattern.  */
23069 
23070 void
23071 aarch64_split_compare_and_swap (rtx operands[])
23072 {
23073   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
23074   gcc_assert (epilogue_completed);
23075 
23076   rtx rval, mem, oldval, newval, scratch, x, model_rtx;
23077   machine_mode mode;
23078   bool is_weak;
23079   rtx_code_label *label1, *label2;
23080   enum memmodel model;
23081 
23082   rval = operands[0];
23083   mem = operands[1];
23084   oldval = operands[2];
23085   newval = operands[3];
23086   is_weak = (operands[4] != const0_rtx);
23087   model_rtx = operands[5];
23088   scratch = operands[7];
23089   mode = GET_MODE (mem);
23090   model = memmodel_from_int (INTVAL (model_rtx));
23091 
23092   /* When OLDVAL is zero and we want the strong version we can emit a tighter
23093     loop:
23094     .label1:
23095 	LD[A]XR	rval, [mem]
23096 	CBNZ	rval, .label2
23097 	ST[L]XR	scratch, newval, [mem]
23098 	CBNZ	scratch, .label1
23099     .label2:
23100 	CMP	rval, 0.  */
23101   bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23102 			oldval == const0_rtx && mode != TImode);
23103 
23104   label1 = NULL;
23105   if (!is_weak)
23106     {
23107       label1 = gen_label_rtx ();
23108       emit_label (label1);
23109     }
23110   label2 = gen_label_rtx ();
23111 
23112   /* The initial load can be relaxed for a __sync operation since a final
23113      barrier will be emitted to stop code hoisting.  */
23114   if (is_mm_sync (model))
23115     aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23116   else
23117     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23118 
23119   if (strong_zero_p)
23120     x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23121   else
23122     {
23123       rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23124       x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23125     }
23126   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23127 			    gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23128   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23129 
23130   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23131 
23132   if (!is_weak)
23133     {
23134       if (aarch64_track_speculation)
23135 	{
23136 	  /* Emit an explicit compare instruction, so that we can correctly
23137 	     track the condition codes.  */
23138 	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23139 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23140 	}
23141       else
23142 	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23143 
23144       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23145 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23146       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23147     }
23148   else
23149     aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23150 
23151   emit_label (label2);
23152 
23153   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
23154      to set the condition flags.  If this is not used it will be removed by
23155      later passes.  */
23156   if (strong_zero_p)
23157     aarch64_gen_compare_reg (NE, rval, const0_rtx);
23158 
23159   /* Emit any final barrier needed for a __sync operation.  */
23160   if (is_mm_sync (model))
23161     aarch64_emit_post_barrier (model);
23162 }
23163 
23164 /* Split an atomic operation.  */
23165 
23166 void
23167 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23168 			 rtx value, rtx model_rtx, rtx cond)
23169 {
23170   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
23171   gcc_assert (epilogue_completed);
23172 
23173   machine_mode mode = GET_MODE (mem);
23174   machine_mode wmode = (mode == DImode ? DImode : SImode);
23175   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23176   const bool is_sync = is_mm_sync (model);
23177   rtx_code_label *label;
23178   rtx x;
23179 
23180   /* Split the atomic operation into a sequence.  */
23181   label = gen_label_rtx ();
23182   emit_label (label);
23183 
23184   if (new_out)
23185     new_out = gen_lowpart (wmode, new_out);
23186   if (old_out)
23187     old_out = gen_lowpart (wmode, old_out);
23188   else
23189     old_out = new_out;
23190   value = simplify_gen_subreg (wmode, value, mode, 0);
23191 
23192   /* The initial load can be relaxed for a __sync operation since a final
23193      barrier will be emitted to stop code hoisting.  */
23194   if (is_sync)
23195     aarch64_emit_load_exclusive (mode, old_out, mem,
23196 				 GEN_INT (MEMMODEL_RELAXED));
23197   else
23198     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23199 
23200   switch (code)
23201     {
23202     case SET:
23203       new_out = value;
23204       break;
23205 
23206     case NOT:
23207       x = gen_rtx_AND (wmode, old_out, value);
23208       emit_insn (gen_rtx_SET (new_out, x));
23209       x = gen_rtx_NOT (wmode, new_out);
23210       emit_insn (gen_rtx_SET (new_out, x));
23211       break;
23212 
23213     case MINUS:
23214       if (CONST_INT_P (value))
23215 	{
23216 	  value = GEN_INT (-UINTVAL (value));
23217 	  code = PLUS;
23218 	}
23219       /* Fall through.  */
23220 
23221     default:
23222       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23223       emit_insn (gen_rtx_SET (new_out, x));
23224       break;
23225     }
23226 
23227   aarch64_emit_store_exclusive (mode, cond, mem,
23228 				gen_lowpart (mode, new_out), model_rtx);
23229 
23230   if (aarch64_track_speculation)
23231     {
23232       /* Emit an explicit compare instruction, so that we can correctly
23233 	 track the condition codes.  */
23234       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23235       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23236     }
23237   else
23238     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23239 
23240   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23241 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23242   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23243 
23244   /* Emit any final barrier needed for a __sync operation.  */
23245   if (is_sync)
23246     aarch64_emit_post_barrier (model);
23247 }
23248 
23249 static void
23250 aarch64_init_libfuncs (void)
23251 {
23252    /* Half-precision float operations.  The compiler handles all operations
23253      with NULL libfuncs by converting to SFmode.  */
23254 
23255   /* Conversions.  */
23256   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23257   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23258 
23259   /* Arithmetic.  */
23260   set_optab_libfunc (add_optab, HFmode, NULL);
23261   set_optab_libfunc (sdiv_optab, HFmode, NULL);
23262   set_optab_libfunc (smul_optab, HFmode, NULL);
23263   set_optab_libfunc (neg_optab, HFmode, NULL);
23264   set_optab_libfunc (sub_optab, HFmode, NULL);
23265 
23266   /* Comparisons.  */
23267   set_optab_libfunc (eq_optab, HFmode, NULL);
23268   set_optab_libfunc (ne_optab, HFmode, NULL);
23269   set_optab_libfunc (lt_optab, HFmode, NULL);
23270   set_optab_libfunc (le_optab, HFmode, NULL);
23271   set_optab_libfunc (ge_optab, HFmode, NULL);
23272   set_optab_libfunc (gt_optab, HFmode, NULL);
23273   set_optab_libfunc (unord_optab, HFmode, NULL);
23274 }
23275 
23276 /* Target hook for c_mode_for_suffix.  */
23277 static machine_mode
23278 aarch64_c_mode_for_suffix (char suffix)
23279 {
23280   if (suffix == 'q')
23281     return TFmode;
23282 
23283   return VOIDmode;
23284 }
23285 
23286 /* We can only represent floating point constants which will fit in
23287    "quarter-precision" values.  These values are characterised by
23288    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
23289    by:
23290 
23291    (-1)^s * (n/16) * 2^r
23292 
23293    Where:
23294      's' is the sign bit.
23295      'n' is an integer in the range 16 <= n <= 31.
23296      'r' is an integer in the range -3 <= r <= 4.  */
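
/* Worked examples of the encoding above (illustrative): 1.0 is
   (-1)^0 * (16/16) * 2^0; 0.125 = (16/16) * 2^-3 is the smallest
   positive representable value; 31.0 = (31/16) * 2^4 is the largest.
   A value such as 0.1 has no exact representation of this form, and
   0.0 is rejected explicitly by the function below.  */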
23297 
23298 /* Return true iff X can be represented by a quarter-precision
23299    floating point immediate operand X.  Note, we cannot represent 0.0.  */
23300 bool
23301 aarch64_float_const_representable_p (rtx x)
23302 {
23303   /* This represents our current view of how many bits
23304      make up the mantissa.  */
23305   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23306   int exponent;
23307   unsigned HOST_WIDE_INT mantissa, mask;
23308   REAL_VALUE_TYPE r, m;
23309   bool fail;
23310 
23311   x = unwrap_const_vec_duplicate (x);
23312   if (!CONST_DOUBLE_P (x))
23313     return false;
23314 
23315   if (GET_MODE (x) == VOIDmode
23316       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23317     return false;
23318 
23319   r = *CONST_DOUBLE_REAL_VALUE (x);
23320 
23321   /* We cannot represent infinities, NaNs or +/-zero.  We won't
23322      know if we have +zero until we analyse the mantissa, but we
23323      can reject the other invalid values.  */
23324   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23325       || REAL_VALUE_MINUS_ZERO (r))
23326     return false;
23327 
23328   /* Extract exponent.  */
23329   r = real_value_abs (&r);
23330   exponent = REAL_EXP (&r);
23331 
23332   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23333      highest (sign) bit, with a fixed binary point at bit point_pos.
23334      m1 holds the low part of the mantissa, m2 the high part.
23335      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23336      bits for the mantissa, this can fail (low bits will be lost).  */
23337   real_ldexp (&m, &r, point_pos - exponent);
23338   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23339 
23340   /* If the low part of the mantissa has bits set we cannot represent
23341      the value.  */
23342   if (w.ulow () != 0)
23343     return false;
23344   /* We have rejected the lower HOST_WIDE_INT, so update our
23345      understanding of how many bits lie in the mantissa and
23346      look only at the high HOST_WIDE_INT.  */
23347   mantissa = w.elt (1);
23348   point_pos -= HOST_BITS_PER_WIDE_INT;
23349 
23350   /* We can only represent values with a mantissa of the form 1.xxxx.  */
23351   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23352   if ((mantissa & mask) != 0)
23353     return false;
23354 
23355   /* Having filtered unrepresentable values, we may now remove all
23356      but the highest 5 bits.  */
23357   mantissa >>= point_pos - 5;
23358 
23359   /* We cannot represent the value 0.0, so reject it.  This is handled
23360      elsewhere.  */
23361   if (mantissa == 0)
23362     return false;
23363 
23364   /* Then, as bit 4 is always set, we can mask it off, leaving
23365      the mantissa in the range [0, 15].  */
23366   mantissa &= ~(1 << 4);
23367   gcc_assert (mantissa <= 15);
23368 
23369   /* GCC internally does not use IEEE754-like encoding (where normalized
23370      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.cc).
23371      Our mantissa values are shifted 4 places to the left relative to
23372      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23373      by 5 places to correct for GCC's representation.  */
23374   exponent = 5 - exponent;
23375 
23376   return (exponent >= 0 && exponent <= 7);
23377 }
23378 
23379 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23380    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
23381    output MOVI/MVNI, ORR or BIC immediate.  */
23382 char*
23383 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23384 				   enum simd_immediate_check which)
23385 {
23386   bool is_valid;
23387   static char templ[40];
23388   const char *mnemonic;
23389   const char *shift_op;
23390   unsigned int lane_count = 0;
23391   char element_char;
23392 
23393   struct simd_immediate_info info;
23394 
23395   /* This will return true to show const_vector is legal for use as either
23396      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23397      It will also update INFO to show how the immediate should be generated.
23398      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
23399   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23400   gcc_assert (is_valid);
23401 
23402   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23403   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23404 
23405   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23406     {
23407       gcc_assert (info.insn == simd_immediate_info::MOV
23408 		  && info.u.mov.shift == 0);
23409       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23410 	 move immediate path.  */
23411       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23412         info.u.mov.value = GEN_INT (0);
23413       else
23414 	{
23415 	  const unsigned int buf_size = 20;
23416 	  char float_buf[buf_size] = {'\0'};
23417 	  real_to_decimal_for_mode (float_buf,
23418 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23419 				    buf_size, buf_size, 1, info.elt_mode);
23420 
23421 	  if (lane_count == 1)
23422 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23423 	  else
23424 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23425 		      lane_count, element_char, float_buf);
23426 	  return templ;
23427 	}
23428     }
23429 
23430   gcc_assert (CONST_INT_P (info.u.mov.value));
23431 
23432   if (which == AARCH64_CHECK_MOV)
23433     {
23434       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23435       shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23436 		  ? "msl" : "lsl");
23437       if (lane_count == 1)
23438 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23439 		  mnemonic, UINTVAL (info.u.mov.value));
23440       else if (info.u.mov.shift)
23441 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23442 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23443 		  element_char, UINTVAL (info.u.mov.value), shift_op,
23444 		  info.u.mov.shift);
23445       else
23446 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23447 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23448 		  element_char, UINTVAL (info.u.mov.value));
23449     }
23450   else
23451     {
23452       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
23453       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23454       if (info.u.mov.shift)
23455 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23456 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23457 		  element_char, UINTVAL (info.u.mov.value), "lsl",
23458 		  info.u.mov.shift);
23459       else
23460 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23461 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23462 		  element_char, UINTVAL (info.u.mov.value));
23463     }
23464   return templ;
23465 }
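
/* Illustrative examples of the templates returned above (%0 is filled
   in later by the output machinery; the immediate values are arbitrary):

     "movi\t%0.4s, 0x2a"		MOV, no shift
     "mvni\t%0.8h, 0x1, lsl 8"		MVN with a shift modifier
     "fmov\t%0.2d, 1.0e+0"		floating-point splat
     "orr\t%0.4s, #255, lsl #8"		AARCH64_CHECK_ORR form  */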
23466 
23467 char*
23468 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23469 {
23470 
23471   /* If a floating point number was passed and we desire to use it in an
23472      integer mode do the conversion to integer.  */
23473   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23474     {
23475       unsigned HOST_WIDE_INT ival;
23476       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23477 	  gcc_unreachable ();
23478       immediate = gen_int_mode (ival, mode);
23479     }
23480 
23481   machine_mode vmode;
23482   /* Use a 64-bit mode for everything except for DI/DF mode, where we use
23483      a 128-bit vector mode.  */
23484   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23485 
23486   vmode = aarch64_simd_container_mode (mode, width);
23487   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23488   return aarch64_output_simd_mov_immediate (v_op, width);
23489 }
23490 
23491 /* Return the output string to use for moving immediate CONST_VECTOR
23492    into an SVE register.  */
23493 
23494 char *
23495 aarch64_output_sve_mov_immediate (rtx const_vector)
23496 {
23497   static char templ[40];
23498   struct simd_immediate_info info;
23499   char element_char;
23500 
23501   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23502   gcc_assert (is_valid);
23503 
23504   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23505 
23506   machine_mode vec_mode = GET_MODE (const_vector);
23507   if (aarch64_sve_pred_mode_p (vec_mode))
23508     {
23509       static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23510       if (info.insn == simd_immediate_info::MOV)
23511 	{
23512 	  gcc_assert (info.u.mov.value == const0_rtx);
23513 	  snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23514 	}
23515       else
23516 	{
23517 	  gcc_assert (info.insn == simd_immediate_info::PTRUE);
23518 	  unsigned int total_bytes;
23519 	  if (info.u.pattern == AARCH64_SV_ALL
23520 	      && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23521 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23522 		      total_bytes / GET_MODE_SIZE (info.elt_mode));
23523 	  else
23524 	    snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23525 		      svpattern_token (info.u.pattern));
23526 	}
23527       return buf;
23528     }
23529 
23530   if (info.insn == simd_immediate_info::INDEX)
23531     {
23532       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23533 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23534 		element_char, INTVAL (info.u.index.base),
23535 		INTVAL (info.u.index.step));
23536       return templ;
23537     }
23538 
23539   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23540     {
23541       if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23542 	info.u.mov.value = GEN_INT (0);
23543       else
23544 	{
23545 	  const int buf_size = 20;
23546 	  char float_buf[buf_size] = {};
23547 	  real_to_decimal_for_mode (float_buf,
23548 				    CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23549 				    buf_size, buf_size, 1, info.elt_mode);
23550 
23551 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23552 		    element_char, float_buf);
23553 	  return templ;
23554 	}
23555     }
23556 
23557   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23558 	    element_char, INTVAL (info.u.mov.value));
23559   return templ;
23560 }
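
/* Illustrative examples of the templates returned above: "mov\t%0.s, #1"
   for an integer splat, "fmov\t%0.d, #1.0e+0" for a floating-point
   splat, "index\t%0.s, #0, #1" for a linear series, "pfalse\t%0.b" for
   an all-false predicate, and "ptrue\t%0.b, vl16" for an all-true
   predicate when the vector length is known to be 16 bytes.  */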
23561 
23562 /* Return the asm template for a PTRUES.  CONST_UNSPEC is the
23563    aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23564    pattern.  */
23565 
23566 char *
23567 aarch64_output_sve_ptrues (rtx const_unspec)
23568 {
23569   static char templ[40];
23570 
23571   struct simd_immediate_info info;
23572   bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23573   gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23574 
23575   char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23576   snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23577 	    svpattern_token (info.u.pattern));
23578   return templ;
23579 }
23580 
23581 /* Split operands into moves from op[1] + op[2] into op[0].  */
23582 
23583 void
23584 aarch64_split_combinev16qi (rtx operands[3])
23585 {
23586   unsigned int dest = REGNO (operands[0]);
23587   unsigned int src1 = REGNO (operands[1]);
23588   unsigned int src2 = REGNO (operands[2]);
23589   machine_mode halfmode = GET_MODE (operands[1]);
23590   unsigned int halfregs = REG_NREGS (operands[1]);
23591   rtx destlo, desthi;
23592 
23593   gcc_assert (halfmode == V16QImode);
23594 
23595   if (src1 == dest && src2 == dest + halfregs)
23596     {
23597       /* No-op move.  Can't split to nothing; emit something.  */
23598       emit_note (NOTE_INSN_DELETED);
23599       return;
23600     }
23601 
23602   /* Preserve register attributes for variable tracking.  */
23603   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23604   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23605 			       GET_MODE_SIZE (halfmode));
23606 
23607   /* Special case of reversed high/low parts.  */
23608   if (reg_overlap_mentioned_p (operands[2], destlo)
23609       && reg_overlap_mentioned_p (operands[1], desthi))
23610     {
23611       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23612       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23613       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23614     }
23615   else if (!reg_overlap_mentioned_p (operands[2], destlo))
23616     {
23617       /* Try to avoid unnecessary moves if part of the result
23618 	 is in the right place already.  */
23619       if (src1 != dest)
23620 	emit_move_insn (destlo, operands[1]);
23621       if (src2 != dest + halfregs)
23622 	emit_move_insn (desthi, operands[2]);
23623     }
23624   else
23625     {
23626       if (src2 != dest + halfregs)
23627 	emit_move_insn (desthi, operands[2]);
23628       if (src1 != dest)
23629 	emit_move_insn (destlo, operands[1]);
23630     }
23631 }
23632 
23633 /* vec_perm support.  */
23634 
23635 struct expand_vec_perm_d
23636 {
23637   rtx target, op0, op1;
23638   vec_perm_indices perm;
23639   machine_mode vmode;
23640   unsigned int vec_flags;
23641   bool one_vector_p;
23642   bool testing_p;
23643 };
23644 
23645 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23646 
23647 /* Generate a variable permutation.  */
23648 
23649 static void
23650 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23651 {
23652   machine_mode vmode = GET_MODE (target);
23653   bool one_vector_p = rtx_equal_p (op0, op1);
23654 
23655   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23656   gcc_checking_assert (GET_MODE (op0) == vmode);
23657   gcc_checking_assert (GET_MODE (op1) == vmode);
23658   gcc_checking_assert (GET_MODE (sel) == vmode);
23659   gcc_checking_assert (TARGET_SIMD);
23660 
23661   if (one_vector_p)
23662     {
23663       if (vmode == V8QImode)
23664 	{
23665 	  /* Expand the argument to a V16QI mode by duplicating it.  */
23666 	  rtx pair = gen_reg_rtx (V16QImode);
23667 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23668 	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23669 	}
23670       else
23671 	{
23672 	  emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23673 	}
23674     }
23675   else
23676     {
23677       rtx pair;
23678 
23679       if (vmode == V8QImode)
23680 	{
23681 	  pair = gen_reg_rtx (V16QImode);
23682 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23683 	  emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23684 	}
23685       else
23686 	{
23687 	  pair = gen_reg_rtx (V2x16QImode);
23688 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23689 	  emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23690 	}
23691     }
23692 }
23693 
23694 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23695    NELT is the number of elements in the vector.  */
23696 
23697 void
23698 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23699 			 unsigned int nelt)
23700 {
23701   machine_mode vmode = GET_MODE (target);
23702   bool one_vector_p = rtx_equal_p (op0, op1);
23703   rtx mask;
23704 
23705   /* The TBL instruction does not use a modulo index, so we must take care
23706      of that ourselves.  */
23707   mask = aarch64_simd_gen_const_vector_dup (vmode,
23708       one_vector_p ? nelt - 1 : 2 * nelt - 1);
23709   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23710 
23711   /* For big-endian, we also need to reverse the index within the vector
23712      (but not which vector).  */
23713   if (BYTES_BIG_ENDIAN)
23714     {
23715       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
23716       if (!one_vector_p)
23717         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23718       sel = expand_simple_binop (vmode, XOR, sel, mask,
23719 				 NULL, 0, OPTAB_LIB_WIDEN);
23720     }
23721   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
23722 }
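
/* Illustrative example: for V8QImode with a single input vector, the
   selector is first ANDed with { 7, ..., 7 } so that out-of-range
   indices wrap as vec_perm semantics require; on big-endian targets
   each index is additionally XORed with 7 to undo the lane reversal
   before the TBL-based expansion in aarch64_expand_vec_perm_1.  */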
23723 
23724 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
23725 
23726 static void
23727 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23728 {
23729   emit_insn (gen_rtx_SET (target,
23730 			  gen_rtx_UNSPEC (GET_MODE (target),
23731 					  gen_rtvec (2, op0, op1), code)));
23732 }
23733 
23734 /* Expand an SVE vec_perm with the given operands.  */
23735 
23736 void
23737 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23738 {
23739   machine_mode data_mode = GET_MODE (target);
23740   machine_mode sel_mode = GET_MODE (sel);
23741   /* Enforced by the pattern condition.  */
23742   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23743 
23744   /* Note: vec_perm indices are supposed to wrap when they go beyond the
23745      size of the two value vectors, i.e. the upper bits of the indices
23746      are effectively ignored.  SVE TBL instead produces 0 for any
23747      out-of-range indices, so we need to modulo all the vec_perm indices
23748      to ensure they are all in range.  */
23749   rtx sel_reg = force_reg (sel_mode, sel);
23750 
23751   /* Check if the sel only references the first values vector.  */
23752   if (CONST_VECTOR_P (sel)
23753       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23754     {
23755       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23756       return;
23757     }
23758 
23759   /* Check if the two values vectors are the same.  */
23760   if (rtx_equal_p (op0, op1))
23761     {
23762       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23763       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23764 					 NULL, 0, OPTAB_DIRECT);
23765       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23766       return;
23767     }
23768 
23769   /* Run TBL on each value vector and combine the results.  */
23770 
23771   rtx res0 = gen_reg_rtx (data_mode);
23772   rtx res1 = gen_reg_rtx (data_mode);
23773   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23774   if (!CONST_VECTOR_P (sel)
23775       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23776     {
23777       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23778 						       2 * nunits - 1);
23779       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23780 				     NULL, 0, OPTAB_DIRECT);
23781     }
23782   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23783   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23784 				     NULL, 0, OPTAB_DIRECT);
23785   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23786   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23787     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23788   else
23789     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
23790 }
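
/* Sketch of the general two-input case above, for a selector that may
   reference both value vectors (NUNITS elements per vector):

     sel' = sel & (2 * NUNITS - 1)	   enforce vec_perm wrapping
     res0 = TBL (op0, sel')		   out-of-range lanes give 0
     res1 = TBL (op1, sel' - NUNITS)	   likewise
     target = res0 | res1		   (ORR, or UNSPEC_IORF for FP)

   which relies on SVE TBL returning zero for out-of-range indices.  */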
23791 
23792 /* Recognize patterns suitable for the TRN instructions.  */
23793 static bool
23794 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23795 {
23796   HOST_WIDE_INT odd;
23797   poly_uint64 nelt = d->perm.length ();
23798   rtx out, in0, in1, x;
23799   machine_mode vmode = d->vmode;
23800 
23801   if (GET_MODE_UNIT_SIZE (vmode) > 8)
23802     return false;
23803 
23804   /* Note that these are little-endian tests.
23805      We correct for big-endian later.  */
23806   if (!d->perm[0].is_constant (&odd)
23807       || (odd != 0 && odd != 1)
23808       || !d->perm.series_p (0, 2, odd, 2)
23809       || !d->perm.series_p (1, 2, nelt + odd, 2))
23810     return false;
23811 
23812   /* Success!  */
23813   if (d->testing_p)
23814     return true;
23815 
23816   in0 = d->op0;
23817   in1 = d->op1;
23818   /* We don't need a big-endian lane correction for SVE; see the comment
23819      at the head of aarch64-sve.md for details.  */
23820   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23821     {
23822       x = in0, in0 = in1, in1 = x;
23823       odd = !odd;
23824     }
23825   out = d->target;
23826 
23827   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23828 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23829   return true;
23830 }
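
/* Illustrative example: for two V4SImode inputs, the permutations
   { 0, 4, 2, 6 } and { 1, 5, 3, 7 } are recognised here as TRN1 and
   TRN2 respectively (with the operands and parity swapped on
   big-endian Advanced SIMD, as above).  */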
23831 
23832 /* Try to re-encode the PERM constant so it combines odd and even elements.
23833    This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23834    We retry with this new constant with the full suite of patterns.  */
23835 static bool
23836 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23837 {
23838   expand_vec_perm_d newd;
23839   unsigned HOST_WIDE_INT nelt;
23840 
23841   if (d->vec_flags != VEC_ADVSIMD)
23842     return false;
23843 
23844   /* Get the new mode.  Always twice the size of the inner
23845      and half the elements.  */
23846   poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23847   unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23848   auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23849   machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23850 
23851   if (new_mode == word_mode)
23852     return false;
23853 
23854   /* to_constant is safe since this routine is specific to Advanced SIMD
23855      vectors.  */
23856   nelt = d->perm.length ().to_constant ();
23857 
23858   vec_perm_builder newpermconst;
23859   newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23860 
23861   /* Convert the perm constant if we can.  Require even, odd as the pairs.  */
23862   for (unsigned int i = 0; i < nelt; i += 2)
23863     {
23864       poly_int64 elt0 = d->perm[i];
23865       poly_int64 elt1 = d->perm[i + 1];
23866       poly_int64 newelt;
23867       if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23868 	return false;
23869       newpermconst.quick_push (newelt.to_constant ());
23870     }
23871   newpermconst.finalize ();
23872 
23873   newd.vmode = new_mode;
23874   newd.vec_flags = VEC_ADVSIMD;
23875   newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23876   newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23877   newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23878   newd.testing_p = d->testing_p;
23879   newd.one_vector_p = d->one_vector_p;
23880 
23881   newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23882   return aarch64_expand_vec_perm_const_1 (&newd);
23883 }
23884 
23885 /* Recognize patterns suitable for the UZP instructions.  */
23886 static bool
23887 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23888 {
23889   HOST_WIDE_INT odd;
23890   rtx out, in0, in1, x;
23891   machine_mode vmode = d->vmode;
23892 
23893   if (GET_MODE_UNIT_SIZE (vmode) > 8)
23894     return false;
23895 
23896   /* Note that these are little-endian tests.
23897      We correct for big-endian later.  */
23898   if (!d->perm[0].is_constant (&odd)
23899       || (odd != 0 && odd != 1)
23900       || !d->perm.series_p (0, 1, odd, 2))
23901     return false;
23902 
23903   /* Success!  */
23904   if (d->testing_p)
23905     return true;
23906 
23907   in0 = d->op0;
23908   in1 = d->op1;
23909   /* We don't need a big-endian lane correction for SVE; see the comment
23910      at the head of aarch64-sve.md for details.  */
23911   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23912     {
23913       x = in0, in0 = in1, in1 = x;
23914       odd = !odd;
23915     }
23916   out = d->target;
23917 
23918   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23919 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23920   return true;
23921 }
23922 
23923 /* Recognize patterns suitable for the ZIP instructions.  */
23924 static bool
23925 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23926 {
23927   unsigned int high;
23928   poly_uint64 nelt = d->perm.length ();
23929   rtx out, in0, in1, x;
23930   machine_mode vmode = d->vmode;
23931 
23932   if (GET_MODE_UNIT_SIZE (vmode) > 8)
23933     return false;
23934 
23935   /* Note that these are little-endian tests.
23936      We correct for big-endian later.  */
23937   poly_uint64 first = d->perm[0];
23938   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23939       || !d->perm.series_p (0, 2, first, 1)
23940       || !d->perm.series_p (1, 2, first + nelt, 1))
23941     return false;
23942   high = maybe_ne (first, 0U);
23943 
23944   /* Success!  */
23945   if (d->testing_p)
23946     return true;
23947 
23948   in0 = d->op0;
23949   in1 = d->op1;
23950   /* We don't need a big-endian lane correction for SVE; see the comment
23951      at the head of aarch64-sve.md for details.  */
23952   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23953     {
23954       x = in0, in0 = in1, in1 = x;
23955       high = !high;
23956     }
23957   out = d->target;
23958 
23959   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23960 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23961   return true;
23962 }
23963 
23964 /* Recognize patterns for the EXT insn.  */
23965 
23966 static bool
23967 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23968 {
23969   HOST_WIDE_INT location;
23970   rtx offset;
23971 
23972   /* The first element always refers to the first vector.
23973      Check if the extracted indices are increasing by one.  */
23974   if (d->vec_flags == VEC_SVE_PRED
23975       || !d->perm[0].is_constant (&location)
23976       || !d->perm.series_p (0, 1, location, 1))
23977     return false;
23978 
23979   /* Success! */
23980   if (d->testing_p)
23981     return true;
23982 
23983   /* The case where (location == 0) is a no-op for both big- and little-endian,
23984      and is removed by the mid-end at optimization levels -O1 and higher.
23985 
23986      We don't need a big-endian lane correction for SVE; see the comment
23987      at the head of aarch64-sve.md for details.  */
23988   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23989     {
23990       /* After setup, we want the high elements of the first vector (stored
23991          at the LSB end of the register), and the low elements of the second
23992          vector (stored at the MSB end of the register). So swap.  */
23993       std::swap (d->op0, d->op1);
23994       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23995 	 to_constant () is safe since this is restricted to Advanced SIMD
23996 	 vectors.  */
23997       location = d->perm.length ().to_constant () - location;
23998     }
23999 
24000   offset = GEN_INT (location);
24001   emit_set_insn (d->target,
24002 		 gen_rtx_UNSPEC (d->vmode,
24003 				 gen_rtvec (3, d->op0, d->op1, offset),
24004 				 UNSPEC_EXT));
24005   return true;
24006 }
24007 
24008 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
24009    within each 64-bit, 32-bit or 16-bit granule.  */
24010 
24011 static bool
24012 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
24013 {
24014   HOST_WIDE_INT diff;
24015   unsigned int i, size, unspec;
24016   machine_mode pred_mode;
24017 
24018   if (d->vec_flags == VEC_SVE_PRED
24019       || !d->one_vector_p
24020       || !d->perm[0].is_constant (&diff)
24021       || !diff)
24022     return false;
24023 
24024   if (d->vec_flags & VEC_SVE_DATA)
24025     size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24026   else
24027     size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24028   if (size == 64)
24029     {
24030       unspec = UNSPEC_REV64;
24031       pred_mode = VNx2BImode;
24032     }
24033   else if (size == 32)
24034     {
24035       unspec = UNSPEC_REV32;
24036       pred_mode = VNx4BImode;
24037     }
24038   else if (size == 16)
24039     {
24040       unspec = UNSPEC_REV16;
24041       pred_mode = VNx8BImode;
24042     }
24043   else
24044     return false;
24045 
24046   unsigned int step = diff + 1;
24047   for (i = 0; i < step; ++i)
24048     if (!d->perm.series_p (i, step, diff - i, step))
24049       return false;
24050 
24051   /* Success! */
24052   if (d->testing_p)
24053     return true;
24054 
24055   if (d->vec_flags & VEC_SVE_DATA)
24056     {
24057       rtx pred = aarch64_ptrue_reg (pred_mode);
24058       emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24059 					 d->target, pred, d->op0));
24060       return true;
24061     }
24062   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
24063   emit_set_insn (d->target, src);
24064   return true;
24065 }
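
/* Illustrative example: for V8HImode the permutation
   { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3, giving a 64-bit granule
   size, and is therefore implemented as a single REV64 instruction
   (for SVE data modes the equivalent predicated REVB/REVH/REVW
   pattern is used instead).  */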
24066 
24067 /* Recognize patterns for the REV insn, which reverses elements within
24068    a full vector.  */
24069 
24070 static bool
24071 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24072 {
24073   poly_uint64 nelt = d->perm.length ();
24074 
24075   if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
24076     return false;
24077 
24078   if (!d->perm.series_p (0, 1, nelt - 1, -1))
24079     return false;
24080 
24081   /* Success! */
24082   if (d->testing_p)
24083     return true;
24084 
24085   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24086   emit_set_insn (d->target, src);
24087   return true;
24088 }
24089 
24090 static bool
24091 aarch64_evpc_dup (struct expand_vec_perm_d *d)
24092 {
24093   rtx out = d->target;
24094   rtx in0;
24095   HOST_WIDE_INT elt;
24096   machine_mode vmode = d->vmode;
24097   rtx lane;
24098 
24099   if (d->vec_flags == VEC_SVE_PRED
24100       || d->perm.encoding ().encoded_nelts () != 1
24101       || !d->perm[0].is_constant (&elt))
24102     return false;
24103 
24104   if ((d->vec_flags & VEC_SVE_DATA)
24105       && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24106     return false;
24107 
24108   /* Success! */
24109   if (d->testing_p)
24110     return true;
24111 
24112   /* The generic preparation in aarch64_expand_vec_perm_const_1
24113      swaps the operand order and the permute indices if it finds
24114      d->perm[0] to be in the second operand.  Thus, we can always
24115      use d->op0 and need not do any extra arithmetic to get the
24116      correct lane number.  */
24117   in0 = d->op0;
24118   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
24119 
24120   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24121   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24122   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24123   return true;
24124 }
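
/* Standalone sketch (plain C++ only): a selector whose encoding is a
   single repeated constant -- what the encoded_nelts () == 1 test above
   accepts -- is just a broadcast of one lane, e.g. {3,3,3,3} on V4SI
   becomes a DUP of lane 3.  Guarded out of the build.  */
#if 0
#include <cstdio>

int
main ()
{
  const int nelt = 4;
  const int perm[nelt] = { 3, 3, 3, 3 };

  bool broadcast = true;
  for (int i = 1; i < nelt; ++i)
    if (perm[i] != perm[0])
      broadcast = false;

  if (broadcast)
    printf ("broadcast of lane %d\n", perm[0]);
  return 0;
}
#endif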
24125 
24126 static bool
24127 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24128 {
24129   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24130   machine_mode vmode = d->vmode;
24131 
24132   /* Make sure that the indices are constant.  */
24133   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24134   for (unsigned int i = 0; i < encoded_nelts; ++i)
24135     if (!d->perm[i].is_constant ())
24136       return false;
24137 
24138   if (d->testing_p)
24139     return true;
24140 
24141   /* Generic code will try constant permutation twice: once with the
24142      original mode and again with the elements lowered to QImode.
24143      So wait and don't do the selector expansion ourselves.  */
24144   if (vmode != V8QImode && vmode != V16QImode)
24145     return false;
24146 
24147   /* to_constant is safe since this routine is specific to Advanced SIMD
24148      vectors.  */
24149   unsigned int nelt = d->perm.length ().to_constant ();
24150   for (unsigned int i = 0; i < nelt; ++i)
24151     /* If big-endian and two vectors we end up with a weird mixed-endian
24152        mode on NEON.  Reverse the index within each word but not the word
24153        itself.  to_constant is safe because we checked is_constant above.  */
24154     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24155 			? d->perm[i].to_constant () ^ (nelt - 1)
24156 			: d->perm[i].to_constant ());
24157 
24158   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24159   sel = force_reg (vmode, sel);
24160 
24161   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24162   return true;
24163 }
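
/* Standalone sketch (plain C++ only) of the big-endian index adjustment
   above: XORing a selector byte with (nelt - 1) reverses its position
   within its own 16-byte vector while leaving the choice of vector
   (index >= nelt or not) unchanged.  Guarded out of the build.  */
#if 0
#include <cstdio>

int
main ()
{
  const unsigned nelt = 16;
  const unsigned samples[] = { 0, 1, 15, 16, 17, 31 };

  for (unsigned idx : samples)
    printf ("index %2u -> %2u\n", idx, idx ^ (nelt - 1));
  /* Prints 0->15, 1->14, 15->0, 16->31, 17->30, 31->16.  */
  return 0;
}
#endif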
24164 
24165 /* Try to implement D using an SVE TBL instruction.  */
24166 
24167 static bool
24168 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24169 {
24170   unsigned HOST_WIDE_INT nelt;
24171 
24172   /* Permuting two variable-length vectors could overflow the
24173      index range.  */
24174   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24175     return false;
24176 
24177   if (d->testing_p)
24178     return true;
24179 
24180   machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24181   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24182   if (d->one_vector_p)
24183     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24184   else
24185     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24186   return true;
24187 }
24188 
24189 /* Try to implement D using SVE SEL instruction.  */
24190 
24191 static bool
24192 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24193 {
24194   machine_mode vmode = d->vmode;
24195   int unit_size = GET_MODE_UNIT_SIZE (vmode);
24196 
24197   if (d->vec_flags != VEC_SVE_DATA
24198       || unit_size > 8)
24199     return false;
24200 
24201   int n_patterns = d->perm.encoding ().npatterns ();
24202   poly_int64 vec_len = d->perm.length ();
24203 
24204   for (int i = 0; i < n_patterns; ++i)
24205     if (!known_eq (d->perm[i], i)
24206 	&& !known_eq (d->perm[i], vec_len + i))
24207       return false;
24208 
24209   for (int i = n_patterns; i < n_patterns * 2; i++)
24210     if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24211 	&& !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24212       return false;
24213 
24214   if (d->testing_p)
24215     return true;
24216 
24217   machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24218 
24219   /* Build a predicate that is true when op0 elements should be used.  */
24220   rtx_vector_builder builder (pred_mode, n_patterns, 2);
24221   for (int i = 0; i < n_patterns * 2; i++)
24222     {
24223       rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24224 					  : CONST0_RTX (BImode);
24225       builder.quick_push (elem);
24226     }
24227 
24228   rtx const_vec = builder.build ();
24229   rtx pred = force_reg (pred_mode, const_vec);
24230   /* TARGET = PRED ? OP0 : OP1.  */
24231   emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24232   return true;
24233 }
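
/* Standalone sketch (plain C++ only, fixed length 8 instead of a real
   SVE vector) of the SEL case above: with two patterns and the selector
   {0, vl+1, 2, vl+3, ...}, even lanes come from op0 and odd lanes from
   op1, so the governing predicate is {1,0,1,0,...}.  Guarded out.  */
#if 0
#include <cstdio>

int
main ()
{
  const int vl = 8;
  int perm[vl], pred[vl];

  for (int i = 0; i < vl; ++i)
    perm[i] = (i & 1) ? vl + i : i;	/* {0, 9, 2, 11, 4, 13, 6, 15}.  */

  for (int i = 0; i < vl; ++i)
    pred[i] = (perm[i] == i);		/* True when op0 supplies lane i.  */

  for (int i = 0; i < vl; ++i)
    printf ("%d%c", pred[i], i + 1 == vl ? '\n' : ' ');
  return 0;
}
#endif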
24234 
24235 /* Recognize patterns suitable for the INS instructions.  */
24236 static bool
24237 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24238 {
24239   machine_mode mode = d->vmode;
24240   unsigned HOST_WIDE_INT nelt;
24241 
24242   if (d->vec_flags != VEC_ADVSIMD)
24243     return false;
24244 
24245   /* to_constant is safe since this routine is specific to Advanced SIMD
24246      vectors.  */
24247   nelt = d->perm.length ().to_constant ();
24248   rtx insv = d->op0;
24249 
24250   HOST_WIDE_INT idx = -1;
24251 
24252   for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24253     {
24254       HOST_WIDE_INT elt;
24255       if (!d->perm[i].is_constant (&elt))
24256 	return false;
24257       if (elt == (HOST_WIDE_INT) i)
24258 	continue;
24259       if (idx != -1)
24260 	{
24261 	  idx = -1;
24262 	  break;
24263 	}
24264       idx = i;
24265     }
24266 
24267   if (idx == -1)
24268     {
24269       insv = d->op1;
24270       for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24271 	{
24272 	  if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24273 	    continue;
24274 	  if (idx != -1)
24275 	    return false;
24276 	  idx = i;
24277 	}
24278 
24279       if (idx == -1)
24280 	return false;
24281     }
24282 
24283   if (d->testing_p)
24284     return true;
24285 
24286   gcc_assert (idx != -1);
24287 
24288   unsigned extractindex = d->perm[idx].to_constant ();
24289   rtx extractv = d->op0;
24290   if (extractindex >= nelt)
24291     {
24292       extractv = d->op1;
24293       extractindex -= nelt;
24294     }
24295   gcc_assert (extractindex < nelt);
24296 
24297   insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24298   expand_operand ops[5];
24299   create_output_operand (&ops[0], d->target, mode);
24300   create_input_operand (&ops[1], insv, mode);
24301   create_integer_operand (&ops[2], 1 << idx);
24302   create_input_operand (&ops[3], extractv, mode);
24303   create_integer_operand (&ops[4], extractindex);
24304   expand_insn (icode, 5, ops);
24305 
24306   return true;
24307 }
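
/* Standalone sketch (plain C++ only) of the first, op0-based case of the
   INS recognition above; the real code also handles the symmetric case
   where the identity lanes come from op1.  A permutation that is the
   identity apart from one lane, e.g. {0, 1, 6, 3} on V4SI, copies a
   single element (op1 lane 6 - 4 = 2) into lane 2.  Guarded out.  */
#if 0
#include <cstdio>

int
main ()
{
  const int nelt = 4;
  const int perm[nelt] = { 0, 1, 6, 3 };

  int idx = -1;
  for (int i = 0; i < nelt; ++i)
    if (perm[i] != i)
      {
	if (idx != -1)
	  return 1;		/* More than one lane differs: no match.  */
	idx = i;
      }
  if (idx == -1)
    return 1;			/* Pure identity: nothing to insert.  */

  int extract = perm[idx];
  int from_op1 = extract >= nelt;
  printf ("insert into lane %d from op%d lane %d\n",
	  idx, from_op1 ? 1 : 0, from_op1 ? extract - nelt : extract);
  return 0;
}
#endif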
24308 
24309 static bool
24310 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24311 {
24312   /* The pattern matching functions above are written to look for a small
24313      number to begin the sequence (0, 1, N/2).  If we begin with an index
24314      from the second operand, we can swap the operands.  */
24315   poly_int64 nelt = d->perm.length ();
24316   if (known_ge (d->perm[0], nelt))
24317     {
24318       d->perm.rotate_inputs (1);
24319       std::swap (d->op0, d->op1);
24320     }
24321 
24322   if ((d->vec_flags == VEC_ADVSIMD
24323        || d->vec_flags == VEC_SVE_DATA
24324        || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24325        || d->vec_flags == VEC_SVE_PRED)
24326       && known_gt (nelt, 1))
24327     {
24328       if (aarch64_evpc_rev_local (d))
24329 	return true;
24330       else if (aarch64_evpc_rev_global (d))
24331 	return true;
24332       else if (aarch64_evpc_ext (d))
24333 	return true;
24334       else if (aarch64_evpc_dup (d))
24335 	return true;
24336       else if (aarch64_evpc_zip (d))
24337 	return true;
24338       else if (aarch64_evpc_uzp (d))
24339 	return true;
24340       else if (aarch64_evpc_trn (d))
24341 	return true;
24342       else if (aarch64_evpc_sel (d))
24343 	return true;
24344       else if (aarch64_evpc_ins (d))
24345 	return true;
24346       else if (aarch64_evpc_reencode (d))
24347 	return true;
24348       if (d->vec_flags == VEC_SVE_DATA)
24349 	return aarch64_evpc_sve_tbl (d);
24350       else if (d->vec_flags == VEC_ADVSIMD)
24351 	return aarch64_evpc_tbl (d);
24352     }
24353   return false;
24354 }
24355 
24356 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
24357 
24358 static bool
24359 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
24360 				  rtx op1, const vec_perm_indices &sel)
24361 {
24362   struct expand_vec_perm_d d;
24363 
24364   /* Check whether the mask can be applied to a single vector.  */
24365   if (sel.ninputs () == 1
24366       || (op0 && rtx_equal_p (op0, op1)))
24367     d.one_vector_p = true;
24368   else if (sel.all_from_input_p (0))
24369     {
24370       d.one_vector_p = true;
24371       op1 = op0;
24372     }
24373   else if (sel.all_from_input_p (1))
24374     {
24375       d.one_vector_p = true;
24376       op0 = op1;
24377     }
24378   else
24379     d.one_vector_p = false;
24380 
24381   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24382 		     sel.nelts_per_input ());
24383   d.vmode = vmode;
24384   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24385   d.target = target;
24386   d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
24387   if (op0 == op1)
24388     d.op1 = d.op0;
24389   else
24390     d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
24391   d.testing_p = !target;
24392 
24393   if (!d.testing_p)
24394     return aarch64_expand_vec_perm_const_1 (&d);
24395 
24396   rtx_insn *last = get_last_insn ();
24397   bool ret = aarch64_expand_vec_perm_const_1 (&d);
24398   gcc_assert (last == get_last_insn ());
24399 
24400   return ret;
24401 }
24402 
24403 /* Generate a byte permute mask for a register of mode MODE,
24404    which has NUNITS units.  */
24405 
24406 rtx
24407 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24408 {
24409   /* We have to reverse each vector because we don't have
24410      a permuted load that can reverse-load according to ABI rules.  */
24411   rtx mask;
24412   rtvec v = rtvec_alloc (16);
24413   unsigned int i, j;
24414   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24415 
24416   gcc_assert (BYTES_BIG_ENDIAN);
24417   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24418 
24419   for (i = 0; i < nunits; i++)
24420     for (j = 0; j < usize; j++)
24421       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24422   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24423   return force_reg (V16QImode, mask);
24424 }
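
/* Standalone sketch (plain C++ only) of the mask layout built above: for
   an 8 x 16-bit vector (nunits == 8, usize == 2) the 16 mask bytes are
   {1,0, 3,2, 5,4, ...}, i.e. the bytes of each unit are reversed in
   place.  Guarded out of the build.  */
#if 0
#include <cstdio>

int
main ()
{
  const int nunits = 8, usize = 2;
  int mask[16];

  for (int i = 0; i < nunits; ++i)
    for (int j = 0; j < usize; ++j)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;

  for (int k = 0; k < nunits * usize; ++k)
    printf ("%d%c", mask[k], k + 1 == nunits * usize ? '\n' : ' ');
  return 0;
}
#endif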
24425 
24426 /* Expand an SVE integer comparison using the SVE equivalent of:
24427 
24428      (set TARGET (CODE OP0 OP1)).  */
24429 
24430 void
24431 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24432 {
24433   machine_mode pred_mode = GET_MODE (target);
24434   machine_mode data_mode = GET_MODE (op0);
24435   rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24436 				      op0, op1);
24437   if (!rtx_equal_p (target, res))
24438     emit_move_insn (target, res);
24439 }
24440 
24441 /* Return the UNSPEC_COND_* code for comparison CODE.  */
24442 
24443 static unsigned int
24444 aarch64_unspec_cond_code (rtx_code code)
24445 {
24446   switch (code)
24447     {
24448     case NE:
24449       return UNSPEC_COND_FCMNE;
24450     case EQ:
24451       return UNSPEC_COND_FCMEQ;
24452     case LT:
24453       return UNSPEC_COND_FCMLT;
24454     case GT:
24455       return UNSPEC_COND_FCMGT;
24456     case LE:
24457       return UNSPEC_COND_FCMLE;
24458     case GE:
24459       return UNSPEC_COND_FCMGE;
24460     case UNORDERED:
24461       return UNSPEC_COND_FCMUO;
24462     default:
24463       gcc_unreachable ();
24464     }
24465 }
24466 
24467 /* Emit:
24468 
24469       (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24470 
24471    where <X> is the operation associated with comparison CODE.
24472    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
24473 
24474 static void
24475 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24476 			  bool known_ptrue_p, rtx op0, rtx op1)
24477 {
24478   rtx flag = gen_int_mode (known_ptrue_p, SImode);
24479   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24480 			       gen_rtvec (4, pred, flag, op0, op1),
24481 			       aarch64_unspec_cond_code (code));
24482   emit_set_insn (target, unspec);
24483 }
24484 
24485 /* Emit the SVE equivalent of:
24486 
24487       (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24488       (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24489       (set TARGET (ior:PRED_MODE TMP1 TMP2))
24490 
24491    where <Xi> is the operation associated with comparison CODEi.
24492    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
24493 
24494 static void
24495 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24496 			      rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24497 {
24498   machine_mode pred_mode = GET_MODE (pred);
24499   rtx tmp1 = gen_reg_rtx (pred_mode);
24500   aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24501   rtx tmp2 = gen_reg_rtx (pred_mode);
24502   aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24503   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24504 }
24505 
24506 /* Emit the SVE equivalent of:
24507 
24508       (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24509       (set TARGET (not TMP))
24510 
24511    where <X> is the operation associated with comparison CODE.
24512    KNOWN_PTRUE_P is true if PRED is known to be a PTRUE.  */
24513 
24514 static void
24515 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24516 				 bool known_ptrue_p, rtx op0, rtx op1)
24517 {
24518   machine_mode pred_mode = GET_MODE (pred);
24519   rtx tmp = gen_reg_rtx (pred_mode);
24520   aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24521   aarch64_emit_unop (target, one_cmpl_optab, tmp);
24522 }
24523 
24524 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24525 
24526      (set TARGET (CODE OP0 OP1))
24527 
24528    If CAN_INVERT_P is true, the caller can also handle inverted results;
24529    return true if the result is in fact inverted.  */
24530 
24531 bool
24532 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24533 				  rtx op0, rtx op1, bool can_invert_p)
24534 {
24535   machine_mode pred_mode = GET_MODE (target);
24536   machine_mode data_mode = GET_MODE (op0);
24537 
24538   rtx ptrue = aarch64_ptrue_reg (pred_mode);
24539   switch (code)
24540     {
24541     case UNORDERED:
24542       /* UNORDERED has no immediate form.  */
24543       op1 = force_reg (data_mode, op1);
24544       /* fall through */
24545     case LT:
24546     case LE:
24547     case GT:
24548     case GE:
24549     case EQ:
24550     case NE:
24551       {
24552 	/* There is native support for the comparison.  */
24553 	aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24554 	return false;
24555       }
24556 
24557     case LTGT:
24558       /* This is a trapping operation (LT or GT).  */
24559       aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24560       return false;
24561 
24562     case UNEQ:
24563       if (!flag_trapping_math)
24564 	{
24565 	  /* This would trap for signaling NaNs.  */
24566 	  op1 = force_reg (data_mode, op1);
24567 	  aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24568 					ptrue, true, op0, op1);
24569 	  return false;
24570 	}
24571       /* fall through */
24572     case UNLT:
24573     case UNLE:
24574     case UNGT:
24575     case UNGE:
24576       if (flag_trapping_math)
24577 	{
24578 	  /* Work out which elements are ordered.  */
24579 	  rtx ordered = gen_reg_rtx (pred_mode);
24580 	  op1 = force_reg (data_mode, op1);
24581 	  aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24582 					   ptrue, true, op0, op1);
24583 
24584 	  /* Test the opposite condition for the ordered elements,
24585 	     then invert the result.  */
24586 	  if (code == UNEQ)
24587 	    code = NE;
24588 	  else
24589 	    code = reverse_condition_maybe_unordered (code);
24590 	  if (can_invert_p)
24591 	    {
24592 	      aarch64_emit_sve_fp_cond (target, code,
24593 					ordered, false, op0, op1);
24594 	      return true;
24595 	    }
24596 	  aarch64_emit_sve_invert_fp_cond (target, code,
24597 					   ordered, false, op0, op1);
24598 	  return false;
24599 	}
24600       break;
24601 
24602     case ORDERED:
24603       /* ORDERED has no immediate form.  */
24604       op1 = force_reg (data_mode, op1);
24605       break;
24606 
24607     default:
24608       gcc_unreachable ();
24609     }
24610 
24611   /* There is native support for the inverse comparison.  */
24612   code = reverse_condition_maybe_unordered (code);
24613   if (can_invert_p)
24614     {
24615       aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24616       return true;
24617     }
24618   aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24619   return false;
24620 }
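
/* Standalone scalar model (plain C++, not SVE code) of the per-lane
   logic used above for UNLT when -ftrapping-math is in effect: compute
   which lanes are ordered, test the reverse condition (GE) on those
   lanes only, then invert.  That matches the direct definition
   "unordered or less than".  Guarded out of the build.  */
#if 0
#include <cmath>
#include <cstdio>

static bool
unlt_via_inversion (double x, double y)
{
  bool ordered = !std::isunordered (x, y);
  return !(ordered && x >= y);
}

int
main ()
{
  const double nan = std::nan ("");
  const double cases[][2] = { {1.0, 2.0}, {2.0, 1.0}, {1.0, 1.0},
			      {nan, 1.0}, {1.0, nan} };

  for (const auto &c : cases)
    {
      bool direct = std::isunordered (c[0], c[1]) || c[0] < c[1];
      printf ("UNLT (%g, %g) = %d, via inversion = %d\n",
	      c[0], c[1], (int) direct, (int) unlt_via_inversion (c[0], c[1]));
    }
  return 0;
}
#endif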
24621 
24622 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
24623    of the data being selected and CMP_MODE is the mode of the values being
24624    compared.  */
24625 
24626 void
24627 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24628 			  rtx *ops)
24629 {
24630   machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24631   rtx pred = gen_reg_rtx (pred_mode);
24632   if (FLOAT_MODE_P (cmp_mode))
24633     {
24634       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24635 					    ops[4], ops[5], true))
24636 	std::swap (ops[1], ops[2]);
24637     }
24638   else
24639     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24640 
24641   if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24642     ops[1] = force_reg (data_mode, ops[1]);
24643   /* The "false" value can only be zero if the "true" value is a constant.  */
24644   if (register_operand (ops[1], data_mode)
24645       || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24646     ops[2] = force_reg (data_mode, ops[2]);
24647 
24648   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24649   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24650 }
24651 
24652 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
24653    true.  However, due to issues with register allocation it is preferable
24654    to avoid tying integer scalar and FP scalar modes.  Executing integer
24655    operations in general registers is better than treating them as scalar
24656    vector operations.  This reduces latency and avoids redundant int<->FP
24657    moves.  So tie modes if they are either the same class, or vector modes
24658    with other vector modes, vector structs or any scalar mode.  */
24659 
24660 static bool
24661 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24662 {
24663   if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24664        != aarch64_advsimd_partial_struct_mode_p (mode2))
24665       && maybe_gt (GET_MODE_SIZE (mode1), 8)
24666       && maybe_gt (GET_MODE_SIZE (mode2), 8))
24667     return false;
24668 
24669   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24670     return true;
24671 
24672   /* We specifically want to allow elements of "structure" modes to
24673      be tieable to the structure.  This more general condition allows
24674      other rarer situations too.  The reason we don't extend this to
24675      predicate modes is that there are no predicate structure modes
24676      nor any specific instructions for extracting part of a predicate
24677      register.  */
24678   if (aarch64_vector_data_mode_p (mode1)
24679       && aarch64_vector_data_mode_p (mode2))
24680     return true;
24681 
24682   /* Also allow any scalar modes with vectors.  */
24683   if (aarch64_vector_mode_supported_p (mode1)
24684       || aarch64_vector_mode_supported_p (mode2))
24685     return true;
24686 
24687   return false;
24688 }
24689 
24690 /* Return a new RTX holding the result of moving POINTER forward by
24691    AMOUNT bytes.  */
24692 
24693 static rtx
24694 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24695 {
24696   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24697 
24698   return adjust_automodify_address (pointer, GET_MODE (pointer),
24699 				    next, amount);
24700 }
24701 
24702 /* Return a new RTX holding the result of moving POINTER forward by the
24703    size of the mode it points to.  */
24704 
24705 static rtx
24706 aarch64_progress_pointer (rtx pointer)
24707 {
24708   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24709 }
24710 
24711 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24712    MODE bytes.  */
24713 
24714 static void
24715 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24716 					      machine_mode mode)
24717 {
24718   /* Handle 256-bit memcpy separately.  We do this by making 2 adjacent memory
24719      address copies using V4SImode so that we can use Q registers.  */
24720   if (known_eq (GET_MODE_BITSIZE (mode), 256))
24721     {
24722       mode = V4SImode;
24723       rtx reg1 = gen_reg_rtx (mode);
24724       rtx reg2 = gen_reg_rtx (mode);
24725       /* "Cast" the pointers to the correct mode.  */
24726       *src = adjust_address (*src, mode, 0);
24727       *dst = adjust_address (*dst, mode, 0);
24728       /* Emit the memcpy.  */
24729       emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24730 					aarch64_progress_pointer (*src)));
24731       emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24732 					 aarch64_progress_pointer (*dst), reg2));
24733       /* Move the pointers forward.  */
24734       *src = aarch64_move_pointer (*src, 32);
24735       *dst = aarch64_move_pointer (*dst, 32);
24736       return;
24737     }
24738 
24739   rtx reg = gen_reg_rtx (mode);
24740 
24741   /* "Cast" the pointers to the correct mode.  */
24742   *src = adjust_address (*src, mode, 0);
24743   *dst = adjust_address (*dst, mode, 0);
24744   /* Emit the memcpy.  */
24745   emit_move_insn (reg, *src);
24746   emit_move_insn (*dst, reg);
24747   /* Move the pointers forward.  */
24748   *src = aarch64_progress_pointer (*src);
24749   *dst = aarch64_progress_pointer (*dst);
24750 }
24751 
24752 /* Expand a cpymem/movmem using the MOPS extension.  OPERANDS are taken
24753    from the cpymem/movmem pattern.  IS_MEMMOVE is true if this is a memmove
24754    rather than memcpy.  Return true iff we succeeded.  */
24755 bool
24756 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove = false)
24757 {
24758   if (!TARGET_MOPS)
24759     return false;
24760 
24761   /* All three registers are changed by the instruction, so each one
24762      must be a fresh pseudo.  */
24763   rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24764   rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24765   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24766   rtx src_mem = replace_equiv_address (operands[1], src_addr);
24767   rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24768   if (is_memmove)
24769     emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
24770   else
24771     emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24772   return true;
24773 }
24774 
24775 /* Expand cpymem, as if from a __builtin_memcpy.  Return true if
24776    we succeed, otherwise return false, indicating that a libcall to
24777    memcpy should be emitted.  */
24778 
24779 bool
24780 aarch64_expand_cpymem (rtx *operands)
24781 {
24782   int mode_bits;
24783   rtx dst = operands[0];
24784   rtx src = operands[1];
24785   rtx base;
24786   machine_mode cur_mode = BLKmode;
24787 
24788   /* Variable-sized memcpy can go through the MOPS expansion if available.  */
24789   if (!CONST_INT_P (operands[2]))
24790     return aarch64_expand_cpymem_mops (operands);
24791 
24792   unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24793 
24794   /* Try to inline up to 256 bytes or use the MOPS threshold if available.  */
24795   unsigned HOST_WIDE_INT max_copy_size
24796     = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24797 
24798   bool size_p = optimize_function_for_size_p (cfun);
24799 
24800   /* Large constant-sized cpymem should go through MOPS when possible.
24801      It should be a win even for size optimization in the general case.
24802      For speed optimization the choice between MOPS and the SIMD sequence
24803      depends on the size of the copy, rather than number of instructions,
24804      alignment etc.  */
24805   if (size > max_copy_size)
24806     return aarch64_expand_cpymem_mops (operands);
24807 
24808   int copy_bits = 256;
24809 
24810   /* Default to 256-bit LDP/STP on large copies; however, small copies, lack of
24811      SIMD support, or slow 256-bit LDP/STP fall back to 128-bit chunks.  */
24812   if (size <= 24
24813       || !TARGET_SIMD
24814       || (aarch64_tune_params.extra_tuning_flags
24815 	  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24816     copy_bits = 128;
24817 
24818   /* Emit an inline load+store sequence and count the number of operations
24819      involved.  We use a simple count of just the loads and stores emitted
24820      rather than rtx_insn count as all the pointer adjustments and reg copying
24821      in this function will get optimized away later in the pipeline.  */
24822   start_sequence ();
24823   unsigned nops = 0;
24824 
24825   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24826   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24827 
24828   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24829   src = adjust_automodify_address (src, VOIDmode, base, 0);
24830 
24831   /* Convert size to bits to make the rest of the code simpler.  */
24832   int n = size * BITS_PER_UNIT;
24833 
24834   while (n > 0)
24835     {
24836       /* Find the largest mode in which to do the copy without over-reading
24837 	 or over-writing.  */
24838       opt_scalar_int_mode mode_iter;
24839       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24840 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24841 	  cur_mode = mode_iter.require ();
24842 
24843       gcc_assert (cur_mode != BLKmode);
24844 
24845       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24846 
24847       /* Prefer Q-register accesses for the last bytes.  */
24848       if (mode_bits == 128 && copy_bits == 256)
24849 	cur_mode = V4SImode;
24850 
24851       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24852       /* A single block copy is 1 load + 1 store.  */
24853       nops += 2;
24854       n -= mode_bits;
24855 
24856       /* Emit trailing copies using overlapping unaligned accesses
24857 	(when !STRICT_ALIGNMENT) - this is smaller and faster.  */
24858       if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24859 	{
24860 	  machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24861 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24862 	  gcc_assert (n_bits <= mode_bits);
24863 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24864 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24865 	  n = n_bits;
24866 	}
24867     }
24868   rtx_insn *seq = get_insns ();
24869   end_sequence ();
24870   /* MOPS sequence requires 3 instructions for the memory copying + 1 to move
24871      the constant size into a register.  */
24872   unsigned mops_cost = 3 + 1;
24873 
24874   /* If MOPS is available at this point we don't consider the libcall as it's
24875      not a win even on code size.  At this point only consider MOPS if
24876      optimizing for size.  For speed optimizations we will have chosen between
24877      the two based on copy size already.  */
24878   if (TARGET_MOPS)
24879     {
24880       if (size_p && mops_cost < nops)
24881 	return aarch64_expand_cpymem_mops (operands);
24882       emit_insn (seq);
24883       return true;
24884     }
24885 
24886   /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24887      arguments + 1 for the call.  When MOPS is not available and we're
24888      optimizing for size a libcall may be preferable.  */
24889   unsigned libcall_cost = 4;
24890   if (size_p && libcall_cost < nops)
24891     return false;
24892 
24893   emit_insn (seq);
24894   return true;
24895 }
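
/* Standalone sketch (plain C++ only) of the chunk selection above for a
   15-byte constant-size copy with 128-bit chunks; it ignores the
   V4SImode preference and simply walks power-of-two integer sizes.  The
   loop emits one 8-byte copy at offset 0 and then, because the 7-byte
   tail is less than half a chunk, a second overlapping 8-byte copy at
   offset 7.  Guarded out of the build.  */
#if 0
#include <cstdio>

int
main ()
{
  int n = 15 * 8;			/* Remaining bits to copy.  */
  const int copy_bits = 128;
  int offset = 0;			/* Current byte offset.  */

  while (n > 0)
    {
      /* Largest power-of-two integer size that fits both limits.  */
      int mode_bits = 8;
      for (int bits = 16; bits <= 128; bits *= 2)
	if (bits <= n && bits <= copy_bits)
	  mode_bits = bits;

      printf ("copy %d bytes at offset %d\n", mode_bits / 8, offset);
      offset += mode_bits / 8;
      n -= mode_bits;

      if (n > 0 && n < copy_bits / 2)
	{
	  /* Round the tail up to the next size and step back so that the
	     final access ends exactly at the end of the buffer.  */
	  int n_bits = 8;
	  while (n_bits < n)
	    n_bits *= 2;
	  offset += (n - n_bits) / 8;
	  n = n_bits;
	}
    }
  return 0;
}
#endif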
24896 
24897 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24898    SRC is a register we have created with the duplicated value to be set.  */
24899 static void
24900 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24901 					    machine_mode mode)
24902 {
24903   /* If we are copying 128 bits or 256 bits, we can do that straight from
24904      the SIMD register we prepared.  */
24905   if (known_eq (GET_MODE_BITSIZE (mode), 256))
24906     {
24907       mode = GET_MODE (src);
24908       /* "Cast" the *dst to the correct mode.  */
24909       *dst = adjust_address (*dst, mode, 0);
24910       /* Emit the memset.  */
24911       emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24912 					 aarch64_progress_pointer (*dst), src));
24913 
24914       /* Move the pointers forward.  */
24915       *dst = aarch64_move_pointer (*dst, 32);
24916       return;
24917     }
24918   if (known_eq (GET_MODE_BITSIZE (mode), 128))
24919     {
24920       /* "Cast" the *dst to the correct mode.  */
24921       *dst = adjust_address (*dst, GET_MODE (src), 0);
24922       /* Emit the memset.  */
24923       emit_move_insn (*dst, src);
24924       /* Move the pointers forward.  */
24925       *dst = aarch64_move_pointer (*dst, 16);
24926       return;
24927     }
24928   /* For copying less, we have to extract the right amount from src.  */
24929   rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24930 
24931   /* "Cast" the *dst to the correct mode.  */
24932   *dst = adjust_address (*dst, mode, 0);
24933   /* Emit the memset.  */
24934   emit_move_insn (*dst, reg);
24935   /* Move the pointer forward.  */
24936   *dst = aarch64_progress_pointer (*dst);
24937 }
24938 
24939 /* Expand a setmem using the MOPS instructions.  OPERANDS are the same
24940    as for the setmem pattern.  Return true iff we succeed.  */
24941 static bool
24942 aarch64_expand_setmem_mops (rtx *operands)
24943 {
24944   if (!TARGET_MOPS)
24945     return false;
24946 
24947   /* The first two registers are changed by the instruction, so both
24948      of them must be a fresh pseudo.  */
24949   rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24950   rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24951   rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24952   rtx val = operands[2];
24953   if (val != CONST0_RTX (QImode))
24954     val = force_reg (QImode, val);
24955   emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24956   return true;
24957 }
24958 
24959 /* Expand setmem, as if from a __builtin_memset.  Return true if
24960    we succeed, otherwise return false.  */
24961 
24962 bool
24963 aarch64_expand_setmem (rtx *operands)
24964 {
24965   int n, mode_bits;
24966   unsigned HOST_WIDE_INT len;
24967   rtx dst = operands[0];
24968   rtx val = operands[2], src;
24969   rtx base;
24970   machine_mode cur_mode = BLKmode, next_mode;
24971 
24972   /* If we don't have SIMD registers or the size is variable, use the MOPS
24973      inlined sequence if possible.  */
24974   if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24975     return aarch64_expand_setmem_mops (operands);
24976 
24977   bool size_p = optimize_function_for_size_p (cfun);
24978 
24979   /* Default the maximum to 256-bytes when considering only libcall vs
24980      SIMD broadcast sequence.  */
24981   unsigned max_set_size = 256;
24982 
24983   len = INTVAL (operands[1]);
24984   if (len > max_set_size && !TARGET_MOPS)
24985     return false;
24986 
24987   int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
24988   /* The MOPS sequence takes:
24989      3 instructions for the memory storing
24990      + 1 to move the constant size into a reg
24991      + 1 if VAL is a non-zero constant to move into a reg
24992     (zero constants can use XZR directly).  */
24993   unsigned mops_cost = 3 + 1 + cst_val;
24994   /* A libcall to memset in the worst case takes 3 instructions to prepare
24995      the arguments + 1 for the call.  */
24996   unsigned libcall_cost = 4;
24997 
24998   /* Upper bound check.  For large constant-sized setmem use the MOPS sequence
24999      when available.  */
25000   if (TARGET_MOPS
25001       && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25002     return aarch64_expand_setmem_mops (operands);
25003 
25004   /* Attempt a sequence with a vector broadcast followed by stores.
25005      Count the number of operations involved to see if it's worth it
25006      against the alternatives.  A simple counter simd_ops on the
25007      algorithmically-relevant operations is used rather than an rtx_insn count
25008      as all the pointer adjustments and mode reinterprets will be optimized
25009      away later.  */
25010   start_sequence ();
25011   unsigned simd_ops = 0;
25012 
25013   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25014   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25015 
25016   /* Prepare the val using a DUP/MOVI v0.16B, val.  */
25017   src = expand_vector_broadcast (V16QImode, val);
25018   src = force_reg (V16QImode, src);
25019   simd_ops++;
25020   /* Convert len to bits to make the rest of the code simpler.  */
25021   n = len * BITS_PER_UNIT;
25022 
25023   /* Maximum amount to copy in one go.  We allow 256-bit chunks based on the
25024      AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter.  */
25025   const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25026 			  & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25027 			  ? GET_MODE_BITSIZE (TImode) : 256;
25028 
25029   while (n > 0)
25030     {
25031       /* Find the largest mode in which to do the copy without
25032 	 overwriting.  */
25033       opt_scalar_int_mode mode_iter;
25034       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25035 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25036 	  cur_mode = mode_iter.require ();
25037 
25038       gcc_assert (cur_mode != BLKmode);
25039 
25040       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25041       aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25042       simd_ops++;
25043       n -= mode_bits;
25044 
25045       /* Do certain trailing copies as overlapping if it's going to be
25046 	 cheaper, i.e. fewer instructions.  For instance, for a 15-byte copy
25047 	 it's more efficient to do two overlapping 8-byte copies than
25048 	 8 + 4 + 2 + 1.  Only do this when -mstrict-align is not supplied.  */
25049       if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25050 	{
25051 	  next_mode = smallest_mode_for_size (n, MODE_INT);
25052 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25053 	  gcc_assert (n_bits <= mode_bits);
25054 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25055 	  n = n_bits;
25056 	}
25057     }
25058   rtx_insn *seq = get_insns ();
25059   end_sequence ();
25060 
25061   if (size_p)
25062     {
25063       /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
25064 	 a call to memset, or the MOPS expansion.  */
25065       if (TARGET_MOPS
25066 	  && mops_cost <= libcall_cost
25067 	  && mops_cost <= simd_ops)
25068 	return aarch64_expand_setmem_mops (operands);
25069       /* If MOPS is not available or not shorter pick a libcall if the SIMD
25070 	 sequence is too long.  */
25071       else if (libcall_cost < simd_ops)
25072 	return false;
25073       emit_insn (seq);
25074       return true;
25075     }
25076 
25077   /* At this point the SIMD broadcast sequence is the best choice when
25078      optimizing for speed.  */
25079   emit_insn (seq);
25080   return true;
25081 }
25082 
25083 
25084 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25085    SImode stores.  Handle the case when the constant has identical
25086    bottom and top halves.  This is beneficial when the two stores can be
25087    merged into an STP and we avoid synthesising potentially expensive
25088    immediates twice.  Return true if such a split is possible.  */
25089 
25090 bool
25091 aarch64_split_dimode_const_store (rtx dst, rtx src)
25092 {
25093   rtx lo = gen_lowpart (SImode, src);
25094   rtx hi = gen_highpart_mode (SImode, DImode, src);
25095 
25096   bool size_p = optimize_function_for_size_p (cfun);
25097 
25098   if (!rtx_equal_p (lo, hi))
25099     return false;
25100 
25101   unsigned int orig_cost
25102     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25103   unsigned int lo_cost
25104     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25105 
25106   /* We want to transform:
25107      MOV	x1, 49370
25108      MOVK	x1, 0x140, lsl 16
25109      MOVK	x1, 0xc0da, lsl 32
25110      MOVK	x1, 0x140, lsl 48
25111      STR	x1, [x0]
25112    into:
25113      MOV	w1, 49370
25114      MOVK	w1, 0x140, lsl 16
25115      STP	w1, w1, [x0]
25116    So we want to perform this only when we save two instructions
25117    or more.  When optimizing for size, however, accept any code size
25118    savings we can.  */
25119   if (size_p && orig_cost <= lo_cost)
25120     return false;
25121 
25122   if (!size_p
25123       && (orig_cost <= lo_cost + 1))
25124     return false;
25125 
25126   rtx mem_lo = adjust_address (dst, SImode, 0);
25127   if (!aarch64_mem_pair_operand (mem_lo, SImode))
25128     return false;
25129 
25130   rtx tmp_reg = gen_reg_rtx (SImode);
25131   aarch64_expand_mov_immediate (tmp_reg, lo);
25132   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25133   /* Don't emit an explicit store pair as this may not always be profitable.
25134      Let the sched-fusion logic decide whether to merge them.  */
25135   emit_move_insn (mem_lo, tmp_reg);
25136   emit_move_insn (mem_hi, tmp_reg);
25137 
25138   return true;
25139 }
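
/* Standalone sketch (plain C++ only) of the "identical halves" test
   above, using the constant implied by the comment
   (0x0140c0da0140c0da): its low and high 32-bit halves are equal, so a
   single SImode immediate stored twice reproduces the DImode value.
   Guarded out of the build.  */
#if 0
#include <cstdint>
#include <cstdio>

int
main ()
{
  const uint64_t src = 0x0140c0da0140c0daULL;
  const uint32_t lo = (uint32_t) src;
  const uint32_t hi = (uint32_t) (src >> 32);

  printf ("lo = 0x%08x, hi = 0x%08x, split %s\n", lo, hi,
	  lo == hi ? "possible" : "not possible");
  return 0;
}
#endif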
25140 
25141 /* Generate RTL for a conditional branch with rtx comparison CODE in
25142    mode CC_MODE.  The destination of the unlikely conditional branch
25143    is LABEL_REF.  */
25144 
25145 void
25146 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25147 			      rtx label_ref)
25148 {
25149   rtx x;
25150   x = gen_rtx_fmt_ee (code, VOIDmode,
25151 		      gen_rtx_REG (cc_mode, CC_REGNUM),
25152 		      const0_rtx);
25153 
25154   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25155 			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
25156 			    pc_rtx);
25157   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25158 }
25159 
25160 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25161 
25162    OP1 represents the TImode destination operand 1
25163    OP2 represents the TImode destination operand 2
25164    LOW_DEST represents the low half (DImode) of TImode operand 0
25165    LOW_IN1 represents the low half (DImode) of TImode operand 1
25166    LOW_IN2 represents the low half (DImode) of TImode operand 2
25167    HIGH_DEST represents the high half (DImode) of TImode operand 0
25168    HIGH_IN1 represents the high half (DImode) of TImode operand 1
25169    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
25170 
25171 void
25172 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25173 			    rtx *low_in1, rtx *low_in2,
25174 			    rtx *high_dest, rtx *high_in1,
25175 			    rtx *high_in2)
25176 {
25177   *low_dest = gen_reg_rtx (DImode);
25178   *low_in1 = gen_lowpart (DImode, op1);
25179   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25180 				  subreg_lowpart_offset (DImode, TImode));
25181   *high_dest = gen_reg_rtx (DImode);
25182   *high_in1 = gen_highpart (DImode, op1);
25183   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25184 				   subreg_highpart_offset (DImode, TImode));
25185 }
25186 
25187 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25188 
25189    This function differs from 'aarch64_addti_scratch_regs' in that
25190    OP1 can be an immediate constant (zero). We must call
25191    subreg_highpart_offset with DImode and TImode arguments, otherwise
25192    VOIDmode will be used for the const_int which generates an internal
25193    error from subreg_size_highpart_offset which does not expect a size of zero.
25194 
25195    OP1 represents the TImode destination operand 1
25196    OP2 represents the TImode destination operand 2
25197    LOW_DEST represents the low half (DImode) of TImode operand 0
25198    LOW_IN1 represents the low half (DImode) of TImode operand 1
25199    LOW_IN2 represents the low half (DImode) of TImode operand 2
25200    HIGH_DEST represents the high half (DImode) of TImode operand 0
25201    HIGH_IN1 represents the high half (DImode) of TImode operand 1
25202    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
25203 
25204 
25205 void
25206 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25207 			     rtx *low_in1, rtx *low_in2,
25208 			     rtx *high_dest, rtx *high_in1,
25209 			     rtx *high_in2)
25210 {
25211   *low_dest = gen_reg_rtx (DImode);
25212   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25213 				  subreg_lowpart_offset (DImode, TImode));
25214 
25215   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25216 				  subreg_lowpart_offset (DImode, TImode));
25217   *high_dest = gen_reg_rtx (DImode);
25218 
25219   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25220 				   subreg_highpart_offset (DImode, TImode));
25221   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25222 				   subreg_highpart_offset (DImode, TImode));
25223 }
25224 
25225 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25226 
25227    OP0 represents the TImode destination operand 0
25228    LOW_DEST represents the low half (DImode) of TImode operand 0
25229    LOW_IN1 represents the low half (DImode) of TImode operand 1
25230    LOW_IN2 represents the low half (DImode) of TImode operand 2
25231    HIGH_DEST represents the high half (DImode) of TImode operand 0
25232    HIGH_IN1 represents the high half (DImode) of TImode operand 1
25233    HIGH_IN2 represents the high half (DImode) of TImode operand 2
25234    UNSIGNED_P is true if the operation is being performed on unsigned
25235    values.  */
25236 void
25237 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25238 		       rtx low_in2, rtx high_dest, rtx high_in1,
25239 		       rtx high_in2, bool unsigned_p)
25240 {
25241   if (low_in2 == const0_rtx)
25242     {
25243       low_dest = low_in1;
25244       high_in2 = force_reg (DImode, high_in2);
25245       if (unsigned_p)
25246 	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25247       else
25248 	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25249     }
25250   else
25251     {
25252       if (aarch64_plus_immediate (low_in2, DImode))
25253 	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25254 					    GEN_INT (-UINTVAL (low_in2))));
25255       else
25256 	{
25257 	  low_in2 = force_reg (DImode, low_in2);
25258 	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25259 	}
25260       high_in2 = force_reg (DImode, high_in2);
25261 
25262       if (unsigned_p)
25263 	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25264       else
25265 	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25266     }
25267 
25268   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25269   emit_move_insn (gen_highpart (DImode, op0), high_dest);
25270 
25271 }
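
/* Standalone model (plain C++ only) of the 64-bit borrow chain that the
   expansion above sets up with SUBS on the low halves followed by a
   carry-in subtract on the high halves; it ignores the overflow/carry
   flag outputs the real patterns also produce.  Guarded out.  */
#if 0
#include <cstdint>
#include <cstdio>

static void
sub128 (uint64_t l1, uint64_t h1, uint64_t l2, uint64_t h2,
	uint64_t *lo, uint64_t *hi)
{
  *lo = l1 - l2;
  uint64_t borrow = l1 < l2;	/* Borrow out of the low subtraction.  */
  *hi = h1 - h2 - borrow;
}

int
main ()
{
  uint64_t lo, hi;

  /* {lo = ~0, hi = 0} minus 2: no borrow into the high half.  */
  sub128 (~0ULL, 0, 2, 0, &lo, &hi);
  printf ("lo = 0x%016llx, hi = 0x%016llx\n",
	  (unsigned long long) lo, (unsigned long long) hi);

  /* {lo = 1, hi = 0} minus 2: the high half absorbs the borrow.  */
  sub128 (1, 0, 2, 0, &lo, &hi);
  printf ("lo = 0x%016llx, hi = 0x%016llx\n",
	  (unsigned long long) lo, (unsigned long long) hi);
  return 0;
}
#endif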
25272 
25273 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
25274 
25275 static unsigned HOST_WIDE_INT
25276 aarch64_asan_shadow_offset (void)
25277 {
25278   if (TARGET_ILP32)
25279     return (HOST_WIDE_INT_1 << 29);
25280   else
25281     return (HOST_WIDE_INT_1 << 36);
25282 }
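
/* Standalone sketch (plain C++ only) of how the offset above is used:
   with ASan's usual 8-byte shadow granularity the shadow byte for ADDR
   is at (ADDR >> 3) + offset, i.e. + (1 << 36) for LP64 and + (1 << 29)
   for ILP32.  The address below is an arbitrary example.  Guarded out.  */
#if 0
#include <cstdint>
#include <cstdio>

int
main ()
{
  const uint64_t shadow_offset = 1ULL << 36;	/* LP64 value above.  */
  const uint64_t addr = 0x0000aaaadeadbeefULL;	/* Arbitrary example.  */

  printf ("shadow byte of 0x%llx is at 0x%llx\n",
	  (unsigned long long) addr,
	  (unsigned long long) ((addr >> 3) + shadow_offset));
  return 0;
}
#endif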
25283 
25284 static rtx
25285 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25286 			int code, tree treeop0, tree treeop1)
25287 {
25288   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25289   rtx op0, op1;
25290   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25291   insn_code icode;
25292   struct expand_operand ops[4];
25293 
25294   start_sequence ();
25295   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25296 
25297   op_mode = GET_MODE (op0);
25298   if (op_mode == VOIDmode)
25299     op_mode = GET_MODE (op1);
25300 
25301   switch (op_mode)
25302     {
25303     case E_QImode:
25304     case E_HImode:
25305     case E_SImode:
25306       cmp_mode = SImode;
25307       icode = CODE_FOR_cmpsi;
25308       break;
25309 
25310     case E_DImode:
25311       cmp_mode = DImode;
25312       icode = CODE_FOR_cmpdi;
25313       break;
25314 
25315     case E_SFmode:
25316       cmp_mode = SFmode;
25317       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25318       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25319       break;
25320 
25321     case E_DFmode:
25322       cmp_mode = DFmode;
25323       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25324       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25325       break;
25326 
25327     default:
25328       end_sequence ();
25329       return NULL_RTX;
25330     }
25331 
25332   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25333   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25334   if (!op0 || !op1)
25335     {
25336       end_sequence ();
25337       return NULL_RTX;
25338     }
25339   *prep_seq = get_insns ();
25340   end_sequence ();
25341 
25342   create_fixed_operand (&ops[0], op0);
25343   create_fixed_operand (&ops[1], op1);
25344 
25345   start_sequence ();
25346   if (!maybe_expand_insn (icode, 2, ops))
25347     {
25348       end_sequence ();
25349       return NULL_RTX;
25350     }
25351   *gen_seq = get_insns ();
25352   end_sequence ();
25353 
25354   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25355 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
25356 }
25357 
25358 static rtx
25359 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25360 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
25361 {
25362   rtx op0, op1, target;
25363   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25364   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25365   insn_code icode;
25366   struct expand_operand ops[6];
25367   int aarch64_cond;
25368 
25369   push_to_sequence (*prep_seq);
25370   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25371 
25372   op_mode = GET_MODE (op0);
25373   if (op_mode == VOIDmode)
25374     op_mode = GET_MODE (op1);
25375 
25376   switch (op_mode)
25377     {
25378     case E_QImode:
25379     case E_HImode:
25380     case E_SImode:
25381       cmp_mode = SImode;
25382       break;
25383 
25384     case E_DImode:
25385       cmp_mode = DImode;
25386       break;
25387 
25388     case E_SFmode:
25389       cmp_mode = SFmode;
25390       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25391       break;
25392 
25393     case E_DFmode:
25394       cmp_mode = DFmode;
25395       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25396       break;
25397 
25398     default:
25399       end_sequence ();
25400       return NULL_RTX;
25401     }
25402 
25403   icode = code_for_ccmp (cc_mode, cmp_mode);
25404 
25405   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25406   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25407   if (!op0 || !op1)
25408     {
25409       end_sequence ();
25410       return NULL_RTX;
25411     }
25412   *prep_seq = get_insns ();
25413   end_sequence ();
25414 
25415   target = gen_rtx_REG (cc_mode, CC_REGNUM);
25416   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25417 
25418   if (bit_code != AND)
25419     {
25420       /* Treat the ccmp patterns as canonical and use them where possible,
25421 	 but fall back to ccmp_rev patterns if there's no other option.  */
25422       rtx_code prev_code = GET_CODE (prev);
25423       machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25424       if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25425 	  && !(prev_code == EQ
25426 	       || prev_code == NE
25427 	       || prev_code == ORDERED
25428 	       || prev_code == UNORDERED))
25429 	icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25430       else
25431 	{
25432 	  rtx_code code = reverse_condition (prev_code);
25433 	  prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25434 	}
25435       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25436     }
25437 
25438   create_fixed_operand (&ops[0], XEXP (prev, 0));
25439   create_fixed_operand (&ops[1], target);
25440   create_fixed_operand (&ops[2], op0);
25441   create_fixed_operand (&ops[3], op1);
25442   create_fixed_operand (&ops[4], prev);
25443   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25444 
25445   push_to_sequence (*gen_seq);
25446   if (!maybe_expand_insn (icode, 6, ops))
25447     {
25448       end_sequence ();
25449       return NULL_RTX;
25450     }
25451 
25452   *gen_seq = get_insns ();
25453   end_sequence ();
25454 
25455   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25456 }
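
/* Standalone illustration (plain C++ only) of the kind of source
   construct the two hooks above exist for: a short-circuit condition
   built from two integer comparisons, which the middle end can expand
   as one compare followed by one conditional compare instead of two
   branches.  The exact code generated depends on the target and
   options.  Guarded out of the build.  */
#if 0
static int
both_conditions_hold (long a, long b)
{
  return a == 0 && b > 42;
}
#endif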
25457 
25458 #undef TARGET_GEN_CCMP_FIRST
25459 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25460 
25461 #undef TARGET_GEN_CCMP_NEXT
25462 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25463 
25464 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
25465    instruction fusion of some sort.  */
25466 
25467 static bool
25468 aarch64_macro_fusion_p (void)
25469 {
25470   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25471 }
25472 
25473 
25474 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
25475    should be kept together during scheduling.  */
25476 
25477 static bool
25478 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25479 {
25480   rtx set_dest;
25481   rtx prev_set = single_set (prev);
25482   rtx curr_set = single_set (curr);
25483   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
25484   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25485 
25486   if (!aarch64_macro_fusion_p ())
25487     return false;
25488 
25489   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25490     {
25491       /* We are trying to match:
25492          prev (mov)  == (set (reg r0) (const_int imm16))
25493          curr (movk) == (set (zero_extract (reg r0)
25494                                            (const_int 16)
25495                                            (const_int 16))
25496                              (const_int imm16_1))  */
25497 
25498       set_dest = SET_DEST (curr_set);
25499 
25500       if (GET_CODE (set_dest) == ZERO_EXTRACT
25501           && CONST_INT_P (SET_SRC (curr_set))
25502           && CONST_INT_P (SET_SRC (prev_set))
25503           && CONST_INT_P (XEXP (set_dest, 2))
25504           && INTVAL (XEXP (set_dest, 2)) == 16
25505           && REG_P (XEXP (set_dest, 0))
25506           && REG_P (SET_DEST (prev_set))
25507           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25508         {
25509           return true;
25510         }
25511     }
25512 
25513   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25514     {
25515 
25516       /*  We're trying to match:
25517           prev (adrp) == (set (reg r1)
25518                               (high (symbol_ref ("SYM"))))
25519           curr (add) == (set (reg r0)
25520                              (lo_sum (reg r1)
25521                                      (symbol_ref ("SYM"))))
25522           Note that r0 need not necessarily be the same as r1, especially
25523           during pre-regalloc scheduling.  */
25524 
25525       if (satisfies_constraint_Ush (SET_SRC (prev_set))
25526           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25527         {
25528           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25529               && REG_P (XEXP (SET_SRC (curr_set), 0))
25530               && REGNO (XEXP (SET_SRC (curr_set), 0))
25531                  == REGNO (SET_DEST (prev_set))
25532               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25533                               XEXP (SET_SRC (curr_set), 1)))
25534             return true;
25535         }
25536     }
25537 
25538   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25539     {
25540 
25541       /* We're trying to match:
25542          prev (movk) == (set (zero_extract (reg r0)
25543                                            (const_int 16)
25544                                            (const_int 32))
25545                              (const_int imm16_1))
25546          curr (movk) == (set (zero_extract (reg r0)
25547                                            (const_int 16)
25548                                            (const_int 48))
25549                              (const_int imm16_2))  */
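      /* For example (illustrative operands), this corresponds to the pair:
           movk  x0, #0x1234, lsl #32
           movk  x0, #0x5678, lsl #48  */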
25550 
25551       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25552           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25553           && REG_P (XEXP (SET_DEST (prev_set), 0))
25554           && REG_P (XEXP (SET_DEST (curr_set), 0))
25555           && REGNO (XEXP (SET_DEST (prev_set), 0))
25556              == REGNO (XEXP (SET_DEST (curr_set), 0))
25557           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25558           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25559           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25560           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25561           && CONST_INT_P (SET_SRC (prev_set))
25562           && CONST_INT_P (SET_SRC (curr_set)))
25563         return true;
25564 
25565     }
25566   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25567     {
25568       /* We're trying to match:
25569           prev (adrp) == (set (reg r0)
25570                               (high (symbol_ref ("SYM"))))
25571           curr (ldr) == (set (reg r1)
25572                              (mem (lo_sum (reg r0)
25573                                              (symbol_ref ("SYM")))))
25574                  or
25575           curr (ldr) == (set (reg r1)
25576                              (zero_extend (mem
25577                                            (lo_sum (reg r0)
25578                                                    (symbol_ref ("SYM"))))))  */
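      /* For example (symbol and registers illustrative):
           adrp  x0, sym
           ldr   x1, [x0, #:lo12:sym]  */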
25579       if (satisfies_constraint_Ush (SET_SRC (prev_set))
25580           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25581         {
25582           rtx curr_src = SET_SRC (curr_set);
25583 
25584           if (GET_CODE (curr_src) == ZERO_EXTEND)
25585             curr_src = XEXP (curr_src, 0);
25586 
25587           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25588               && REG_P (XEXP (XEXP (curr_src, 0), 0))
25589               && REGNO (XEXP (XEXP (curr_src, 0), 0))
25590                  == REGNO (SET_DEST (prev_set))
25591               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25592                               XEXP (SET_SRC (prev_set), 0)))
25593               return true;
25594         }
25595     }
25596 
25597   /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch.  */
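  /* For example (illustrative operands), a pair such as:
       cmp   w0, #4
       b.ne  .L2  */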
25598   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25599       && prev_set && curr_set && any_condjump_p (curr)
25600       && GET_CODE (SET_SRC (prev_set)) == COMPARE
25601       && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25602       && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25603     return true;
25604 
25605   /* Fuse flag-setting ALU instructions and conditional branch.  */
25606   if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25607       && any_condjump_p (curr))
25608     {
25609       unsigned int condreg1, condreg2;
25610       rtx cc_reg_1;
25611       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25612       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25613 
25614       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25615 	  && prev
25616 	  && modified_in_p (cc_reg_1, prev))
25617 	{
25618 	  enum attr_type prev_type = get_attr_type (prev);
25619 
25620 	  /* FIXME: this misses some cases that are considered simple arithmetic
25621 	     instructions for ThunderX.  Simple shifts are missed here.  */
25622 	  if (prev_type == TYPE_ALUS_SREG
25623 	      || prev_type == TYPE_ALUS_IMM
25624 	      || prev_type == TYPE_LOGICS_REG
25625 	      || prev_type == TYPE_LOGICS_IMM)
25626 	    return true;
25627 	}
25628     }
25629 
25630   /* Fuse ALU instructions and CBZ/CBNZ.  */
25631   if (prev_set
25632       && curr_set
25633       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25634       && any_condjump_p (curr))
25635     {
25636       /* We're trying to match:
25637 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25638 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
25639 							 (const_int 0))
25640 						 (label_ref ("SYM"))
25641 						 (pc))  */
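      /* For example (illustrative operands), a pair such as:
           add   w0, w0, w1
           cbnz  w0, .L3  */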
25642       if (SET_DEST (curr_set) == (pc_rtx)
25643 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25644 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25645 	  && REG_P (SET_DEST (prev_set))
25646 	  && REGNO (SET_DEST (prev_set))
25647 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25648 	{
25649 	  /* Fuse ALU operations followed by conditional branch instruction.  */
25650 	  switch (get_attr_type (prev))
25651 	    {
25652 	    case TYPE_ALU_IMM:
25653 	    case TYPE_ALU_SREG:
25654 	    case TYPE_ADC_REG:
25655 	    case TYPE_ADC_IMM:
25656 	    case TYPE_ADCS_REG:
25657 	    case TYPE_ADCS_IMM:
25658 	    case TYPE_LOGIC_REG:
25659 	    case TYPE_LOGIC_IMM:
25660 	    case TYPE_CSEL:
25661 	    case TYPE_ADR:
25662 	    case TYPE_MOV_IMM:
25663 	    case TYPE_SHIFT_REG:
25664 	    case TYPE_SHIFT_IMM:
25665 	    case TYPE_BFM:
25666 	    case TYPE_RBIT:
25667 	    case TYPE_REV:
25668 	    case TYPE_EXTEND:
25669 	      return true;
25670 
25671 	    default:;
25672 	    }
25673 	}
25674     }
25675 
25676   /* Fuse A+B+1 and A-B-1 */
25677   if (simple_sets_p
25678       && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25679     {
25680       /* We're trying to match:
25681 	  prev == (set (r0) (plus (r0) (r1)))
25682 	  curr == (set (r0) (plus (r0) (const_int 1)))
25683 	or:
25684 	  prev == (set (r0) (minus (r0) (r1)))
25685 	  curr == (set (r0) (plus (r0) (const_int -1))) */
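      /* For example (illustrative operands):
           add  x0, x0, x1
           add  x0, x0, #1
         or the analogous sub / sub #1 (i.e. add #-1) pair.  */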
25686 
25687       rtx prev_src = SET_SRC (prev_set);
25688       rtx curr_src = SET_SRC (curr_set);
25689 
25690       int polarity = 1;
25691       if (GET_CODE (prev_src) == MINUS)
25692 	polarity = -1;
25693 
25694       if (GET_CODE (curr_src) == PLUS
25695 	  && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25696 	  && CONST_INT_P (XEXP (curr_src, 1))
25697 	  && INTVAL (XEXP (curr_src, 1)) == polarity
25698 	  && REG_P (XEXP (curr_src, 0))
25699 	  && REG_P (SET_DEST (prev_set))
25700 	  && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25701 	return true;
25702     }
25703 
25704   return false;
25705 }
25706 
25707 /* Return true iff the instruction fusion described by OP is enabled.  */
25708 
25709 bool
25710 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25711 {
25712   return (aarch64_tune_params.fusible_ops & op) != 0;
25713 }
25714 
25715 /* If MEM's address is in the form of [base+offset], extract the two
25716    parts into BASE and OFFSET and return true; otherwise return false
25717    after clearing BASE and OFFSET.  */
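/* For example (RTL shown purely for illustration), for a MEM whose address
   is (plus (reg x1) (const_int 16)) this sets *BASE to (reg x1) and
   *OFFSET to (const_int 16), and returns true.  */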
25718 
25719 bool
25720 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25721 {
25722   rtx addr;
25723 
25724   gcc_assert (MEM_P (mem));
25725 
25726   addr = XEXP (mem, 0);
25727 
25728   if (REG_P (addr))
25729     {
25730       *base = addr;
25731       *offset = const0_rtx;
25732       return true;
25733     }
25734 
25735   if (GET_CODE (addr) == PLUS
25736       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25737     {
25738       *base = XEXP (addr, 0);
25739       *offset = XEXP (addr, 1);
25740       return true;
25741     }
25742 
25743   *base = NULL_RTX;
25744   *offset = NULL_RTX;
25745 
25746   return false;
25747 }
25748 
25749 /* Types for scheduling fusion.  */
25750 enum sched_fusion_type
25751 {
25752   SCHED_FUSION_NONE = 0,
25753   SCHED_FUSION_LD_SIGN_EXTEND,
25754   SCHED_FUSION_LD_ZERO_EXTEND,
25755   SCHED_FUSION_LD,
25756   SCHED_FUSION_ST,
25757   SCHED_FUSION_NUM
25758 };
25759 
25760 /* If INSN is a load or store with an address in the form of [base+offset],
25761    extract the two parts into BASE and OFFSET.  Return the scheduling
25762    fusion type of this INSN.  */
25763 
25764 static enum sched_fusion_type
25765 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25766 {
25767   rtx x, dest, src;
25768   enum sched_fusion_type fusion = SCHED_FUSION_LD;
25769 
25770   gcc_assert (INSN_P (insn));
25771   x = PATTERN (insn);
25772   if (GET_CODE (x) != SET)
25773     return SCHED_FUSION_NONE;
25774 
25775   src = SET_SRC (x);
25776   dest = SET_DEST (x);
25777 
25778   machine_mode dest_mode = GET_MODE (dest);
25779 
25780   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25781     return SCHED_FUSION_NONE;
25782 
25783   if (GET_CODE (src) == SIGN_EXTEND)
25784     {
25785       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25786       src = XEXP (src, 0);
25787       if (!MEM_P (src) || GET_MODE (src) != SImode)
25788 	return SCHED_FUSION_NONE;
25789     }
25790   else if (GET_CODE (src) == ZERO_EXTEND)
25791     {
25792       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25793       src = XEXP (src, 0);
25794       if (!MEM_P (src) || GET_MODE (src) != SImode)
25795 	return SCHED_FUSION_NONE;
25796     }
25797 
25798   if (MEM_P (src) && REG_P (dest))
25799     extract_base_offset_in_addr (src, base, offset);
25800   else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25801     {
25802       fusion = SCHED_FUSION_ST;
25803       extract_base_offset_in_addr (dest, base, offset);
25804     }
25805   else
25806     return SCHED_FUSION_NONE;
25807 
25808   if (*base == NULL_RTX || *offset == NULL_RTX)
25809     fusion = SCHED_FUSION_NONE;
25810 
25811   return fusion;
25812 }
25813 
25814 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25815 
25816    Currently we only support fusing ldr and str instructions, so FUSION_PRI
25817    and PRI are only calculated for these instructions.  For other instructions,
25818    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
25819    types of instruction fusion can be added by returning different priorities.
25820 
25821    It's important that irrelevant instructions get the largest FUSION_PRI.  */
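/* For example (illustrative), two loads such as ldr w1, [x3, #4] and
   ldr w2, [x3, #8] share the same base register and fusion type, so they
   receive the same FUSION_PRI; the one with the smaller offset gets the
   larger PRI and is therefore scheduled first.  */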
25822 
25823 static void
25824 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25825 			       int *fusion_pri, int *pri)
25826 {
25827   int tmp, off_val;
25828   rtx base, offset;
25829   enum sched_fusion_type fusion;
25830 
25831   gcc_assert (INSN_P (insn));
25832 
25833   tmp = max_pri - 1;
25834   fusion = fusion_load_store (insn, &base, &offset);
25835   if (fusion == SCHED_FUSION_NONE)
25836     {
25837       *pri = tmp;
25838       *fusion_pri = tmp;
25839       return;
25840     }
25841 
25842   /* Set FUSION_PRI according to fusion type and base register.  */
25843   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25844 
25845   /* Calculate PRI.  */
25846   tmp /= 2;
25847 
25848   /* INSN with smaller offset goes first.  */
25849   off_val = (int)(INTVAL (offset));
25850   if (off_val >= 0)
25851     tmp -= (off_val & 0xfffff);
25852   else
25853     tmp += ((- off_val) & 0xfffff);
25854 
25855   *pri = tmp;
25856   return;
25857 }
25858 
25859 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25860    Adjust priority of sha1h instructions so they are scheduled before
25861    other SHA1 instructions.  */
25862 
25863 static int
25864 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25865 {
25866   rtx x = PATTERN (insn);
25867 
25868   if (GET_CODE (x) == SET)
25869     {
25870       x = SET_SRC (x);
25871 
25872       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25873 	return priority + 10;
25874     }
25875 
25876   return priority;
25877 }
25878 
25879 /* If REVERSED is null, return true if memory reference *MEM2 comes
25880    immediately after memory reference *MEM1.  Do not change the references
25881    in this case.
25882 
25883    Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25884    if they are, try to make them use constant offsets from the same base
25885    register.  Return true on success.  When returning true, set *REVERSED
25886    to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2.  */
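/* For example (registers and sizes illustrative), a 4-byte reference at
   [x0, #8] followed by a 4-byte reference at [x0, #12] is consecutive;
   with REVERSED non-null the function returns true and sets *REVERSED
   to false.  */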
25887 static bool
25888 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25889 {
25890   if (reversed)
25891     *reversed = false;
25892 
25893   if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25894       || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25895     return false;
25896 
25897   if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25898     return false;
25899 
25900   auto size1 = MEM_SIZE (*mem1);
25901   auto size2 = MEM_SIZE (*mem2);
25902 
25903   rtx base1, base2, offset1, offset2;
25904   extract_base_offset_in_addr (*mem1, &base1, &offset1);
25905   extract_base_offset_in_addr (*mem2, &base2, &offset2);
25906 
25907   /* Make sure at least one memory is in base+offset form.  */
25908   if (!(base1 && offset1) && !(base2 && offset2))
25909     return false;
25910 
25911   /* If both mems already use the same base register, just check the
25912      offsets.  */
25913   if (base1 && base2 && rtx_equal_p (base1, base2))
25914     {
25915       if (!offset1 || !offset2)
25916 	return false;
25917 
25918       if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25919 	return true;
25920 
25921       if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25922 	{
25923 	  *reversed = true;
25924 	  return true;
25925 	}
25926 
25927       return false;
25928     }
25929 
25930   /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25931      guarantee that the values are consecutive.  */
25932   if (MEM_EXPR (*mem1)
25933       && MEM_EXPR (*mem2)
25934       && MEM_OFFSET_KNOWN_P (*mem1)
25935       && MEM_OFFSET_KNOWN_P (*mem2))
25936     {
25937       poly_int64 expr_offset1;
25938       poly_int64 expr_offset2;
25939       tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25940 						       &expr_offset1);
25941       tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25942 						       &expr_offset2);
25943       if (!expr_base1
25944 	  || !expr_base2
25945 	  || !DECL_P (expr_base1)
25946 	  || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25947 	return false;
25948 
25949       expr_offset1 += MEM_OFFSET (*mem1);
25950       expr_offset2 += MEM_OFFSET (*mem2);
25951 
25952       if (known_eq (expr_offset1 + size1, expr_offset2))
25953 	;
25954       else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25955 	*reversed = true;
25956       else
25957 	return false;
25958 
25959       if (reversed)
25960 	{
25961 	  if (base2)
25962 	    {
25963 	      rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25964 					 expr_offset1 - expr_offset2);
25965 	      *mem1 = replace_equiv_address_nv (*mem1, addr1);
25966 	    }
25967 	  else
25968 	    {
25969 	      rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25970 					 expr_offset2 - expr_offset1);
25971 	      *mem2 = replace_equiv_address_nv (*mem2, addr2);
25972 	    }
25973 	}
25974       return true;
25975     }
25976 
25977   return false;
25978 }
25979 
25980 /* Return true if MEM1 and MEM2 can be combined into a single access
25981    of mode MODE, with the combined access having the same address as MEM1.  */
25982 
25983 bool
25984 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
25985 {
25986   if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
25987     return false;
25988   return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
25989 }
25990 
25991 /* Given OPERANDS of consecutive load/store, check if we can merge
25992    them into ldp/stp.  LOAD is true if they are load instructions.
25993    MODE is the mode of memory operands.  */
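/* For example (illustrative), for loads OPERANDS is laid out as
   { reg0, mem0, reg1, mem1 }, and a pair such as ldr w0, [x2] followed by
   ldr w1, [x2, #4] can be merged into ldp w0, w1, [x2].  */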
25994 
25995 bool
25996 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
25997 				machine_mode mode)
25998 {
25999   enum reg_class rclass_1, rclass_2;
26000   rtx mem_1, mem_2, reg_1, reg_2;
26001 
26002   /* Allow the tuning structure to disable LDP instruction formation
26003      from combining instructions (e.g., in peephole2).  */
26004   if (load && (aarch64_tune_params.extra_tuning_flags
26005 	       & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE))
26006     return false;
26007 
26008   if (load)
26009     {
26010       mem_1 = operands[1];
26011       mem_2 = operands[3];
26012       reg_1 = operands[0];
26013       reg_2 = operands[2];
26014       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26015       if (REGNO (reg_1) == REGNO (reg_2))
26016 	return false;
26017       if (reg_overlap_mentioned_p (reg_1, mem_2))
26018 	return false;
26019     }
26020   else
26021     {
26022       mem_1 = operands[0];
26023       mem_2 = operands[2];
26024       reg_1 = operands[1];
26025       reg_2 = operands[3];
26026     }
26027 
26028   /* The mems cannot be volatile.  */
26029   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26030     return false;
26031 
26032   /* If we have SImode and slow unaligned ldp,
26033      check that the alignment is at least 8 bytes.  */
26034   if (mode == SImode
26035       && (aarch64_tune_params.extra_tuning_flags
26036           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26037       && !optimize_size
26038       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26039     return false;
26040 
26041   /* Check if the addresses are in the form of [base+offset].  */
26042   bool reversed = false;
26043   if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26044     return false;
26045 
26046   /* The operands must be of the same size.  */
26047   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26048 			GET_MODE_SIZE (GET_MODE (mem_2))));
26049 
26050   /* The lower memory access must be a mem-pair operand.  */
26051   rtx lower_mem = reversed ? mem_2 : mem_1;
26052   if (!aarch64_mem_pair_operand (lower_mem, GET_MODE (lower_mem)))
26053     return false;
26054 
26055   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26056     rclass_1 = FP_REGS;
26057   else
26058     rclass_1 = GENERAL_REGS;
26059 
26060   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26061     rclass_2 = FP_REGS;
26062   else
26063     rclass_2 = GENERAL_REGS;
26064 
26065   /* Check if the registers are of the same class.  */
26066   if (rclass_1 != rclass_2)
26067     return false;
26068 
26069   return true;
26070 }
26071 
26072 /* Given OPERANDS of consecutive load/store that can be merged,
26073    swap them if they are not in ascending order.  */
26074 void
26075 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26076 {
26077   int mem_op = load ? 1 : 0;
26078   bool reversed = false;
26079   if (!aarch64_check_consecutive_mems (operands + mem_op,
26080 				       operands + mem_op + 2, &reversed))
26081     gcc_unreachable ();
26082 
26083   if (reversed)
26084     {
26085       /* Irrespective of whether this is a load or a store,
26086 	 we do the same swap.  */
26087       std::swap (operands[0], operands[2]);
26088       std::swap (operands[1], operands[3]);
26089     }
26090 }
26091 
26092 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26093    comparison between the two.  */
26094 int
26095 aarch64_host_wide_int_compare (const void *x, const void *y)
26096 {
26097   return wi::cmps (* ((const HOST_WIDE_INT *) x),
26098 		   * ((const HOST_WIDE_INT *) y));
26099 }
26100 
26101 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26102    other pointing to a REG rtx containing an offset, compare the offsets
26103    of the two pairs.
26104 
26105    Return:
26106 
26107 	1 iff offset (X) > offset (Y)
26108 	0 iff offset (X) == offset (Y)
26109 	-1 iff offset (X) < offset (Y)  */
26110 int
26111 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26112 {
26113   const rtx * operands_1 = (const rtx *) x;
26114   const rtx * operands_2 = (const rtx *) y;
26115   rtx mem_1, mem_2, base, offset_1, offset_2;
26116 
26117   if (MEM_P (operands_1[0]))
26118     mem_1 = operands_1[0];
26119   else
26120     mem_1 = operands_1[1];
26121 
26122   if (MEM_P (operands_2[0]))
26123     mem_2 = operands_2[0];
26124   else
26125     mem_2 = operands_2[1];
26126 
26127   /* Extract the offsets.  */
26128   extract_base_offset_in_addr (mem_1, &base, &offset_1);
26129   extract_base_offset_in_addr (mem_2, &base, &offset_2);
26130 
26131   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26132 
26133   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26134 }
26135 
26136 /* Given OPERANDS of consecutive load/store, check if we can merge
26137    them into ldp/stp by adjusting the offset.  LOAD is true if they
26138    are load instructions.  MODE is the mode of memory operands.
26139 
26140    Given consecutive stores such as:
26141 
26142      str  w1, [xb, 0x100]
26143      str  w1, [xb, 0x104]
26144      str  w1, [xb, 0x108]
26145      str  w1, [xb, 0x10c]
26146 
26147    Though the offsets are out of the range supported by stp, we can
26148    still pair them after adjusting the offset, like:
26149 
26150      add  scratch, xb, 0x100
26151      stp  w1, w1, [scratch]
26152      stp  w1, w1, [scratch, 0x8]
26153 
26154    The peephole patterns detecting this opportunity should guarantee
26155    the scratch register is available.  */
26156 
26157 bool
26158 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26159 				       machine_mode mode)
26160 {
26161   const int num_insns = 4;
26162   enum reg_class rclass;
26163   HOST_WIDE_INT offvals[num_insns], msize;
26164   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26165 
26166   if (load)
26167     {
26168       for (int i = 0; i < num_insns; i++)
26169 	{
26170 	  reg[i] = operands[2 * i];
26171 	  mem[i] = operands[2 * i + 1];
26172 
26173 	  gcc_assert (REG_P (reg[i]));
26174 	}
26175 
26176       /* Do not attempt to merge the loads if the loads clobber each other.  */
26177       for (int i = 0; i < 8; i += 2)
26178 	for (int j = i + 2; j < 8; j += 2)
26179 	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
26180 	    return false;
26181     }
26182   else
26183     for (int i = 0; i < num_insns; i++)
26184       {
26185 	mem[i] = operands[2 * i];
26186 	reg[i] = operands[2 * i + 1];
26187       }
26188 
26189   /* Skip if the memory operand is by itself already valid for ldp/stp.  */
26190   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26191     return false;
26192 
26193   for (int i = 0; i < num_insns; i++)
26194     {
26195       /* The mems cannot be volatile.  */
26196       if (MEM_VOLATILE_P (mem[i]))
26197 	return false;
26198 
26199       /* Check if the addresses are in the form of [base+offset].  */
26200       extract_base_offset_in_addr (mem[i], base + i, offset + i);
26201       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26202 	return false;
26203     }
26204 
26205   /* Check if the registers are of the same class.  */
26206   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26207     ? FP_REGS : GENERAL_REGS;
26208 
26209   for (int i = 1; i < num_insns; i++)
26210     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26211       {
26212 	if (rclass != FP_REGS)
26213 	  return false;
26214       }
26215     else
26216       {
26217 	if (rclass != GENERAL_REGS)
26218 	  return false;
26219       }
26220 
26221   /* Only the last register in the order in which they occur
26222      may be clobbered by the load.  */
26223   if (rclass == GENERAL_REGS && load)
26224     for (int i = 0; i < num_insns - 1; i++)
26225       if (reg_mentioned_p (reg[i], mem[i]))
26226 	return false;
26227 
26228   /* Check if the bases are the same.  */
26229   for (int i = 0; i < num_insns - 1; i++)
26230     if (!rtx_equal_p (base[i], base[i + 1]))
26231       return false;
26232 
26233   for (int i = 0; i < num_insns; i++)
26234     offvals[i] = INTVAL (offset[i]);
26235 
26236   msize = GET_MODE_SIZE (mode).to_constant ();
26237 
26238   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
26239   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26240 	 aarch64_host_wide_int_compare);
26241 
26242   if (!(offvals[1] == offvals[0] + msize
26243 	&& offvals[3] == offvals[2] + msize))
26244     return false;
26245 
26246   /* Check that offsets are within range of each other.  The ldp/stp
26247      instructions have 7-bit immediate offsets, so use 0x80.  */
26248   if (offvals[2] - offvals[0] >= msize * 0x80)
26249     return false;
26250 
26251   /* The offsets must be aligned with respect to each other.  */
26252   if (offvals[0] % msize != offvals[2] % msize)
26253     return false;
26254 
26255   /* If we have SImode and slow unaligned ldp,
26256      check that the alignment is at least 8 bytes.  */
26257   if (mode == SImode
26258       && (aarch64_tune_params.extra_tuning_flags
26259 	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26260       && !optimize_size
26261       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26262     return false;
26263 
26264   return true;
26265 }
26266 
26267 /* Given OPERANDS of consecutive load/store, this function pairs them
26268    into LDP/STP after adjusting the offset.  It depends on the fact
26269    that the operands can be sorted so the offsets are correct for STP.
26270    MODE is the mode of memory operands.  CODE is the rtl operator
26271    which should be applied to all memory operands, it's SIGN_EXTEND,
26272    ZERO_EXTEND or UNKNOWN.  */
26273 
26274 bool
26275 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26276 			     machine_mode mode, RTX_CODE code)
26277 {
26278   rtx base, offset_1, offset_3, t1, t2;
26279   rtx mem_1, mem_2, mem_3, mem_4;
26280   rtx temp_operands[8];
26281   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26282 		stp_off_upper_limit, stp_off_lower_limit, msize;
26283 
26284   /* We make changes on a copy as we may still bail out.  */
26285   for (int i = 0; i < 8; i ++)
26286     temp_operands[i] = operands[i];
26287 
26288   /* Sort the operands.  Note for cases as below:
26289        [base + 0x310] = A
26290        [base + 0x320] = B
26291        [base + 0x330] = C
26292        [base + 0x320] = D
26293      We need stable sorting, otherwise wrong data may be stored to offset 0x320.
26294      Also note the dead store in the above case should be optimized away, but
26295      there is no guarantee here.  */
26296   gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
26297 		 aarch64_ldrstr_offset_compare);
26298 
26299   /* Copy the memory operands so that if we have to bail for some
26300      reason the original addresses are unchanged.  */
26301   if (load)
26302     {
26303       mem_1 = copy_rtx (temp_operands[1]);
26304       mem_2 = copy_rtx (temp_operands[3]);
26305       mem_3 = copy_rtx (temp_operands[5]);
26306       mem_4 = copy_rtx (temp_operands[7]);
26307     }
26308   else
26309     {
26310       mem_1 = copy_rtx (temp_operands[0]);
26311       mem_2 = copy_rtx (temp_operands[2]);
26312       mem_3 = copy_rtx (temp_operands[4]);
26313       mem_4 = copy_rtx (temp_operands[6]);
26314       gcc_assert (code == UNKNOWN);
26315     }
26316 
26317   extract_base_offset_in_addr (mem_1, &base, &offset_1);
26318   extract_base_offset_in_addr (mem_3, &base, &offset_3);
26319   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26320 	      && offset_3 != NULL_RTX);
26321 
26322   /* Adjust offset so it can fit in LDP/STP instruction.  */
26323   msize = GET_MODE_SIZE (mode).to_constant();
26324   stp_off_upper_limit = msize * (0x40 - 1);
26325   stp_off_lower_limit = - msize * 0x40;
26326 
26327   off_val_1 = INTVAL (offset_1);
26328   off_val_3 = INTVAL (offset_3);
26329 
26330   /* The base offset is optimally half way between the two STP/LDP offsets.  */
26331   if (msize <= 4)
26332     base_off = (off_val_1 + off_val_3) / 2;
26333   else
26334     /* However, due to issues with negative LDP/STP offset generation for
26335        larger modes (DF, DI and vector modes), we must not use negative
26336        addresses smaller than 9 signed unadjusted bits can store.  This
26337        provides the most range in this case.  */
26338     base_off = off_val_1;
26339 
26340   /* Adjust the base so that it is aligned with the addresses but still
26341      optimal.  */
26342   if (base_off % msize != off_val_1 % msize)
26343     /* Fix the offset, bearing in mind we want to make it bigger not
26344        smaller.  */
26345     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26346   else if (msize <= 4)
26347     /* The negative range of LDP/STP is one larger than the positive range.  */
26348     base_off += msize;
26349 
26350   /* Check if base offset is too big or too small.  We can attempt to resolve
26351      this issue by setting it to the maximum value and seeing if the offsets
26352      still fit.  */
26353   if (base_off >= 0x1000)
26354     {
26355       base_off = 0x1000 - 1;
26356       /* We must still make sure that the base offset is aligned with respect
26357 	 to the address.  But it may not be made any bigger.  */
26358       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26359     }
26360 
26361   /* Likewise for the case where the base is too small.  */
26362   if (base_off <= -0x1000)
26363     {
26364       base_off = -0x1000 + 1;
26365       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26366     }
26367 
26368   /* Offset of the first STP/LDP.  */
26369   new_off_1 = off_val_1 - base_off;
26370 
26371   /* Offset of the second STP/LDP.  */
26372   new_off_3 = off_val_3 - base_off;
26373 
26374   /* The offsets must be within the range of the LDP/STP instructions.  */
26375   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26376       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26377     return false;
26378 
26379   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26380 						  new_off_1), true);
26381   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26382 						  new_off_1 + msize), true);
26383   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26384 						  new_off_3), true);
26385   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26386 						  new_off_3 + msize), true);
26387 
26388   if (!aarch64_mem_pair_operand (mem_1, mode)
26389       || !aarch64_mem_pair_operand (mem_3, mode))
26390     return false;
26391 
26392   if (code == ZERO_EXTEND)
26393     {
26394       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26395       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26396       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26397       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26398     }
26399   else if (code == SIGN_EXTEND)
26400     {
26401       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26402       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26403       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26404       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26405     }
26406 
26407   if (load)
26408     {
26409       operands[0] = temp_operands[0];
26410       operands[1] = mem_1;
26411       operands[2] = temp_operands[2];
26412       operands[3] = mem_2;
26413       operands[4] = temp_operands[4];
26414       operands[5] = mem_3;
26415       operands[6] = temp_operands[6];
26416       operands[7] = mem_4;
26417     }
26418   else
26419     {
26420       operands[0] = mem_1;
26421       operands[1] = temp_operands[1];
26422       operands[2] = mem_2;
26423       operands[3] = temp_operands[3];
26424       operands[4] = mem_3;
26425       operands[5] = temp_operands[5];
26426       operands[6] = mem_4;
26427       operands[7] = temp_operands[7];
26428     }
26429 
26430   /* Emit adjusting instruction.  */
26431   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26432   /* Emit ldp/stp instructions.  */
26433   t1 = gen_rtx_SET (operands[0], operands[1]);
26434   t2 = gen_rtx_SET (operands[2], operands[3]);
26435   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26436   t1 = gen_rtx_SET (operands[4], operands[5]);
26437   t2 = gen_rtx_SET (operands[6], operands[7]);
26438   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26439   return true;
26440 }
26441 
26442 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
26443    it isn't worth branching around empty masked ops (including masked
26444    stores).  */
26445 
26446 static bool
26447 aarch64_empty_mask_is_expensive (unsigned)
26448 {
26449   return false;
26450 }
26451 
26452 /* Return 1 if a pseudo register should be created and used to hold
26453    the GOT address for PIC code.  */
26454 
26455 bool
26456 aarch64_use_pseudo_pic_reg (void)
26457 {
26458   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26459 }
26460 
26461 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
26462 
26463 static int
26464 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26465 {
26466   switch (XINT (x, 1))
26467     {
26468     case UNSPEC_GOTSMALLPIC:
26469     case UNSPEC_GOTSMALLPIC28K:
26470     case UNSPEC_GOTTINYPIC:
26471       return 0;
26472     default:
26473       break;
26474     }
26475 
26476   return default_unspec_may_trap_p (x, flags);
26477 }
26478 
26479 
26480 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26481    return the log2 of that value.  Otherwise return -1.  */
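/* For example, 8.0 yields 3 and 1.0 yields 0, while 0.75 and -4.0 (not
   positive integral powers of 2) yield -1.  */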
26482 
26483 int
26484 aarch64_fpconst_pow_of_2 (rtx x)
26485 {
26486   const REAL_VALUE_TYPE *r;
26487 
26488   if (!CONST_DOUBLE_P (x))
26489     return -1;
26490 
26491   r = CONST_DOUBLE_REAL_VALUE (x);
26492 
26493   if (REAL_VALUE_NEGATIVE (*r)
26494       || REAL_VALUE_ISNAN (*r)
26495       || REAL_VALUE_ISINF (*r)
26496       || !real_isinteger (r, DFmode))
26497     return -1;
26498 
26499   return exact_log2 (real_to_integer (r));
26500 }
26501 
26502 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26503    power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
26504    return n. Otherwise return -1.  */
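/* For example, 0.125 (1/2^3) yields 3 and 0.5 yields 1, while 1.0 and 8.0
   yield -1.  */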
26505 
26506 int
26507 aarch64_fpconst_pow2_recip (rtx x)
26508 {
26509   REAL_VALUE_TYPE r0;
26510 
26511   if (!CONST_DOUBLE_P (x))
26512     return -1;
26513 
26514   r0 = *CONST_DOUBLE_REAL_VALUE (x);
26515   if (exact_real_inverse (DFmode, &r0)
26516       && !REAL_VALUE_NEGATIVE (r0))
26517     {
26518 	int ret = exact_log2 (real_to_integer (&r0));
26519 	if (ret >= 1 && ret <= 32)
26520 	    return ret;
26521     }
26522   return -1;
26523 }
26524 
26525 /* If X is a vector of equal CONST_DOUBLE values and that value is
26526    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
26527 
26528 int
26529 aarch64_vec_fpconst_pow_of_2 (rtx x)
26530 {
26531   int nelts;
26532   if (!CONST_VECTOR_P (x)
26533       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26534     return -1;
26535 
26536   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26537     return -1;
26538 
26539   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26540   if (firstval <= 0)
26541     return -1;
26542 
26543   for (int i = 1; i < nelts; i++)
26544     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26545       return -1;
26546 
26547   return firstval;
26548 }
26549 
26550 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26551    to float.
26552 
26553    __fp16 always promotes through this hook.
26554    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26555    through the generic excess precision logic rather than here.  */
26556 
26557 static tree
26558 aarch64_promoted_type (const_tree t)
26559 {
26560   if (SCALAR_FLOAT_TYPE_P (t)
26561       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26562     return float_type_node;
26563 
26564   return NULL_TREE;
26565 }
26566 
26567 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
26568 
26569 static bool
26570 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26571 			   optimization_type opt_type)
26572 {
26573   switch (op)
26574     {
26575     case rsqrt_optab:
26576       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26577 
26578     default:
26579       return true;
26580     }
26581 }
26582 
26583 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
26584 
26585 static unsigned int
26586 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26587 					int *offset)
26588 {
26589   /* Polynomial invariant 1 == (VG / 2) - 1.  */
26590   gcc_assert (i == 1);
26591   *factor = 2;
26592   *offset = 1;
26593   return AARCH64_DWARF_VG;
26594 }
26595 
26596 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26597    if MODE is HFmode, and punt to the generic implementation otherwise.  */
26598 
26599 static bool
26600 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26601 {
26602   return (mode == HFmode
26603 	  ? true
26604 	  : default_libgcc_floating_mode_supported_p (mode));
26605 }
26606 
26607 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26608    if MODE is HFmode, and punt to the generic implementation otherwise.  */
26609 
26610 static bool
26611 aarch64_scalar_mode_supported_p (scalar_mode mode)
26612 {
26613   return (mode == HFmode
26614 	  ? true
26615 	  : default_scalar_mode_supported_p (mode));
26616 }
26617 
26618 /* Set the value of FLT_EVAL_METHOD.
26619    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26620 
26621     0: evaluate all operations and constants, whose semantic type has at
26622        most the range and precision of type float, to the range and
26623        precision of float; evaluate all other operations and constants to
26624        the range and precision of the semantic type;
26625 
26626     N, where _FloatN is a supported interchange floating type
26627        evaluate all operations and constants, whose semantic type has at
26628        most the range and precision of _FloatN type, to the range and
26629        precision of the _FloatN type; evaluate all other operations and
26630        constants to the range and precision of the semantic type;
26631 
26632    If we have the ARMv8.2-A extensions then we support _Float16 in native
26633    precision, so we should set this to 16.  Otherwise, we support the type,
26634    but want to evaluate expressions in float precision, so set this to
26635    0.  */
26636 
26637 static enum flt_eval_method
26638 aarch64_excess_precision (enum excess_precision_type type)
26639 {
26640   switch (type)
26641     {
26642       case EXCESS_PRECISION_TYPE_FAST:
26643       case EXCESS_PRECISION_TYPE_STANDARD:
26644 	/* We can calculate either in 16-bit range and precision or
26645 	   32-bit range and precision.  Make that decision based on whether
26646 	   we have native support for the ARMv8.2-A 16-bit floating-point
26647 	   instructions or not.  */
26648 	return (TARGET_FP_F16INST
26649 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26650 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26651       case EXCESS_PRECISION_TYPE_IMPLICIT:
26652       case EXCESS_PRECISION_TYPE_FLOAT16:
26653 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26654       default:
26655 	gcc_unreachable ();
26656     }
26657   return FLT_EVAL_METHOD_UNPREDICTABLE;
26658 }
26659 
26660 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
26661    scheduled for speculative execution.  Reject the long-running division
26662    and square-root instructions.  */
26663 
26664 static bool
26665 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26666 {
26667   switch (get_attr_type (insn))
26668     {
26669       case TYPE_SDIV:
26670       case TYPE_UDIV:
26671       case TYPE_FDIVS:
26672       case TYPE_FDIVD:
26673       case TYPE_FSQRTS:
26674       case TYPE_FSQRTD:
26675       case TYPE_NEON_FP_SQRT_S:
26676       case TYPE_NEON_FP_SQRT_D:
26677       case TYPE_NEON_FP_SQRT_S_Q:
26678       case TYPE_NEON_FP_SQRT_D_Q:
26679       case TYPE_NEON_FP_DIV_S:
26680       case TYPE_NEON_FP_DIV_D:
26681       case TYPE_NEON_FP_DIV_S_Q:
26682       case TYPE_NEON_FP_DIV_D_Q:
26683 	return false;
26684       default:
26685 	return true;
26686     }
26687 }
26688 
26689 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
26690 
26691 static int
26692 aarch64_compute_pressure_classes (reg_class *classes)
26693 {
26694   int i = 0;
26695   classes[i++] = GENERAL_REGS;
26696   classes[i++] = FP_REGS;
26697   /* PR_REGS isn't a useful pressure class because many predicate pseudo
26698      registers need to go in PR_LO_REGS at some point during their
26699      lifetime.  Splitting it into two halves has the effect of making
26700      all predicates count against PR_LO_REGS, so that we try whenever
26701      possible to restrict the number of live predicates to 8.  This
26702      greatly reduces the amount of spilling in certain loops.  */
26703   classes[i++] = PR_LO_REGS;
26704   classes[i++] = PR_HI_REGS;
26705   return i;
26706 }
26707 
26708 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
26709 
26710 static bool
26711 aarch64_can_change_mode_class (machine_mode from,
26712 			       machine_mode to, reg_class_t)
26713 {
26714   unsigned int from_flags = aarch64_classify_vector_mode (from);
26715   unsigned int to_flags = aarch64_classify_vector_mode (to);
26716 
26717   bool from_sve_p = (from_flags & VEC_ANY_SVE);
26718   bool to_sve_p = (to_flags & VEC_ANY_SVE);
26719 
26720   bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26721   bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26722 
26723   bool from_pred_p = (from_flags & VEC_SVE_PRED);
26724   bool to_pred_p = (to_flags & VEC_SVE_PRED);
26725 
26726   bool from_full_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT));
26727   bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26728 						   | VEC_PARTIAL));
26729 
26730   /* Don't allow changes between predicate modes and other modes.
26731      Only predicate registers can hold predicate modes and only
26732      non-predicate registers can hold non-predicate modes, so any
26733      attempt to mix them would require a round trip through memory.  */
26734   if (from_pred_p != to_pred_p)
26735     return false;
26736 
26737   /* Don't allow changes between partial SVE modes and other modes.
26738      The contents of partial SVE modes are distributed evenly across
26739      the register, whereas GCC expects them to be clustered together.  */
26740   if (from_partial_sve_p != to_partial_sve_p)
26741     return false;
26742 
26743   /* Similarly reject changes between partial SVE modes that have
26744      different patterns of significant and insignificant bits.  */
26745   if (from_partial_sve_p
26746       && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26747 	  || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26748     return false;
26749 
26750   /* Don't allow changes between partial and full Advanced SIMD structure
26751      modes.  */
26752   if (from_full_advsimd_struct_p && to_partial_advsimd_struct_p)
26753     return false;
26754 
26755   if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26756     {
26757       /* Don't allow changes between SVE modes and other modes that might
26758 	 be bigger than 128 bits.  In particular, OImode, CImode and XImode
26759 	 divide into 128-bit quantities while SVE modes divide into
26760 	 BITS_PER_SVE_VECTOR quantities.  */
26761       if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26762 	return false;
26763       if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26764 	return false;
26765     }
26766 
26767   if (BYTES_BIG_ENDIAN)
26768     {
26769       /* Don't allow changes between SVE data modes and non-SVE modes.
26770 	 See the comment at the head of aarch64-sve.md for details.  */
26771       if (from_sve_p != to_sve_p)
26772 	return false;
26773 
26774       /* Don't allow changes in element size: lane 0 of the new vector
26775 	 would not then be lane 0 of the old vector.  See the comment
26776 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26777 	 description.
26778 
26779 	 In the worst case, this forces a register to be spilled in
26780 	 one mode and reloaded in the other, which handles the
26781 	 endianness correctly.  */
26782       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26783 	return false;
26784     }
26785   return true;
26786 }
26787 
26788 /* Implement TARGET_EARLY_REMAT_MODES.  */
26789 
26790 static void
26791 aarch64_select_early_remat_modes (sbitmap modes)
26792 {
26793   /* SVE values are not normally live across a call, so it should be
26794      worth doing early rematerialization even in VL-specific mode.  */
26795   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26796     if (aarch64_sve_mode_p ((machine_mode) i))
26797       bitmap_set_bit (modes, i);
26798 }
26799 
26800 /* Override the default target speculation_safe_value.  */
26801 static rtx
26802 aarch64_speculation_safe_value (machine_mode mode,
26803 				rtx result, rtx val, rtx failval)
26804 {
26805   /* Maybe we should warn if falling back to hard barriers.  They are
26806      likely to be noticeably more expensive than the alternative below.  */
26807   if (!aarch64_track_speculation)
26808     return default_speculation_safe_value (mode, result, val, failval);
26809 
26810   if (!REG_P (val))
26811     val = copy_to_mode_reg (mode, val);
26812 
26813   if (!aarch64_reg_or_zero (failval, mode))
26814     failval = copy_to_mode_reg (mode, failval);
26815 
26816   emit_insn (gen_despeculate_copy (mode, result, val, failval));
26817   return result;
26818 }
26819 
26820 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26821    Look into the tuning structure for an estimate.
26822    KIND specifies the type of requested estimate: min, max or likely.
26823    For cores with a known SVE width all three estimates are the same.
26824    For generic SVE tuning we want to distinguish the maximum estimate from
26825    the minimum and likely ones.
26826    The likely estimate is the same as the minimum in that case to give a
26827    conservative behavior of auto-vectorizing with SVE when it is a win
26828    even for 128-bit SVE.
26829    When SVE width information is available VAL.coeffs[1] is multiplied by
26830    the number of VQ chunks over the initial Advanced SIMD 128 bits.  */
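/* As an illustration (numbers chosen for the example), if the tuning
   structure reports an SVE width of 256 bits, the likely estimate of the
   poly_int 16 + 16x (the number of bytes in an SVE vector) is
   16 + 16 * (256 - 128) / 128 = 32, i.e. a 256-bit vector.  */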
26831 
26832 static HOST_WIDE_INT
26833 aarch64_estimated_poly_value (poly_int64 val,
26834 			      poly_value_estimate_kind kind
26835 				= POLY_VALUE_LIKELY)
26836 {
26837   unsigned int width_source = aarch64_tune_params.sve_width;
26838 
26839   /* If there is no core-specific information then the minimum and likely
26840      values are based on 128-bit vectors and the maximum is based on
26841      the architectural maximum of 2048 bits.  */
26842   if (width_source == SVE_SCALABLE)
26843     switch (kind)
26844       {
26845       case POLY_VALUE_MIN:
26846       case POLY_VALUE_LIKELY:
26847 	return val.coeffs[0];
26848       case POLY_VALUE_MAX:
26849 	  return val.coeffs[0] + val.coeffs[1] * 15;
26850       }
26851 
26852   /* Allow sve_width to be a bitmask of different VL, treating the lowest
26853      as likely.  This could be made more general if future -mtune options
26854      need it to be.  */
26855   if (kind == POLY_VALUE_MAX)
26856     width_source = 1 << floor_log2 (width_source);
26857   else
26858     width_source = least_bit_hwi (width_source);
26859 
26860   /* If the core provides width information, use that.  */
26861   HOST_WIDE_INT over_128 = width_source - 128;
26862   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
26863 }
26864 
26865 
26866 /* Return true for types that could be supported as SIMD return or
26867    argument types.  */
26868 
26869 static bool
26870 supported_simd_type (tree t)
26871 {
26872   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26873     {
26874       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26875       return s == 1 || s == 2 || s == 4 || s == 8;
26876     }
26877   return false;
26878 }
26879 
26880 /* Return true for types that currently are supported as SIMD return
26881    or argument types.  */
26882 
26883 static bool
26884 currently_supported_simd_type (tree t, tree b)
26885 {
26886   if (COMPLEX_FLOAT_TYPE_P (t))
26887     return false;
26888 
26889   if (TYPE_SIZE (t) != TYPE_SIZE (b))
26890     return false;
26891 
26892   return supported_simd_type (t);
26893 }
26894 
26895 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
26896 
26897 static int
26898 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26899 					struct cgraph_simd_clone *clonei,
26900 					tree base_type, int num)
26901 {
26902   tree t, ret_type;
26903   unsigned int elt_bits, count = 0;
26904   unsigned HOST_WIDE_INT const_simdlen;
26905   poly_uint64 vec_bits;
26906 
26907   if (!TARGET_SIMD)
26908     return 0;
26909 
26910   /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26911      constant simdlens here.  */
26912   if (maybe_ne (clonei->simdlen, 0U)
26913       && clonei->simdlen.is_constant (&const_simdlen)
26914       && (const_simdlen < 2
26915 	  || const_simdlen > 1024
26916 	  || (const_simdlen & (const_simdlen - 1)) != 0))
26917     {
26918       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26919 		  "unsupported simdlen %wd", const_simdlen);
26920       return 0;
26921     }
26922 
26923   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26924   if (TREE_CODE (ret_type) != VOID_TYPE
26925       && !currently_supported_simd_type (ret_type, base_type))
26926     {
26927       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26928 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26929 		    "GCC does not currently support mixed size types "
26930 		    "for %<simd%> functions");
26931       else if (supported_simd_type (ret_type))
26932 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26933 		    "GCC does not currently support return type %qT "
26934 		    "for %<simd%> functions", ret_type);
26935       else
26936 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26937 		    "unsupported return type %qT for %<simd%> functions",
26938 		    ret_type);
26939       return 0;
26940     }
26941 
26942   int i;
26943   tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26944   bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26945 
26946   for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26947        t && t != void_list_node; t = TREE_CHAIN (t), i++)
26948     {
26949       tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26950 
26951       if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26952 	  && !currently_supported_simd_type (arg_type, base_type))
26953 	{
26954 	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26955 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26956 			"GCC does not currently support mixed size types "
26957 			"for %<simd%> functions");
26958 	  else
26959 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26960 			"GCC does not currently support argument type %qT "
26961 			"for %<simd%> functions", arg_type);
26962 	  return 0;
26963 	}
26964     }
26965 
26966   clonei->vecsize_mangle = 'n';
26967   clonei->mask_mode = VOIDmode;
26968   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
26969   if (known_eq (clonei->simdlen, 0U))
26970     {
26971       /* We don't support simdlen == 1.  */
26972       if (known_eq (elt_bits, 64))
26973 	{
26974 	  count = 1;
26975 	  vec_bits = 128;
26976 	}
26977       else
26978 	{
26979 	  count = 2;
26980 	  vec_bits = (num == 0 ? 64 : 128);
26981 	}
26982       clonei->simdlen = exact_div (vec_bits, elt_bits);
26983     }
26984   else
26985     {
26986       count = 1;
26987       vec_bits = clonei->simdlen * elt_bits;
26988       /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26989 	 constant simdlens here.  */
26990       if (clonei->simdlen.is_constant (&const_simdlen)
26991 	  && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
26992 	{
26993 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26994 		      "GCC does not currently support simdlen %wd for type %qT",
26995 		      const_simdlen, base_type);
26996 	  return 0;
26997 	}
26998     }
26999 
27000   clonei->vecsize_int = vec_bits;
27001   clonei->vecsize_float = vec_bits;
27002   return count;
27003 }
27004 
27005 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
27006 
27007 static void
27008 aarch64_simd_clone_adjust (struct cgraph_node *node)
27009 {
27010   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27011      use the correct ABI.  */
27012 
27013   tree t = TREE_TYPE (node->decl);
27014   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27015 					TYPE_ATTRIBUTES (t));
27016 }
27017 
27018 /* Implement TARGET_SIMD_CLONE_USABLE.  */
27019 
27020 static int
27021 aarch64_simd_clone_usable (struct cgraph_node *node)
27022 {
27023   switch (node->simdclone->vecsize_mangle)
27024     {
27025     case 'n':
27026       if (!TARGET_SIMD)
27027 	return -1;
27028       return 0;
27029     default:
27030       gcc_unreachable ();
27031     }
27032 }
27033 
27034 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27035 
27036 static int
27037 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27038 {
27039   auto check_attr = [&](const char *name) {
27040     tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27041     tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27042     if (!attr1 && !attr2)
27043       return true;
27044 
27045     return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27046   };
27047 
27048   if (!check_attr ("aarch64_vector_pcs"))
27049     return 0;
27050   if (!check_attr ("Advanced SIMD type"))
27051     return 0;
27052   if (!check_attr ("SVE type"))
27053     return 0;
27054   if (!check_attr ("SVE sizeless type"))
27055     return 0;
27056   return 1;
27057 }
27058 
27059 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27060 
27061 static const char *
27062 aarch64_get_multilib_abi_name (void)
27063 {
27064   if (TARGET_BIG_END)
27065     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27066   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27067 }
27068 
27069 /* Implement TARGET_STACK_PROTECT_GUARD.  In the case of a
27070    global variable based guard, use the default; otherwise
27071    return a null tree.  */
27072 static tree
27073 aarch64_stack_protect_guard (void)
27074 {
27075   if (aarch64_stack_protector_guard == SSP_GLOBAL)
27076     return default_stack_protect_guard ();
27077 
27078   return NULL_TREE;
27079 }
27080 
27081 /* Return the diagnostic message string if conversion from FROMTYPE to
27082    TOTYPE is not allowed, NULL otherwise.  */
27083 
27084 static const char *
27085 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
27086 {
27087   if (element_mode (fromtype) != element_mode (totype))
27088     {
27089       /* Do not allow conversions to/from BFmode scalar types.  */
27090       if (TYPE_MODE (fromtype) == BFmode)
27091 	return N_("invalid conversion from type %<bfloat16_t%>");
27092       if (TYPE_MODE (totype) == BFmode)
27093 	return N_("invalid conversion to type %<bfloat16_t%>");
27094     }
27095 
27096   /* Conversion allowed.  */
27097   return NULL;
27098 }
27099 
27100 /* Return the diagnostic message string if the unary operation OP is
27101    not permitted on TYPE, NULL otherwise.  */
27102 
27103 static const char *
27104 aarch64_invalid_unary_op (int op, const_tree type)
27105 {
27106   /* Reject all single-operand operations on BFmode except for &.  */
27107   if (element_mode (type) == BFmode && op != ADDR_EXPR)
27108     return N_("operation not permitted on type %<bfloat16_t%>");
27109 
27110   /* Operation allowed.  */
27111   return NULL;
27112 }
27113 
27114 /* Return the diagnostic message string if the binary operation OP is
27115    not permitted on TYPE1 and TYPE2, NULL otherwise.  */
27116 
27117 static const char *
27118 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27119 			   const_tree type2)
27120 {
27121   /* Reject all 2-operand operations on BFmode.  */
27122   if (element_mode (type1) == BFmode
27123       || element_mode (type2) == BFmode)
27124     return N_("operation not permitted on type %<bfloat16_t%>");
27125 
27126   if (VECTOR_TYPE_P (type1)
27127       && VECTOR_TYPE_P (type2)
27128       && !TYPE_INDIVISIBLE_P (type1)
27129       && !TYPE_INDIVISIBLE_P (type2)
27130       && (aarch64_sve::builtin_type_p (type1)
27131 	  != aarch64_sve::builtin_type_p (type2)))
27132     return N_("cannot combine GNU and SVE vectors in a binary operation");
27133 
27134   /* Operation allowed.  */
27135   return NULL;
27136 }
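
/* For illustration, the three hooks above reject code such as

     bfloat16_t x, y;
     float f = (float) x;   // rejected: conversion from bfloat16_t
     x = x + y;             // rejected: binary operation on bfloat16_t

   while still allowing the address of a bfloat16_t object to be taken,
   and they forbid mixing GNU vector types with SVE builtin types in a
   single binary operation.  */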
27137 
27138 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES.  Here we tell the rest of the
27139    compiler that we automatically ignore the top byte of our pointers, which
27140    allows using -fsanitize=hwaddress.  */
27141 bool
27142 aarch64_can_tag_addresses ()
27143 {
27144   return !TARGET_ILP32;
27145 }
27146 
27147 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
27148    section at the end if needed.  */
27149 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
27150 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
27151 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
27152 void
27153 aarch64_file_end_indicate_exec_stack ()
27154 {
27155   file_end_indicate_exec_stack ();
27156 
27157   unsigned feature_1_and = 0;
27158   if (aarch64_bti_enabled ())
27159     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27160 
27161   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
27162     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27163 
27164   if (feature_1_and)
27165     {
27166       /* Generate .note.gnu.property section.  */
27167       switch_to_section (get_section (".note.gnu.property",
27168 				      SECTION_NOTYPE, NULL));
27169 
27170       /* PT_NOTE header: namesz, descsz, type.
27171 	 namesz = 4 ("GNU\0")
27172 	 descsz = 16 (Size of the program property array)
27173 		  [(12 + padding) * Number of array elements]
27174 	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
27175       assemble_align (POINTER_SIZE);
27176       assemble_integer (GEN_INT (4), 4, 32, 1);
27177       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27178       assemble_integer (GEN_INT (5), 4, 32, 1);
27179 
27180       /* PT_NOTE name.  */
27181       assemble_string ("GNU", 4);
27182 
27183       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27184 	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27185 	 datasz = 4
27186 	 data   = feature_1_and.  */
27187       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27188       assemble_integer (GEN_INT (4), 4, 32, 1);
27189       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27190 
27191       /* Pad the size of the note to the required alignment.  */
27192       assemble_align (POINTER_SIZE);
27193     }
27194 }
27195 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27196 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27197 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
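
/* For reference, with both BTI and PAC-RET enabled the note emitted above
   contains a single NT_GNU_PROPERTY_TYPE_0 entry that looks roughly like
   this on LP64 (one 32-bit word per line, padded to 8-byte alignment):

	4		namesz ("GNU\0")
	16		descsz (12 rounded up to POINTER_BYTES)
	5		NT_GNU_PROPERTY_TYPE_0
	"GNU\0"		name
	0xc0000000	GNU_PROPERTY_AARCH64_FEATURE_1_AND
	4		datasz
	3		feature_1_and (BTI | PAC)
	(padding)

   The linker merges these notes when deciding whether the final object
   can be marked as BTI/PAC compatible.  */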
27198 
27199 /* Helper function for straight line speculation.
27200    Return what barrier should be emitted for straight line speculation
27201    mitigation.
27202    When not mitigating against straight line speculation this function returns
27203    an empty string.
27204    When mitigating against straight line speculation, use:
27205    * SB when the v8.5-A SB extension is enabled.
27206    * DSB+ISB otherwise.  */
27207 const char *
27208 aarch64_sls_barrier (int mitigation_required)
27209 {
27210   return mitigation_required
27211     ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27212     : "";
27213 }
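
/* For illustration: when SLS hardening of returns and indirect branches is
   enabled, the string returned above is placed directly after the
   vulnerable instruction, so a plain return becomes either

	ret
	sb

   when the SB extension is available, or otherwise

	ret
	dsb	sy
	isb

   where the barrier is never reached architecturally.  */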
27214 
27215 static GTY (()) tree aarch64_sls_shared_thunks[30];
27216 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27217 const char *indirect_symbol_names[30] = {
27218     "__call_indirect_x0",
27219     "__call_indirect_x1",
27220     "__call_indirect_x2",
27221     "__call_indirect_x3",
27222     "__call_indirect_x4",
27223     "__call_indirect_x5",
27224     "__call_indirect_x6",
27225     "__call_indirect_x7",
27226     "__call_indirect_x8",
27227     "__call_indirect_x9",
27228     "__call_indirect_x10",
27229     "__call_indirect_x11",
27230     "__call_indirect_x12",
27231     "__call_indirect_x13",
27232     "__call_indirect_x14",
27233     "__call_indirect_x15",
27234     "", /* "__call_indirect_x16",  */
27235     "", /* "__call_indirect_x17",  */
27236     "__call_indirect_x18",
27237     "__call_indirect_x19",
27238     "__call_indirect_x20",
27239     "__call_indirect_x21",
27240     "__call_indirect_x22",
27241     "__call_indirect_x23",
27242     "__call_indirect_x24",
27243     "__call_indirect_x25",
27244     "__call_indirect_x26",
27245     "__call_indirect_x27",
27246     "__call_indirect_x28",
27247     "__call_indirect_x29",
27248 };
27249 
27250 /* Function to create a BLR thunk.  This thunk is used to mitigate straight
27251    line speculation.  Instead of a simple BLR that can be speculated past,
27252    we emit a BL to this thunk, and this thunk contains a BR to the relevant
27253    register.  These thunks have the relevant speculation barriers put after
27254    their indirect branch so that speculation is blocked.
27255 
27256    We use such a thunk so the speculation barriers are kept off the
27257    architecturally executed path in order to reduce the performance overhead.
27258 
27259    When optimizing for size we use stubs shared by the linked object.
27260    When optimizing for performance we emit stubs for each function in the hope
27261    that the branch predictor can better train on jumps specific for a given
27262    function.  */
27263 rtx
27264 aarch64_sls_create_blr_label (int regnum)
27265 {
27266   gcc_assert (STUB_REGNUM_P (regnum));
27267   if (optimize_function_for_size_p (cfun))
27268     {
27269       /* For the thunks shared between different functions in this compilation
27270 	 unit we use a named symbol -- this is just for users to more easily
27271 	 understand the generated assembly.  */
27272       aarch64_sls_shared_thunks_needed = true;
27273       const char *thunk_name = indirect_symbol_names[regnum];
27274       if (aarch64_sls_shared_thunks[regnum] == NULL)
27275 	{
27276 	  /* Build a decl representing this function stub and record it for
27277 	     later.  We build a decl here so we can use the GCC machinery for
27278 	     handling sections automatically (through `get_named_section` and
27279 	     `make_decl_one_only`).  That saves us a lot of trouble handling
27280 	     the specifics of different output file formats.  */
27281 	  tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27282 				  get_identifier (thunk_name),
27283 				  build_function_type_list (void_type_node,
27284 							    NULL_TREE));
27285 	  DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27286 					   NULL_TREE, void_type_node);
27287 	  TREE_PUBLIC (decl) = 1;
27288 	  TREE_STATIC (decl) = 1;
27289 	  DECL_IGNORED_P (decl) = 1;
27290 	  DECL_ARTIFICIAL (decl) = 1;
27291 	  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27292 	  resolve_unique_section (decl, 0, false);
27293 	  aarch64_sls_shared_thunks[regnum] = decl;
27294 	}
27295 
27296       return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27297     }
27298 
27299   if (cfun->machine->call_via[regnum] == NULL)
27300     cfun->machine->call_via[regnum]
27301       = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27302   return cfun->machine->call_via[regnum];
27303 }
27304 
27305 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27306    aarch64_sls_emit_shared_blr_thunks below.  */
27307 static void
27308 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27309 {
27310   /* Save in x16 and branch to that function so this transformation does
27311      not prevent jumping to `BTI c` instructions.  */
27312   asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27313   asm_fprintf (out_file, "\tbr\tx16\n");
27314 }
27315 
27316 /* Emit all BLR stubs for this particular function.
27317    Here we emit all the BLR stubs needed for the current function.  Since we
27318    emit these stubs in a consecutive block we know there will be no speculation
27319    gadgets between each stub, and hence we only emit a speculation barrier at
27320    the end of the stub sequences.
27321 
27322    This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook.  */
27323 void
27324 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27325 {
27326   if (! aarch64_harden_sls_blr_p ())
27327     return;
27328 
27329   bool any_functions_emitted = false;
27330   /* We must save and restore the current function section since this assembly
27331      is emitted at the end of the function.  This means it can be emitted *just
27332      after* the cold section of a function.  That cold part would be emitted in
27333      a different section.  That switch would trigger a `.cfi_endproc` directive
27334      to be emitted in the original section and a `.cfi_startproc` directive to
27335      be emitted in the new section.  Switching to the original section without
27336      restoring would mean that the `.cfi_endproc` emitted as a function ends
27337      would happen in a different section -- leaving an unmatched
27338      `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27339      in the standard text section.  */
27340   section *save_text_section = in_section;
27341   switch_to_section (function_section (current_function_decl));
27342   for (int regnum = 0; regnum < 30; ++regnum)
27343     {
27344       rtx specu_label = cfun->machine->call_via[regnum];
27345       if (specu_label == NULL)
27346 	continue;
27347 
27348       targetm.asm_out.print_operand (out_file, specu_label, 0);
27349       asm_fprintf (out_file, ":\n");
27350       aarch64_sls_emit_function_stub (out_file, regnum);
27351       any_functions_emitted = true;
27352     }
27353   if (any_functions_emitted)
27354     /* Can use the SB if need be here, since this stub will only be used
27355        by the current function, and hence for the current target.  */
27356     asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27357   switch_to_section (save_text_section);
27358 }
27359 
27360 /* Emit shared BLR stubs for the current compilation unit.
27361    Over the course of compiling this unit we may have converted some BLR
27362    instructions to a BL to a shared stub function.  This is where we emit those
27363    stub functions.
27364    This function is for the stubs shared between different functions in this
27365    compilation unit.  We share when optimizing for size instead of speed.
27366 
27367    This function is called through the TARGET_ASM_FILE_END hook.  */
27368 void
27369 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27370 {
27371   if (! aarch64_sls_shared_thunks_needed)
27372     return;
27373 
27374   for (int regnum = 0; regnum < 30; ++regnum)
27375     {
27376       tree decl = aarch64_sls_shared_thunks[regnum];
27377       if (!decl)
27378 	continue;
27379 
27380       const char *name = indirect_symbol_names[regnum];
27381       switch_to_section (get_named_section (decl, NULL, 0));
27382       ASM_OUTPUT_ALIGN (out_file, 2);
27383       targetm.asm_out.globalize_label (out_file, name);
27384       /* Only emits if the compiler is configured for an assembler that can
27385 	 handle visibility directives.  */
27386       targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27387       ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27388       ASM_OUTPUT_LABEL (out_file, name);
27389       aarch64_sls_emit_function_stub (out_file, regnum);
27390       /* Use the barrier sequence for the most conservative target to ensure
27391 	 the stub can always be used by any function in the translation unit.  */
27392       asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27393       ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27394     }
27395 }
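
/* For reference, a shared stub emitted above looks roughly like this
   (here for x1, modulo the exact section, alignment and visibility
   directives used by the configured assembler):

	.global	__call_indirect_x1
	.hidden	__call_indirect_x1
	.type	__call_indirect_x1, %function
   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb
	.size	__call_indirect_x1, .-__call_indirect_x1
   */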
27396 
27397 /* Implement TARGET_ASM_FILE_END.  */
27398 void
27399 aarch64_asm_file_end ()
27400 {
27401   aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27402   /* Since this function will be called for the ASM_FILE_END hook, we ensure
27403      that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27404      for FreeBSD) still gets called.  */
27405 #ifdef TARGET_ASM_FILE_END
27406   TARGET_ASM_FILE_END ();
27407 #endif
27408 }
27409 
27410 const char *
27411 aarch64_indirect_call_asm (rtx addr)
27412 {
27413   gcc_assert (REG_P (addr));
27414   if (aarch64_harden_sls_blr_p ())
27415     {
27416       rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27417       output_asm_insn ("bl\t%0", &stub_label);
27418     }
27419   else
27420    output_asm_insn ("blr\t%0", &addr);
27421   return "";
27422 }
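
/* For illustration: with SLS hardening of BLR enabled, an indirect call
   that would normally be emitted as

	blr	x1

   is instead emitted as a direct call to the matching stub,

	bl	__call_indirect_x1

   (or to a per-function local label when not optimizing for size), so the
   speculation barrier lives in the stub rather than after every call
   site.  */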
27423 
27424 /* Target-specific selftests.  */
27425 
27426 #if CHECKING_P
27427 
27428 namespace selftest {
27429 
27430 /* Selftest for the RTL loader.
27431    Verify that the RTL loader copes with a dump from
27432    print_rtx_function.  This is essentially just a test that class
27433    function_reader can handle a real dump, but it also verifies
27434    that lookup_reg_by_dump_name correctly handles hard regs.
27435    The presence of hard reg names in the dump means that the test is
27436    target-specific, hence it is in this file.  */
27437 
27438 static void
27439 aarch64_test_loading_full_dump ()
27440 {
27441   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27442 
27443   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27444 
27445   rtx_insn *insn_1 = get_insn_by_uid (1);
27446   ASSERT_EQ (NOTE, GET_CODE (insn_1));
27447 
27448   rtx_insn *insn_15 = get_insn_by_uid (15);
27449   ASSERT_EQ (INSN, GET_CODE (insn_15));
27450   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27451 
27452   /* Verify crtl->return_rtx.  */
27453   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27454   ASSERT_EQ (0, REGNO (crtl->return_rtx));
27455   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27456 }
27457 
27458 /* Test the fractional_cost class.  */
27459 
27460 static void
27461 aarch64_test_fractional_cost ()
27462 {
27463   using cf = fractional_cost;
27464 
27465   ASSERT_EQ (cf (0, 20), 0);
27466 
27467   ASSERT_EQ (cf (4, 2), 2);
27468   ASSERT_EQ (3, cf (9, 3));
27469 
27470   ASSERT_NE (cf (5, 2), 2);
27471   ASSERT_NE (3, cf (8, 3));
27472 
27473   ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27474   ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27475   ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27476 
27477   ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27478   ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27479   ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27480   ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27481   ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27482   ASSERT_EQ (3 - cf (10, 3), 0);
27483 
27484   ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27485   ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27486 
27487   ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27488   ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27489   ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27490   ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27491   ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27492   ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27493   ASSERT_TRUE (cf (239, 240) < 1);
27494   ASSERT_FALSE (cf (240, 240) < 1);
27495   ASSERT_FALSE (cf (241, 240) < 1);
27496   ASSERT_FALSE (2 < cf (207, 104));
27497   ASSERT_FALSE (2 < cf (208, 104));
27498   ASSERT_TRUE (2 < cf (209, 104));
27499 
27500   ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27501   ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27502   ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27503   ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27504   ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27505   ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27506   ASSERT_TRUE (cf (239, 240) <= 1);
27507   ASSERT_TRUE (cf (240, 240) <= 1);
27508   ASSERT_FALSE (cf (241, 240) <= 1);
27509   ASSERT_FALSE (2 <= cf (207, 104));
27510   ASSERT_TRUE (2 <= cf (208, 104));
27511   ASSERT_TRUE (2 <= cf (209, 104));
27512 
27513   ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27514   ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27515   ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27516   ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27517   ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27518   ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27519   ASSERT_FALSE (cf (239, 240) >= 1);
27520   ASSERT_TRUE (cf (240, 240) >= 1);
27521   ASSERT_TRUE (cf (241, 240) >= 1);
27522   ASSERT_TRUE (2 >= cf (207, 104));
27523   ASSERT_TRUE (2 >= cf (208, 104));
27524   ASSERT_FALSE (2 >= cf (209, 104));
27525 
27526   ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27527   ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27528   ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27529   ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27530   ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27531   ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27532   ASSERT_FALSE (cf (239, 240) > 1);
27533   ASSERT_FALSE (cf (240, 240) > 1);
27534   ASSERT_TRUE (cf (241, 240) > 1);
27535   ASSERT_TRUE (2 > cf (207, 104));
27536   ASSERT_FALSE (2 > cf (208, 104));
27537   ASSERT_FALSE (2 > cf (209, 104));
27538 
27539   ASSERT_EQ (cf (1, 2).ceil (), 1);
27540   ASSERT_EQ (cf (11, 7).ceil (), 2);
27541   ASSERT_EQ (cf (20, 1).ceil (), 20);
27542   ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27543   ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27544   ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27545   ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27546   ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27547 
27548   ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27549 }
27550 
27551 /* Run all target-specific selftests.  */
27552 
27553 static void
27554 aarch64_run_selftests (void)
27555 {
27556   aarch64_test_loading_full_dump ();
27557   aarch64_test_fractional_cost ();
27558 }
27559 
27560 } // namespace selftest
27561 
27562 #endif /* #if CHECKING_P */
27563 
27564 #undef TARGET_STACK_PROTECT_GUARD
27565 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27566 
27567 #undef TARGET_ADDRESS_COST
27568 #define TARGET_ADDRESS_COST aarch64_address_cost
27569 
27570 /* This hook determines whether unnamed bitfields affect the alignment
27571    of the containing structure.  The hook returns true if the structure
27572    should inherit the alignment requirements of an unnamed bitfield's
27573    type.  */
27574 #undef TARGET_ALIGN_ANON_BITFIELD
27575 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27576 
27577 #undef TARGET_ASM_ALIGNED_DI_OP
27578 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27579 
27580 #undef TARGET_ASM_ALIGNED_HI_OP
27581 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27582 
27583 #undef TARGET_ASM_ALIGNED_SI_OP
27584 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27585 
27586 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27587 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27588   hook_bool_const_tree_hwi_hwi_const_tree_true
27589 
27590 #undef TARGET_ASM_FILE_START
27591 #define TARGET_ASM_FILE_START aarch64_start_file
27592 
27593 #undef TARGET_ASM_OUTPUT_MI_THUNK
27594 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27595 
27596 #undef TARGET_ASM_SELECT_RTX_SECTION
27597 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27598 
27599 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27600 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27601 
27602 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27603 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27604 
27605 #undef TARGET_BUILD_BUILTIN_VA_LIST
27606 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27607 
27608 #undef TARGET_CALLEE_COPIES
27609 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27610 
27611 #undef TARGET_CAN_ELIMINATE
27612 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27613 
27614 #undef TARGET_CAN_INLINE_P
27615 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27616 
27617 #undef TARGET_CANNOT_FORCE_CONST_MEM
27618 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27619 
27620 #undef TARGET_CASE_VALUES_THRESHOLD
27621 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27622 
27623 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27624 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27625 
27626 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27627 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27628 
27629 /* Only the least significant bit is used for initialization guard
27630    variables.  */
27631 #undef TARGET_CXX_GUARD_MASK_BIT
27632 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27633 
27634 #undef TARGET_C_MODE_FOR_SUFFIX
27635 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27636 
27637 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27638 #undef  TARGET_DEFAULT_TARGET_FLAGS
27639 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27640 #endif
27641 
27642 #undef TARGET_CLASS_MAX_NREGS
27643 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27644 
27645 #undef TARGET_BUILTIN_DECL
27646 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27647 
27648 #undef TARGET_BUILTIN_RECIPROCAL
27649 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27650 
27651 #undef TARGET_C_EXCESS_PRECISION
27652 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27653 
27654 #undef  TARGET_EXPAND_BUILTIN
27655 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27656 
27657 #undef TARGET_EXPAND_BUILTIN_VA_START
27658 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27659 
27660 #undef TARGET_FOLD_BUILTIN
27661 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27662 
27663 #undef TARGET_FUNCTION_ARG
27664 #define TARGET_FUNCTION_ARG aarch64_function_arg
27665 
27666 #undef TARGET_FUNCTION_ARG_ADVANCE
27667 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27668 
27669 #undef TARGET_FUNCTION_ARG_BOUNDARY
27670 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27671 
27672 #undef TARGET_FUNCTION_ARG_PADDING
27673 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27674 
27675 #undef TARGET_GET_RAW_RESULT_MODE
27676 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27677 #undef TARGET_GET_RAW_ARG_MODE
27678 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27679 
27680 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27681 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27682 
27683 #undef TARGET_FUNCTION_VALUE
27684 #define TARGET_FUNCTION_VALUE aarch64_function_value
27685 
27686 #undef TARGET_FUNCTION_VALUE_REGNO_P
27687 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27688 
27689 #undef TARGET_GIMPLE_FOLD_BUILTIN
27690 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27691 
27692 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27693 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27694 
27695 #undef  TARGET_INIT_BUILTINS
27696 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
27697 
27698 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27699 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27700   aarch64_ira_change_pseudo_allocno_class
27701 
27702 #undef TARGET_LEGITIMATE_ADDRESS_P
27703 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27704 
27705 #undef TARGET_LEGITIMATE_CONSTANT_P
27706 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27707 
27708 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27709 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27710   aarch64_legitimize_address_displacement
27711 
27712 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27713 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27714 
27715 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27716 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27717 aarch64_libgcc_floating_mode_supported_p
27718 
27719 #undef TARGET_MANGLE_TYPE
27720 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27721 
27722 #undef TARGET_INVALID_CONVERSION
27723 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
27724 
27725 #undef TARGET_INVALID_UNARY_OP
27726 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
27727 
27728 #undef TARGET_INVALID_BINARY_OP
27729 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27730 
27731 #undef TARGET_VERIFY_TYPE_CONTEXT
27732 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27733 
27734 #undef TARGET_MEMORY_MOVE_COST
27735 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27736 
27737 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27738 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27739 
27740 #undef TARGET_MUST_PASS_IN_STACK
27741 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27742 
27743 /* This target hook should return true if accesses to volatile bitfields
27744    should use the narrowest mode possible.  It should return false if these
27745    accesses should use the bitfield container type.  */
27746 #undef TARGET_NARROW_VOLATILE_BITFIELD
27747 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27748 
27749 #undef  TARGET_OPTION_OVERRIDE
27750 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27751 
27752 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27753 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27754   aarch64_override_options_after_change
27755 
27756 #undef TARGET_OFFLOAD_OPTIONS
27757 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27758 
27759 #undef TARGET_OPTION_SAVE
27760 #define TARGET_OPTION_SAVE aarch64_option_save
27761 
27762 #undef TARGET_OPTION_RESTORE
27763 #define TARGET_OPTION_RESTORE aarch64_option_restore
27764 
27765 #undef TARGET_OPTION_PRINT
27766 #define TARGET_OPTION_PRINT aarch64_option_print
27767 
27768 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27769 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27770 
27771 #undef TARGET_SET_CURRENT_FUNCTION
27772 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27773 
27774 #undef TARGET_PASS_BY_REFERENCE
27775 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27776 
27777 #undef TARGET_PREFERRED_RELOAD_CLASS
27778 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27779 
27780 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27781 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27782 
27783 #undef TARGET_PROMOTED_TYPE
27784 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27785 
27786 #undef TARGET_SECONDARY_RELOAD
27787 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27788 
27789 #undef TARGET_SHIFT_TRUNCATION_MASK
27790 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27791 
27792 #undef TARGET_SETUP_INCOMING_VARARGS
27793 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27794 
27795 #undef TARGET_STRUCT_VALUE_RTX
27796 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
27797 
27798 #undef TARGET_REGISTER_MOVE_COST
27799 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27800 
27801 #undef TARGET_RETURN_IN_MEMORY
27802 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27803 
27804 #undef TARGET_RETURN_IN_MSB
27805 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27806 
27807 #undef TARGET_RTX_COSTS
27808 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27809 
27810 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27811 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27812 
27813 #undef TARGET_SCHED_ISSUE_RATE
27814 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27815 
27816 #undef TARGET_SCHED_VARIABLE_ISSUE
27817 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27818 
27819 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27820 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27821   aarch64_sched_first_cycle_multipass_dfa_lookahead
27822 
27823 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27824 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27825   aarch64_first_cycle_multipass_dfa_lookahead_guard
27826 
27827 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27828 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27829   aarch64_get_separate_components
27830 
27831 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27832 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27833   aarch64_components_for_bb
27834 
27835 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27836 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27837   aarch64_disqualify_components
27838 
27839 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27840 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27841   aarch64_emit_prologue_components
27842 
27843 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27844 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27845   aarch64_emit_epilogue_components
27846 
27847 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27848 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27849   aarch64_set_handled_components
27850 
27851 #undef TARGET_TRAMPOLINE_INIT
27852 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27853 
27854 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27855 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27856 
27857 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27858 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27859 
27860 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27861 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27862 
27863 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27864 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27865   aarch64_builtin_support_vector_misalignment
27866 
27867 #undef TARGET_ARRAY_MODE
27868 #define TARGET_ARRAY_MODE aarch64_array_mode
27869 
27870 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27871 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27872 
27873 #undef TARGET_VECTORIZE_CREATE_COSTS
27874 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27875 
27876 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27877 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27878   aarch64_builtin_vectorization_cost
27879 
27880 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27881 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27882 
27883 #undef TARGET_VECTORIZE_BUILTINS
27884 #define TARGET_VECTORIZE_BUILTINS
27885 
27886 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
27887 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
27888   aarch64_builtin_vectorized_function
27889 
27890 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27891 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27892   aarch64_autovectorize_vector_modes
27893 
27894 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27895 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27896   aarch64_atomic_assign_expand_fenv
27897 
27898 /* Section anchor support.  */
27899 
27900 #undef TARGET_MIN_ANCHOR_OFFSET
27901 #define TARGET_MIN_ANCHOR_OFFSET -256
27902 
27903 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27904    byte offset; we can do much more for larger data types, but have no way
27905    to determine the size of the access.  We assume accesses are aligned.  */
27906 #undef TARGET_MAX_ANCHOR_OFFSET
27907 #define TARGET_MAX_ANCHOR_OFFSET 4095
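
/* Together these give section anchors a usable range of
   [anchor - 256, anchor + 4095], matching the signed 9-bit unscaled and
   unsigned 12-bit scaled immediate offsets available for byte accesses.  */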
27908 
27909 #undef TARGET_VECTOR_ALIGNMENT
27910 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27911 
27912 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27913 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27914   aarch64_vectorize_preferred_vector_alignment
27915 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27916 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27917   aarch64_simd_vector_alignment_reachable
27918 
27919 /* vec_perm support.  */
27920 
27921 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27922 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27923   aarch64_vectorize_vec_perm_const
27924 
27925 #undef TARGET_VECTORIZE_RELATED_MODE
27926 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27927 #undef TARGET_VECTORIZE_GET_MASK_MODE
27928 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27929 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27930 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27931   aarch64_empty_mask_is_expensive
27932 #undef TARGET_PREFERRED_ELSE_VALUE
27933 #define TARGET_PREFERRED_ELSE_VALUE \
27934   aarch64_preferred_else_value
27935 
27936 #undef TARGET_INIT_LIBFUNCS
27937 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27938 
27939 #undef TARGET_FIXED_CONDITION_CODE_REGS
27940 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27941 
27942 #undef TARGET_FLAGS_REGNUM
27943 #define TARGET_FLAGS_REGNUM CC_REGNUM
27944 
27945 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27946 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27947 
27948 #undef TARGET_ASAN_SHADOW_OFFSET
27949 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27950 
27951 #undef TARGET_LEGITIMIZE_ADDRESS
27952 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27953 
27954 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27955 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27956 
27957 #undef TARGET_CAN_USE_DOLOOP_P
27958 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27959 
27960 #undef TARGET_SCHED_ADJUST_PRIORITY
27961 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27962 
27963 #undef TARGET_SCHED_MACRO_FUSION_P
27964 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27965 
27966 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27967 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27968 
27969 #undef TARGET_SCHED_FUSION_PRIORITY
27970 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27971 
27972 #undef TARGET_UNSPEC_MAY_TRAP_P
27973 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27974 
27975 #undef TARGET_USE_PSEUDO_PIC_REG
27976 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27977 
27978 #undef TARGET_PRINT_OPERAND
27979 #define TARGET_PRINT_OPERAND aarch64_print_operand
27980 
27981 #undef TARGET_PRINT_OPERAND_ADDRESS
27982 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27983 
27984 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27985 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27986 
27987 #undef TARGET_OPTAB_SUPPORTED_P
27988 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27989 
27990 #undef TARGET_OMIT_STRUCT_RETURN_REG
27991 #define TARGET_OMIT_STRUCT_RETURN_REG true
27992 
27993 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27994 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27995   aarch64_dwarf_poly_indeterminate_value
27996 
27997 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
27998 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27999 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
28000 
28001 #undef TARGET_HARD_REGNO_NREGS
28002 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
28003 #undef TARGET_HARD_REGNO_MODE_OK
28004 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
28005 
28006 #undef TARGET_MODES_TIEABLE_P
28007 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28008 
28009 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28010 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28011   aarch64_hard_regno_call_part_clobbered
28012 
28013 #undef TARGET_INSN_CALLEE_ABI
28014 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
28015 
28016 #undef TARGET_CONSTANT_ALIGNMENT
28017 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28018 
28019 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28020 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28021   aarch64_stack_clash_protection_alloca_probe_range
28022 
28023 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28024 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28025 
28026 #undef TARGET_CAN_CHANGE_MODE_CLASS
28027 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28028 
28029 #undef TARGET_SELECT_EARLY_REMAT_MODES
28030 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28031 
28032 #undef TARGET_SPECULATION_SAFE_VALUE
28033 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28034 
28035 #undef TARGET_ESTIMATED_POLY_VALUE
28036 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28037 
28038 #undef TARGET_ATTRIBUTE_TABLE
28039 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28040 
28041 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28042 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28043   aarch64_simd_clone_compute_vecsize_and_simdlen
28044 
28045 #undef TARGET_SIMD_CLONE_ADJUST
28046 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28047 
28048 #undef TARGET_SIMD_CLONE_USABLE
28049 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28050 
28051 #undef TARGET_COMP_TYPE_ATTRIBUTES
28052 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28053 
28054 #undef TARGET_GET_MULTILIB_ABI_NAME
28055 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28056 
28057 #undef TARGET_FNTYPE_ABI
28058 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28059 
28060 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28061 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28062 
28063 #if CHECKING_P
28064 #undef TARGET_RUN_TARGET_SELFTESTS
28065 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28066 #endif /* #if CHECKING_P */
28067 
28068 #undef TARGET_ASM_POST_CFI_STARTPROC
28069 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28070 
28071 #undef TARGET_STRICT_ARGUMENT_NAMING
28072 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28073 
28074 #undef TARGET_MD_ASM_ADJUST
28075 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28076 
28077 #undef TARGET_ASM_FILE_END
28078 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28079 
28080 #undef TARGET_ASM_FUNCTION_EPILOGUE
28081 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28082 
28083 #undef TARGET_HAVE_SHADOW_CALL_STACK
28084 #define TARGET_HAVE_SHADOW_CALL_STACK true
28085 
28086 struct gcc_target targetm = TARGET_INITIALIZER;
28087 
28088 #include "gt-aarch64.h"
28089