xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/config/aarch64/aarch64.c (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1 /* Machine description for AArch64 architecture.
2    Copyright (C) 2009-2019 Free Software Foundation, Inc.
3    Contributed by ARM Ltd.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3, or (at your option)
10    any later version.
11 
12    GCC is distributed in the hope that it will be useful, but
13    WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "params.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 
77 /* This file should be included last.  */
78 #include "target-def.h"
79 
80 /* Defined for convenience.  */
81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
82 
83 /* Information about a legitimate vector immediate operand.  */
84 struct simd_immediate_info
85 {
86   enum insn_type { MOV, MVN };
87   enum modifier_type { LSL, MSL };
88 
89   simd_immediate_info () {}
90   simd_immediate_info (scalar_float_mode, rtx);
91   simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
92 		       insn_type = MOV, modifier_type = LSL,
93 		       unsigned int = 0);
94   simd_immediate_info (scalar_mode, rtx, rtx);
95 
96   /* The mode of the elements.  */
97   scalar_mode elt_mode;
98 
99   /* The value of each element if all elements are the same, or the
100      first value if the constant is a series.  */
101   rtx value;
102 
103   /* The value of the step if the constant is a series, null otherwise.  */
104   rtx step;
105 
106   /* The instruction to use to move the immediate into a vector.  */
107   insn_type insn;
108 
109   /* The kind of shift modifier to use, and the number of bits to shift.
110      This is (LSL, 0) if no shift is needed.  */
111   modifier_type modifier;
112   unsigned int shift;
113 };
114 
115 /* Construct a floating-point immediate in which each element has mode
116    ELT_MODE_IN and value VALUE_IN.  */
117 inline simd_immediate_info
118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
119   : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
120     modifier (LSL), shift (0)
121 {}
122 
123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
124    and value VALUE_IN.  The other parameters are as for the structure
125    fields.  */
126 inline simd_immediate_info
127 ::simd_immediate_info (scalar_int_mode elt_mode_in,
128 		       unsigned HOST_WIDE_INT value_in,
129 		       insn_type insn_in, modifier_type modifier_in,
130 		       unsigned int shift_in)
131   : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
132     step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 {}
134 
135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
136    and where element I is equal to VALUE_IN + I * STEP_IN.  */
137 inline simd_immediate_info
138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
139   : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
140     modifier (LSL), shift (0)
141 {}
142 
143 /* The current code model.  */
144 enum aarch64_code_model aarch64_cmodel;
145 
146 /* The number of 64-bit elements in an SVE vector.  */
147 poly_uint16 aarch64_sve_vg;
148 
149 #ifdef HAVE_AS_TLS
150 #undef TARGET_HAVE_TLS
151 #define TARGET_HAVE_TLS 1
152 #endif
153 
154 static bool aarch64_composite_type_p (const_tree, machine_mode);
155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
156 						     const_tree,
157 						     machine_mode *, int *,
158 						     bool *);
159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
161 static void aarch64_override_options_after_change (void);
162 static bool aarch64_vector_mode_supported_p (machine_mode);
163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
165 							 const_tree type,
166 							 int misalignment,
167 							 bool is_packed);
168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
170 					    aarch64_addr_query_type);
171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
172 
173 /* Major revision number of the ARM Architecture implemented by the target.  */
174 unsigned aarch64_architecture_version;
175 
176 /* The processor for which instructions should be scheduled.  */
177 enum aarch64_processor aarch64_tune = cortexa53;
178 
179 /* Mask to specify which instruction scheduling options should be used.  */
180 unsigned long aarch64_tune_flags = 0;
181 
182 /* Global flag for PC relative loads.  */
183 bool aarch64_pcrelative_literal_loads;
184 
185 /* Global flag for whether frame pointer is enabled.  */
186 bool aarch64_use_frame_pointer;
187 
188 #define BRANCH_PROTECT_STR_MAX 255
189 char *accepted_branch_protection_string = NULL;
190 
191 static enum aarch64_parse_opt_result
192 aarch64_parse_branch_protection (const char*, char**);
193 
194 /* Support for command line parsing of boolean flags in the tuning
195    structures.  */
196 struct aarch64_flag_desc
197 {
198   const char* name;
199   unsigned int flag;
200 };
201 
202 #define AARCH64_FUSION_PAIR(name, internal_name) \
203   { name, AARCH64_FUSE_##internal_name },
204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
205 {
206   { "none", AARCH64_FUSE_NOTHING },
207 #include "aarch64-fusion-pairs.def"
208   { "all", AARCH64_FUSE_ALL },
209   { NULL, AARCH64_FUSE_NOTHING }
210 };
211 
212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
213   { name, AARCH64_EXTRA_TUNE_##internal_name },
214 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
215 {
216   { "none", AARCH64_EXTRA_TUNE_NONE },
217 #include "aarch64-tuning-flags.def"
218   { "all", AARCH64_EXTRA_TUNE_ALL },
219   { NULL, AARCH64_EXTRA_TUNE_NONE }
220 };
221 
222 /* Tuning parameters.  */
223 
224 static const struct cpu_addrcost_table generic_addrcost_table =
225 {
226     {
227       1, /* hi  */
228       0, /* si  */
229       0, /* di  */
230       1, /* ti  */
231     },
232   0, /* pre_modify  */
233   0, /* post_modify  */
234   0, /* register_offset  */
235   0, /* register_sextend  */
236   0, /* register_zextend  */
237   0 /* imm_offset  */
238 };
239 
240 static const struct cpu_addrcost_table exynosm1_addrcost_table =
241 {
242     {
243       0, /* hi  */
244       0, /* si  */
245       0, /* di  */
246       2, /* ti  */
247     },
248   0, /* pre_modify  */
249   0, /* post_modify  */
250   1, /* register_offset  */
251   1, /* register_sextend  */
252   2, /* register_zextend  */
253   0, /* imm_offset  */
254 };
255 
256 static const struct cpu_addrcost_table xgene1_addrcost_table =
257 {
258     {
259       1, /* hi  */
260       0, /* si  */
261       0, /* di  */
262       1, /* ti  */
263     },
264   1, /* pre_modify  */
265   1, /* post_modify  */
266   0, /* register_offset  */
267   1, /* register_sextend  */
268   1, /* register_zextend  */
269   0, /* imm_offset  */
270 };
271 
272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
273 {
274     {
275       1, /* hi  */
276       1, /* si  */
277       1, /* di  */
278       2, /* ti  */
279     },
280   0, /* pre_modify  */
281   0, /* post_modify  */
282   2, /* register_offset  */
283   3, /* register_sextend  */
284   3, /* register_zextend  */
285   0, /* imm_offset  */
286 };
287 
288 static const struct cpu_addrcost_table tsv110_addrcost_table =
289 {
290     {
291       1, /* hi  */
292       0, /* si  */
293       0, /* di  */
294       1, /* ti  */
295     },
296   0, /* pre_modify  */
297   0, /* post_modify  */
298   0, /* register_offset  */
299   1, /* register_sextend  */
300   1, /* register_zextend  */
301   0, /* imm_offset  */
302 };
303 
304 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
305 {
306     {
307       1, /* hi  */
308       1, /* si  */
309       1, /* di  */
310       2, /* ti  */
311     },
312   1, /* pre_modify  */
313   1, /* post_modify  */
314   3, /* register_offset  */
315   3, /* register_sextend  */
316   3, /* register_zextend  */
317   2, /* imm_offset  */
318 };
319 
320 static const struct cpu_regmove_cost generic_regmove_cost =
321 {
322   1, /* GP2GP  */
323   /* Avoid the use of slow int<->fp moves for spilling by setting
324      their cost higher than memmov_cost.  */
325   5, /* GP2FP  */
326   5, /* FP2GP  */
327   2 /* FP2FP  */
328 };
329 
330 static const struct cpu_regmove_cost cortexa57_regmove_cost =
331 {
332   1, /* GP2GP  */
333   /* Avoid the use of slow int<->fp moves for spilling by setting
334      their cost higher than memmov_cost.  */
335   5, /* GP2FP  */
336   5, /* FP2GP  */
337   2 /* FP2FP  */
338 };
339 
340 static const struct cpu_regmove_cost cortexa53_regmove_cost =
341 {
342   1, /* GP2GP  */
343   /* Avoid the use of slow int<->fp moves for spilling by setting
344      their cost higher than memmov_cost.  */
345   5, /* GP2FP  */
346   5, /* FP2GP  */
347   2 /* FP2FP  */
348 };
349 
350 static const struct cpu_regmove_cost exynosm1_regmove_cost =
351 {
352   1, /* GP2GP  */
353   /* Avoid the use of slow int<->fp moves for spilling by setting
354      their cost higher than memmov_cost (actual, 4 and 9).  */
355   9, /* GP2FP  */
356   9, /* FP2GP  */
357   1 /* FP2FP  */
358 };
359 
360 static const struct cpu_regmove_cost thunderx_regmove_cost =
361 {
362   2, /* GP2GP  */
363   2, /* GP2FP  */
364   6, /* FP2GP  */
365   4 /* FP2FP  */
366 };
367 
368 static const struct cpu_regmove_cost xgene1_regmove_cost =
369 {
370   1, /* GP2GP  */
371   /* Avoid the use of slow int<->fp moves for spilling by setting
372      their cost higher than memmov_cost.  */
373   8, /* GP2FP  */
374   8, /* FP2GP  */
375   2 /* FP2FP  */
376 };
377 
378 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
379 {
380   2, /* GP2GP  */
381   /* Avoid the use of int<->fp moves for spilling.  */
382   6, /* GP2FP  */
383   6, /* FP2GP  */
384   4 /* FP2FP  */
385 };
386 
387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
388 {
389   1, /* GP2GP  */
390   /* Avoid the use of int<->fp moves for spilling.  */
391   8, /* GP2FP  */
392   8, /* FP2GP  */
393   4  /* FP2FP  */
394 };
395 
396 static const struct cpu_regmove_cost tsv110_regmove_cost =
397 {
398   1, /* GP2GP  */
399   /* Avoid the use of slow int<->fp moves for spilling by setting
400      their cost higher than memmov_cost.  */
401   2, /* GP2FP  */
402   3, /* FP2GP  */
403   2  /* FP2FP  */
404 };
405 
406 /* Generic costs for vector insn classes.  */
407 static const struct cpu_vector_cost generic_vector_cost =
408 {
409   1, /* scalar_int_stmt_cost  */
410   1, /* scalar_fp_stmt_cost  */
411   1, /* scalar_load_cost  */
412   1, /* scalar_store_cost  */
413   1, /* vec_int_stmt_cost  */
414   1, /* vec_fp_stmt_cost  */
415   2, /* vec_permute_cost  */
416   1, /* vec_to_scalar_cost  */
417   1, /* scalar_to_vec_cost  */
418   1, /* vec_align_load_cost  */
419   1, /* vec_unalign_load_cost  */
420   1, /* vec_unalign_store_cost  */
421   1, /* vec_store_cost  */
422   3, /* cond_taken_branch_cost  */
423   1 /* cond_not_taken_branch_cost  */
424 };
425 
426 /* QDF24XX costs for vector insn classes.  */
427 static const struct cpu_vector_cost qdf24xx_vector_cost =
428 {
429   1, /* scalar_int_stmt_cost  */
430   1, /* scalar_fp_stmt_cost  */
431   1, /* scalar_load_cost  */
432   1, /* scalar_store_cost  */
433   1, /* vec_int_stmt_cost  */
434   3, /* vec_fp_stmt_cost  */
435   2, /* vec_permute_cost  */
436   1, /* vec_to_scalar_cost  */
437   1, /* scalar_to_vec_cost  */
438   1, /* vec_align_load_cost  */
439   1, /* vec_unalign_load_cost  */
440   1, /* vec_unalign_store_cost  */
441   1, /* vec_store_cost  */
442   3, /* cond_taken_branch_cost  */
443   1 /* cond_not_taken_branch_cost  */
444 };
445 
446 /* ThunderX costs for vector insn classes.  */
447 static const struct cpu_vector_cost thunderx_vector_cost =
448 {
449   1, /* scalar_int_stmt_cost  */
450   1, /* scalar_fp_stmt_cost  */
451   3, /* scalar_load_cost  */
452   1, /* scalar_store_cost  */
453   4, /* vec_int_stmt_cost  */
454   1, /* vec_fp_stmt_cost  */
455   4, /* vec_permute_cost  */
456   2, /* vec_to_scalar_cost  */
457   2, /* scalar_to_vec_cost  */
458   3, /* vec_align_load_cost  */
459   5, /* vec_unalign_load_cost  */
460   5, /* vec_unalign_store_cost  */
461   1, /* vec_store_cost  */
462   3, /* cond_taken_branch_cost  */
463   3 /* cond_not_taken_branch_cost  */
464 };
465 
466 static const struct cpu_vector_cost tsv110_vector_cost =
467 {
468   1, /* scalar_int_stmt_cost  */
469   1, /* scalar_fp_stmt_cost  */
470   5, /* scalar_load_cost  */
471   1, /* scalar_store_cost  */
472   2, /* vec_int_stmt_cost  */
473   2, /* vec_fp_stmt_cost  */
474   2, /* vec_permute_cost  */
475   3, /* vec_to_scalar_cost  */
476   2, /* scalar_to_vec_cost  */
477   5, /* vec_align_load_cost  */
478   5, /* vec_unalign_load_cost  */
479   1, /* vec_unalign_store_cost  */
480   1, /* vec_store_cost  */
481   1, /* cond_taken_branch_cost  */
482   1 /* cond_not_taken_branch_cost  */
483 };
484 
485 /* Generic costs for vector insn classes.  */
486 static const struct cpu_vector_cost cortexa57_vector_cost =
487 {
488   1, /* scalar_int_stmt_cost  */
489   1, /* scalar_fp_stmt_cost  */
490   4, /* scalar_load_cost  */
491   1, /* scalar_store_cost  */
492   2, /* vec_int_stmt_cost  */
493   2, /* vec_fp_stmt_cost  */
494   3, /* vec_permute_cost  */
495   8, /* vec_to_scalar_cost  */
496   8, /* scalar_to_vec_cost  */
497   4, /* vec_align_load_cost  */
498   4, /* vec_unalign_load_cost  */
499   1, /* vec_unalign_store_cost  */
500   1, /* vec_store_cost  */
501   1, /* cond_taken_branch_cost  */
502   1 /* cond_not_taken_branch_cost  */
503 };
504 
505 static const struct cpu_vector_cost exynosm1_vector_cost =
506 {
507   1, /* scalar_int_stmt_cost  */
508   1, /* scalar_fp_stmt_cost  */
509   5, /* scalar_load_cost  */
510   1, /* scalar_store_cost  */
511   3, /* vec_int_stmt_cost  */
512   3, /* vec_fp_stmt_cost  */
513   3, /* vec_permute_cost  */
514   3, /* vec_to_scalar_cost  */
515   3, /* scalar_to_vec_cost  */
516   5, /* vec_align_load_cost  */
517   5, /* vec_unalign_load_cost  */
518   1, /* vec_unalign_store_cost  */
519   1, /* vec_store_cost  */
520   1, /* cond_taken_branch_cost  */
521   1 /* cond_not_taken_branch_cost  */
522 };
523 
524 /* Generic costs for vector insn classes.  */
525 static const struct cpu_vector_cost xgene1_vector_cost =
526 {
527   1, /* scalar_int_stmt_cost  */
528   1, /* scalar_fp_stmt_cost  */
529   5, /* scalar_load_cost  */
530   1, /* scalar_store_cost  */
531   2, /* vec_int_stmt_cost  */
532   2, /* vec_fp_stmt_cost  */
533   2, /* vec_permute_cost  */
534   4, /* vec_to_scalar_cost  */
535   4, /* scalar_to_vec_cost  */
536   10, /* vec_align_load_cost  */
537   10, /* vec_unalign_load_cost  */
538   2, /* vec_unalign_store_cost  */
539   2, /* vec_store_cost  */
540   2, /* cond_taken_branch_cost  */
541   1 /* cond_not_taken_branch_cost  */
542 };
543 
544 /* Costs for vector insn classes for Vulcan.  */
545 static const struct cpu_vector_cost thunderx2t99_vector_cost =
546 {
547   1, /* scalar_int_stmt_cost  */
548   6, /* scalar_fp_stmt_cost  */
549   4, /* scalar_load_cost  */
550   1, /* scalar_store_cost  */
551   5, /* vec_int_stmt_cost  */
552   6, /* vec_fp_stmt_cost  */
553   3, /* vec_permute_cost  */
554   6, /* vec_to_scalar_cost  */
555   5, /* scalar_to_vec_cost  */
556   8, /* vec_align_load_cost  */
557   8, /* vec_unalign_load_cost  */
558   4, /* vec_unalign_store_cost  */
559   4, /* vec_store_cost  */
560   2, /* cond_taken_branch_cost  */
561   1  /* cond_not_taken_branch_cost  */
562 };
563 
564 /* Generic costs for branch instructions.  */
565 static const struct cpu_branch_cost generic_branch_cost =
566 {
567   1,  /* Predictable.  */
568   3   /* Unpredictable.  */
569 };
570 
571 /* Generic approximation modes.  */
572 static const cpu_approx_modes generic_approx_modes =
573 {
574   AARCH64_APPROX_NONE,	/* division  */
575   AARCH64_APPROX_NONE,	/* sqrt  */
576   AARCH64_APPROX_NONE	/* recip_sqrt  */
577 };
578 
579 /* Approximation modes for Exynos M1.  */
580 static const cpu_approx_modes exynosm1_approx_modes =
581 {
582   AARCH64_APPROX_NONE,	/* division  */
583   AARCH64_APPROX_ALL,	/* sqrt  */
584   AARCH64_APPROX_ALL	/* recip_sqrt  */
585 };
586 
587 /* Approximation modes for X-Gene 1.  */
588 static const cpu_approx_modes xgene1_approx_modes =
589 {
590   AARCH64_APPROX_NONE,	/* division  */
591   AARCH64_APPROX_NONE,	/* sqrt  */
592   AARCH64_APPROX_ALL	/* recip_sqrt  */
593 };
594 
595 /* Generic prefetch settings (which disable prefetch).  */
596 static const cpu_prefetch_tune generic_prefetch_tune =
597 {
598   0,			/* num_slots  */
599   -1,			/* l1_cache_size  */
600   -1,			/* l1_cache_line_size  */
601   -1,			/* l2_cache_size  */
602   true,			/* prefetch_dynamic_strides */
603   -1,			/* minimum_stride */
604   -1			/* default_opt_level  */
605 };
606 
607 static const cpu_prefetch_tune exynosm1_prefetch_tune =
608 {
609   0,			/* num_slots  */
610   -1,			/* l1_cache_size  */
611   64,			/* l1_cache_line_size  */
612   -1,			/* l2_cache_size  */
613   true,			/* prefetch_dynamic_strides */
614   -1,			/* minimum_stride */
615   -1			/* default_opt_level  */
616 };
617 
618 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
619 {
620   4,			/* num_slots  */
621   32,			/* l1_cache_size  */
622   64,			/* l1_cache_line_size  */
623   512,			/* l2_cache_size  */
624   false,		/* prefetch_dynamic_strides */
625   2048,			/* minimum_stride */
626   3			/* default_opt_level  */
627 };
628 
629 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
630 {
631   8,			/* num_slots  */
632   32,			/* l1_cache_size  */
633   128,			/* l1_cache_line_size  */
634   16*1024,		/* l2_cache_size  */
635   true,			/* prefetch_dynamic_strides */
636   -1,			/* minimum_stride */
637   3			/* default_opt_level  */
638 };
639 
640 static const cpu_prefetch_tune thunderx_prefetch_tune =
641 {
642   8,			/* num_slots  */
643   32,			/* l1_cache_size  */
644   128,			/* l1_cache_line_size  */
645   -1,			/* l2_cache_size  */
646   true,			/* prefetch_dynamic_strides */
647   -1,			/* minimum_stride */
648   -1			/* default_opt_level  */
649 };
650 
651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
652 {
653   8,			/* num_slots  */
654   32,			/* l1_cache_size  */
655   64,			/* l1_cache_line_size  */
656   256,			/* l2_cache_size  */
657   true,			/* prefetch_dynamic_strides */
658   -1,			/* minimum_stride */
659   -1			/* default_opt_level  */
660 };
661 
662 static const cpu_prefetch_tune tsv110_prefetch_tune =
663 {
664   0,                    /* num_slots  */
665   64,                   /* l1_cache_size  */
666   64,                   /* l1_cache_line_size  */
667   512,                  /* l2_cache_size  */
668   true,                 /* prefetch_dynamic_strides */
669   -1,                   /* minimum_stride */
670   -1                    /* default_opt_level  */
671 };
672 
673 static const cpu_prefetch_tune xgene1_prefetch_tune =
674 {
675   8,			/* num_slots  */
676   32,			/* l1_cache_size  */
677   64,			/* l1_cache_line_size  */
678   256,			/* l2_cache_size  */
679   true,                 /* prefetch_dynamic_strides */
680   -1,                   /* minimum_stride */
681   -1			/* default_opt_level  */
682 };
683 
684 static const struct tune_params generic_tunings =
685 {
686   &cortexa57_extra_costs,
687   &generic_addrcost_table,
688   &generic_regmove_cost,
689   &generic_vector_cost,
690   &generic_branch_cost,
691   &generic_approx_modes,
692   SVE_NOT_IMPLEMENTED, /* sve_width  */
693   4, /* memmov_cost  */
694   2, /* issue_rate  */
695   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
696   "8",	/* function_align.  */
697   "4",	/* jump_align.  */
698   "8",	/* loop_align.  */
699   2,	/* int_reassoc_width.  */
700   4,	/* fp_reassoc_width.  */
701   1,	/* vec_reassoc_width.  */
702   2,	/* min_div_recip_mul_sf.  */
703   2,	/* min_div_recip_mul_df.  */
704   0,	/* max_case_values.  */
705   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
706   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
707   &generic_prefetch_tune
708 };
709 
710 static const struct tune_params cortexa35_tunings =
711 {
712   &cortexa53_extra_costs,
713   &generic_addrcost_table,
714   &cortexa53_regmove_cost,
715   &generic_vector_cost,
716   &generic_branch_cost,
717   &generic_approx_modes,
718   SVE_NOT_IMPLEMENTED, /* sve_width  */
719   4, /* memmov_cost  */
720   1, /* issue_rate  */
721   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
722    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
723   "16",	/* function_align.  */
724   "4",	/* jump_align.  */
725   "8",	/* loop_align.  */
726   2,	/* int_reassoc_width.  */
727   4,	/* fp_reassoc_width.  */
728   1,	/* vec_reassoc_width.  */
729   2,	/* min_div_recip_mul_sf.  */
730   2,	/* min_div_recip_mul_df.  */
731   0,	/* max_case_values.  */
732   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
733   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
734   &generic_prefetch_tune
735 };
736 
737 static const struct tune_params cortexa53_tunings =
738 {
739   &cortexa53_extra_costs,
740   &generic_addrcost_table,
741   &cortexa53_regmove_cost,
742   &generic_vector_cost,
743   &generic_branch_cost,
744   &generic_approx_modes,
745   SVE_NOT_IMPLEMENTED, /* sve_width  */
746   4, /* memmov_cost  */
747   2, /* issue_rate  */
748   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
749    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
750   "16",	/* function_align.  */
751   "4",	/* jump_align.  */
752   "8",	/* loop_align.  */
753   2,	/* int_reassoc_width.  */
754   4,	/* fp_reassoc_width.  */
755   1,	/* vec_reassoc_width.  */
756   2,	/* min_div_recip_mul_sf.  */
757   2,	/* min_div_recip_mul_df.  */
758   0,	/* max_case_values.  */
759   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
760   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
761   &generic_prefetch_tune
762 };
763 
764 static const struct tune_params cortexa57_tunings =
765 {
766   &cortexa57_extra_costs,
767   &generic_addrcost_table,
768   &cortexa57_regmove_cost,
769   &cortexa57_vector_cost,
770   &generic_branch_cost,
771   &generic_approx_modes,
772   SVE_NOT_IMPLEMENTED, /* sve_width  */
773   4, /* memmov_cost  */
774   3, /* issue_rate  */
775   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
776    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
777   "16",	/* function_align.  */
778   "4",	/* jump_align.  */
779   "8",	/* loop_align.  */
780   2,	/* int_reassoc_width.  */
781   4,	/* fp_reassoc_width.  */
782   1,	/* vec_reassoc_width.  */
783   2,	/* min_div_recip_mul_sf.  */
784   2,	/* min_div_recip_mul_df.  */
785   0,	/* max_case_values.  */
786   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
787   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS),	/* tune_flags.  */
788   &generic_prefetch_tune
789 };
790 
791 static const struct tune_params cortexa72_tunings =
792 {
793   &cortexa57_extra_costs,
794   &generic_addrcost_table,
795   &cortexa57_regmove_cost,
796   &cortexa57_vector_cost,
797   &generic_branch_cost,
798   &generic_approx_modes,
799   SVE_NOT_IMPLEMENTED, /* sve_width  */
800   4, /* memmov_cost  */
801   3, /* issue_rate  */
802   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
803    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
804   "16",	/* function_align.  */
805   "4",	/* jump_align.  */
806   "8",	/* loop_align.  */
807   2,	/* int_reassoc_width.  */
808   4,	/* fp_reassoc_width.  */
809   1,	/* vec_reassoc_width.  */
810   2,	/* min_div_recip_mul_sf.  */
811   2,	/* min_div_recip_mul_df.  */
812   0,	/* max_case_values.  */
813   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
814   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
815   &generic_prefetch_tune
816 };
817 
818 static const struct tune_params cortexa73_tunings =
819 {
820   &cortexa57_extra_costs,
821   &generic_addrcost_table,
822   &cortexa57_regmove_cost,
823   &cortexa57_vector_cost,
824   &generic_branch_cost,
825   &generic_approx_modes,
826   SVE_NOT_IMPLEMENTED, /* sve_width  */
827   4, /* memmov_cost.  */
828   2, /* issue_rate.  */
829   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
830    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
831   "16",	/* function_align.  */
832   "4",	/* jump_align.  */
833   "8",	/* loop_align.  */
834   2,	/* int_reassoc_width.  */
835   4,	/* fp_reassoc_width.  */
836   1,	/* vec_reassoc_width.  */
837   2,	/* min_div_recip_mul_sf.  */
838   2,	/* min_div_recip_mul_df.  */
839   0,	/* max_case_values.  */
840   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
841   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
842   &generic_prefetch_tune
843 };
844 
845 
846 
847 static const struct tune_params exynosm1_tunings =
848 {
849   &exynosm1_extra_costs,
850   &exynosm1_addrcost_table,
851   &exynosm1_regmove_cost,
852   &exynosm1_vector_cost,
853   &generic_branch_cost,
854   &exynosm1_approx_modes,
855   SVE_NOT_IMPLEMENTED, /* sve_width  */
856   4,	/* memmov_cost  */
857   3,	/* issue_rate  */
858   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
859   "4",	/* function_align.  */
860   "4",	/* jump_align.  */
861   "4",	/* loop_align.  */
862   2,	/* int_reassoc_width.  */
863   4,	/* fp_reassoc_width.  */
864   1,	/* vec_reassoc_width.  */
865   2,	/* min_div_recip_mul_sf.  */
866   2,	/* min_div_recip_mul_df.  */
867   48,	/* max_case_values.  */
868   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
869   (AARCH64_EXTRA_TUNE_NONE), /* tune_flags.  */
870   &exynosm1_prefetch_tune
871 };
872 
873 static const struct tune_params thunderxt88_tunings =
874 {
875   &thunderx_extra_costs,
876   &generic_addrcost_table,
877   &thunderx_regmove_cost,
878   &thunderx_vector_cost,
879   &generic_branch_cost,
880   &generic_approx_modes,
881   SVE_NOT_IMPLEMENTED, /* sve_width  */
882   6, /* memmov_cost  */
883   2, /* issue_rate  */
884   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
885   "8",	/* function_align.  */
886   "8",	/* jump_align.  */
887   "8",	/* loop_align.  */
888   2,	/* int_reassoc_width.  */
889   4,	/* fp_reassoc_width.  */
890   1,	/* vec_reassoc_width.  */
891   2,	/* min_div_recip_mul_sf.  */
892   2,	/* min_div_recip_mul_df.  */
893   0,	/* max_case_values.  */
894   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
895   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW),	/* tune_flags.  */
896   &thunderxt88_prefetch_tune
897 };
898 
899 static const struct tune_params thunderx_tunings =
900 {
901   &thunderx_extra_costs,
902   &generic_addrcost_table,
903   &thunderx_regmove_cost,
904   &thunderx_vector_cost,
905   &generic_branch_cost,
906   &generic_approx_modes,
907   SVE_NOT_IMPLEMENTED, /* sve_width  */
908   6, /* memmov_cost  */
909   2, /* issue_rate  */
910   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
911   "8",	/* function_align.  */
912   "8",	/* jump_align.  */
913   "8",	/* loop_align.  */
914   2,	/* int_reassoc_width.  */
915   4,	/* fp_reassoc_width.  */
916   1,	/* vec_reassoc_width.  */
917   2,	/* min_div_recip_mul_sf.  */
918   2,	/* min_div_recip_mul_df.  */
919   0,	/* max_case_values.  */
920   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
921   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
922    | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND),	/* tune_flags.  */
923   &thunderx_prefetch_tune
924 };
925 
926 static const struct tune_params tsv110_tunings =
927 {
928   &tsv110_extra_costs,
929   &tsv110_addrcost_table,
930   &tsv110_regmove_cost,
931   &tsv110_vector_cost,
932   &generic_branch_cost,
933   &generic_approx_modes,
934   SVE_NOT_IMPLEMENTED, /* sve_width  */
935   4,    /* memmov_cost  */
936   4,    /* issue_rate  */
937   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
938    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
939   "16", /* function_align.  */
940   "4",  /* jump_align.  */
941   "8",  /* loop_align.  */
942   2,    /* int_reassoc_width.  */
943   4,    /* fp_reassoc_width.  */
944   1,    /* vec_reassoc_width.  */
945   2,    /* min_div_recip_mul_sf.  */
946   2,    /* min_div_recip_mul_df.  */
947   0,    /* max_case_values.  */
948   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
949   (AARCH64_EXTRA_TUNE_NONE),     /* tune_flags.  */
950   &tsv110_prefetch_tune
951 };
952 
953 static const struct tune_params xgene1_tunings =
954 {
955   &xgene1_extra_costs,
956   &xgene1_addrcost_table,
957   &xgene1_regmove_cost,
958   &xgene1_vector_cost,
959   &generic_branch_cost,
960   &xgene1_approx_modes,
961   SVE_NOT_IMPLEMENTED, /* sve_width  */
962   6, /* memmov_cost  */
963   4, /* issue_rate  */
964   AARCH64_FUSE_NOTHING, /* fusible_ops  */
965   "16",	/* function_align.  */
966   "16",	/* jump_align.  */
967   "16",	/* loop_align.  */
968   2,	/* int_reassoc_width.  */
969   4,	/* fp_reassoc_width.  */
970   1,	/* vec_reassoc_width.  */
971   2,	/* min_div_recip_mul_sf.  */
972   2,	/* min_div_recip_mul_df.  */
973   17,	/* max_case_values.  */
974   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
975   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
976   &xgene1_prefetch_tune
977 };
978 
979 static const struct tune_params emag_tunings =
980 {
981   &xgene1_extra_costs,
982   &xgene1_addrcost_table,
983   &xgene1_regmove_cost,
984   &xgene1_vector_cost,
985   &generic_branch_cost,
986   &xgene1_approx_modes,
987   SVE_NOT_IMPLEMENTED,
988   6, /* memmov_cost  */
989   4, /* issue_rate  */
990   AARCH64_FUSE_NOTHING, /* fusible_ops  */
991   "16",	/* function_align.  */
992   "16",	/* jump_align.  */
993   "16",	/* loop_align.  */
994   2,	/* int_reassoc_width.  */
995   4,	/* fp_reassoc_width.  */
996   1,	/* vec_reassoc_width.  */
997   2,	/* min_div_recip_mul_sf.  */
998   2,	/* min_div_recip_mul_df.  */
999   17,	/* max_case_values.  */
1000   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
1001   (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS),	/* tune_flags.  */
1002   &xgene1_prefetch_tune
1003 };
1004 
1005 static const struct tune_params qdf24xx_tunings =
1006 {
1007   &qdf24xx_extra_costs,
1008   &qdf24xx_addrcost_table,
1009   &qdf24xx_regmove_cost,
1010   &qdf24xx_vector_cost,
1011   &generic_branch_cost,
1012   &generic_approx_modes,
1013   SVE_NOT_IMPLEMENTED, /* sve_width  */
1014   4, /* memmov_cost  */
1015   4, /* issue_rate  */
1016   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1018   "16",	/* function_align.  */
1019   "8",	/* jump_align.  */
1020   "16",	/* loop_align.  */
1021   2,	/* int_reassoc_width.  */
1022   4,	/* fp_reassoc_width.  */
1023   1,	/* vec_reassoc_width.  */
1024   2,	/* min_div_recip_mul_sf.  */
1025   2,	/* min_div_recip_mul_df.  */
1026   0,	/* max_case_values.  */
1027   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1028   AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags.  */
1029   &qdf24xx_prefetch_tune
1030 };
1031 
1032 /* Tuning structure for the Qualcomm Saphira core.  Default to falkor values
1033    for now.  */
1034 static const struct tune_params saphira_tunings =
1035 {
1036   &generic_extra_costs,
1037   &generic_addrcost_table,
1038   &generic_regmove_cost,
1039   &generic_vector_cost,
1040   &generic_branch_cost,
1041   &generic_approx_modes,
1042   SVE_NOT_IMPLEMENTED, /* sve_width  */
1043   4, /* memmov_cost  */
1044   4, /* issue_rate  */
1045   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1046    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
1047   "16",	/* function_align.  */
1048   "8",	/* jump_align.  */
1049   "16",	/* loop_align.  */
1050   2,	/* int_reassoc_width.  */
1051   4,	/* fp_reassoc_width.  */
1052   1,	/* vec_reassoc_width.  */
1053   2,	/* min_div_recip_mul_sf.  */
1054   2,	/* min_div_recip_mul_df.  */
1055   0,	/* max_case_values.  */
1056   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1057   (AARCH64_EXTRA_TUNE_NONE),		/* tune_flags.  */
1058   &generic_prefetch_tune
1059 };
1060 
1061 static const struct tune_params thunderx2t99_tunings =
1062 {
1063   &thunderx2t99_extra_costs,
1064   &thunderx2t99_addrcost_table,
1065   &thunderx2t99_regmove_cost,
1066   &thunderx2t99_vector_cost,
1067   &generic_branch_cost,
1068   &generic_approx_modes,
1069   SVE_NOT_IMPLEMENTED, /* sve_width  */
1070   4, /* memmov_cost.  */
1071   4, /* issue_rate.  */
1072   (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1073    | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops  */
1074   "16",	/* function_align.  */
1075   "8",	/* jump_align.  */
1076   "16",	/* loop_align.  */
1077   3,	/* int_reassoc_width.  */
1078   2,	/* fp_reassoc_width.  */
1079   2,	/* vec_reassoc_width.  */
1080   2,	/* min_div_recip_mul_sf.  */
1081   2,	/* min_div_recip_mul_df.  */
1082   0,	/* max_case_values.  */
1083   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1084   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1085   &thunderx2t99_prefetch_tune
1086 };
1087 
1088 static const struct tune_params neoversen1_tunings =
1089 {
1090   &cortexa57_extra_costs,
1091   &generic_addrcost_table,
1092   &generic_regmove_cost,
1093   &cortexa57_vector_cost,
1094   &generic_branch_cost,
1095   &generic_approx_modes,
1096   SVE_NOT_IMPLEMENTED, /* sve_width  */
1097   4, /* memmov_cost  */
1098   3, /* issue_rate  */
1099   AARCH64_FUSE_AES_AESMC, /* fusible_ops  */
1100   "32:16",	/* function_align.  */
1101   "32:16",	/* jump_align.  */
1102   "32:16",	/* loop_align.  */
1103   2,	/* int_reassoc_width.  */
1104   4,	/* fp_reassoc_width.  */
1105   2,	/* vec_reassoc_width.  */
1106   2,	/* min_div_recip_mul_sf.  */
1107   2,	/* min_div_recip_mul_df.  */
1108   0,	/* max_case_values.  */
1109   tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
1110   (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
1111   &generic_prefetch_tune
1112 };
1113 
1114 /* Support for fine-grained override of the tuning structures.  */
1115 struct aarch64_tuning_override_function
1116 {
1117   const char* name;
1118   void (*parse_override)(const char*, struct tune_params*);
1119 };
1120 
1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1122 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1124 
1125 static const struct aarch64_tuning_override_function
1126 aarch64_tuning_override_functions[] =
1127 {
1128   { "fuse", aarch64_parse_fuse_string },
1129   { "tune", aarch64_parse_tune_string },
1130   { "sve_width", aarch64_parse_sve_width_string },
1131   { NULL, NULL }
1132 };
1133 
1134 /* A processor implementing AArch64.  */
1135 struct processor
1136 {
1137   const char *const name;
1138   enum aarch64_processor ident;
1139   enum aarch64_processor sched_core;
1140   enum aarch64_arch arch;
1141   unsigned architecture_version;
1142   const unsigned long flags;
1143   const struct tune_params *const tune;
1144 };
1145 
1146 /* Architectures implementing AArch64.  */
1147 static const struct processor all_architectures[] =
1148 {
1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1150   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1151 #include "aarch64-arches.def"
1152   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1153 };
1154 
1155 /* Processor cores implementing AArch64.  */
1156 static const struct processor all_cores[] =
1157 {
1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1159   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,				\
1160   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,	\
1161   FLAGS, &COSTS##_tunings},
1162 #include "aarch64-cores.def"
1163   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1164     AARCH64_FL_FOR_ARCH8, &generic_tunings},
1165   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1166 };
1167 
1168 
1169 /* Target specification.  These are populated by the -march, -mtune, -mcpu
1170    handling code or by target attributes.  */
1171 static const struct processor *selected_arch;
1172 static const struct processor *selected_cpu;
1173 static const struct processor *selected_tune;
1174 
1175 /* The current tuning set.  */
1176 struct tune_params aarch64_tune_params = generic_tunings;
1177 
1178 /* Table of machine attributes.  */
1179 static const struct attribute_spec aarch64_attribute_table[] =
1180 {
1181   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1182        affects_type_identity, handler, exclude } */
1183   { "aarch64_vector_pcs", 0, 0, false, true,  true,  true,  NULL, NULL },
1184   { NULL,                 0, 0, false, false, false, false, NULL, NULL }
1185 };
1186 
1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1188 
1189 /* An ISA extension in the co-processor and main instruction set space.  */
1190 struct aarch64_option_extension
1191 {
1192   const char *const name;
1193   const unsigned long flags_on;
1194   const unsigned long flags_off;
1195 };
1196 
1197 typedef enum aarch64_cond_code
1198 {
1199   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1200   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1201   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1202 }
1203 aarch64_cc;
1204 
1205 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1206 
1207 struct aarch64_branch_protect_type
1208 {
1209   /* The type's name that the user passes to the branch-protection option
1210     string.  */
1211   const char* name;
1212   /* Function to handle the protection type and set global variables.
1213     First argument is the string token corresponding with this type and the
1214     second argument is the next token in the option string.
1215     Return values:
1216     * AARCH64_PARSE_OK: Handling was sucessful.
1217     * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
1218       should print an error.
1219     * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
1220       own error.  */
1221   enum aarch64_parse_opt_result (*handler)(char*, char*);
1222   /* A list of types that can follow this type in the option string.  */
1223   const aarch64_branch_protect_type* subtypes;
1224   unsigned int num_subtypes;
1225 };
1226 
1227 static enum aarch64_parse_opt_result
1228 aarch64_handle_no_branch_protection (char* str, char* rest)
1229 {
1230   aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1231   aarch64_enable_bti = 0;
1232   if (rest)
1233     {
1234       error ("unexpected %<%s%> after %<%s%>", rest, str);
1235       return AARCH64_PARSE_INVALID_FEATURE;
1236     }
1237   return AARCH64_PARSE_OK;
1238 }
1239 
1240 static enum aarch64_parse_opt_result
1241 aarch64_handle_standard_branch_protection (char* str, char* rest)
1242 {
1243   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1244   aarch64_enable_bti = 1;
1245   if (rest)
1246     {
1247       error ("unexpected %<%s%> after %<%s%>", rest, str);
1248       return AARCH64_PARSE_INVALID_FEATURE;
1249     }
1250   return AARCH64_PARSE_OK;
1251 }
1252 
1253 static enum aarch64_parse_opt_result
1254 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1255 				    char* rest ATTRIBUTE_UNUSED)
1256 {
1257   aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1258   return AARCH64_PARSE_OK;
1259 }
1260 
1261 static enum aarch64_parse_opt_result
1262 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1263 			      char* rest ATTRIBUTE_UNUSED)
1264 {
1265   aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1266   return AARCH64_PARSE_OK;
1267 }
1268 
1269 static enum aarch64_parse_opt_result
1270 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1271 				    char* rest ATTRIBUTE_UNUSED)
1272 {
1273   aarch64_enable_bti = 1;
1274   return AARCH64_PARSE_OK;
1275 }
1276 
1277 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1278   { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1279   { NULL, NULL, NULL, 0 }
1280 };
1281 
1282 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1283   { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1284   { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1285   { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1286     ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1287   { "bti", aarch64_handle_bti_protection, NULL, 0 },
1288   { NULL, NULL, NULL, 0 }
1289 };
1290 
1291 /* The condition codes of the processor, and the inverse function.  */
1292 static const char * const aarch64_condition_codes[] =
1293 {
1294   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1295   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1296 };
1297 
1298 /* Generate code to enable conditional branches in functions over 1 MiB.  */
1299 const char *
1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1301 			const char * branch_format)
1302 {
1303     rtx_code_label * tmp_label = gen_label_rtx ();
1304     char label_buf[256];
1305     char buffer[128];
1306     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1307 				 CODE_LABEL_NUMBER (tmp_label));
1308     const char *label_ptr = targetm.strip_name_encoding (label_buf);
1309     rtx dest_label = operands[pos_label];
1310     operands[pos_label] = tmp_label;
1311 
1312     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1313     output_asm_insn (buffer, operands);
1314 
1315     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1316     operands[pos_label] = dest_label;
1317     output_asm_insn (buffer, operands);
1318     return "";
1319 }
1320 
1321 void
1322 aarch64_err_no_fpadvsimd (machine_mode mode)
1323 {
1324   if (TARGET_GENERAL_REGS_ONLY)
1325     if (FLOAT_MODE_P (mode))
1326       error ("%qs is incompatible with the use of floating-point types",
1327 	     "-mgeneral-regs-only");
1328     else
1329       error ("%qs is incompatible with the use of vector types",
1330 	     "-mgeneral-regs-only");
1331   else
1332     if (FLOAT_MODE_P (mode))
1333       error ("%qs feature modifier is incompatible with the use of"
1334 	     " floating-point types", "+nofp");
1335     else
1336       error ("%qs feature modifier is incompatible with the use of"
1337 	     " vector types", "+nofp");
1338 }
1339 
1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1341    The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1342    GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1343    higher cost.  POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1344    and GENERAL_REGS is lower than the memory cost (in this case the best class
1345    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespectively of its
1346    cost results in bad allocations with many redundant int<->FP moves which
1347    are expensive on various cores.
1348    To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1349    force a decision between FP_REGS and GENERAL_REGS.  We use the allocno class
1350    if it isn't POINTER_AND_FP_REGS.  Similarly, use the best class if it isn't
1351    POINTER_AND_FP_REGS.  Otherwise set the allocno class depending on the mode.
1352    The result of this is that it is no longer inefficient to have a higher
1353    memory move cost than the register move cost.
1354 */
1355 
1356 static reg_class_t
1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1358 					 reg_class_t best_class)
1359 {
1360   machine_mode mode;
1361 
1362   if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1363       || !reg_class_subset_p (FP_REGS, allocno_class))
1364     return allocno_class;
1365 
1366   if (!reg_class_subset_p (GENERAL_REGS, best_class)
1367       || !reg_class_subset_p (FP_REGS, best_class))
1368     return best_class;
1369 
1370   mode = PSEUDO_REGNO_MODE (regno);
1371   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1372 }
1373 
1374 static unsigned int
1375 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1376 {
1377   if (GET_MODE_UNIT_SIZE (mode) == 4)
1378     return aarch64_tune_params.min_div_recip_mul_sf;
1379   return aarch64_tune_params.min_div_recip_mul_df;
1380 }
1381 
1382 /* Return the reassociation width of treeop OPC with mode MODE.  */
1383 static int
1384 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1385 {
1386   if (VECTOR_MODE_P (mode))
1387     return aarch64_tune_params.vec_reassoc_width;
1388   if (INTEGRAL_MODE_P (mode))
1389     return aarch64_tune_params.int_reassoc_width;
1390   /* Avoid reassociating floating point addition so we emit more FMAs.  */
1391   if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1392     return aarch64_tune_params.fp_reassoc_width;
1393   return 1;
1394 }
1395 
1396 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
1397 unsigned
1398 aarch64_dbx_register_number (unsigned regno)
1399 {
1400    if (GP_REGNUM_P (regno))
1401      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1402    else if (regno == SP_REGNUM)
1403      return AARCH64_DWARF_SP;
1404    else if (FP_REGNUM_P (regno))
1405      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1406    else if (PR_REGNUM_P (regno))
1407      return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1408    else if (regno == VG_REGNUM)
1409      return AARCH64_DWARF_VG;
1410 
1411    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1412       equivalent DWARF register.  */
1413    return DWARF_FRAME_REGISTERS;
1414 }
1415 
1416 /* Return true if MODE is any of the Advanced SIMD structure modes.  */
1417 static bool
1418 aarch64_advsimd_struct_mode_p (machine_mode mode)
1419 {
1420   return (TARGET_SIMD
1421 	  && (mode == OImode || mode == CImode || mode == XImode));
1422 }
1423 
1424 /* Return true if MODE is an SVE predicate mode.  */
1425 static bool
1426 aarch64_sve_pred_mode_p (machine_mode mode)
1427 {
1428   return (TARGET_SVE
1429 	  && (mode == VNx16BImode
1430 	      || mode == VNx8BImode
1431 	      || mode == VNx4BImode
1432 	      || mode == VNx2BImode));
1433 }
1434 
1435 /* Three mutually-exclusive flags describing a vector or predicate type.  */
1436 const unsigned int VEC_ADVSIMD  = 1;
1437 const unsigned int VEC_SVE_DATA = 2;
1438 const unsigned int VEC_SVE_PRED = 4;
1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1440    a structure of 2, 3 or 4 vectors.  */
1441 const unsigned int VEC_STRUCT   = 8;
1442 /* Useful combinations of the above.  */
1443 const unsigned int VEC_ANY_SVE  = VEC_SVE_DATA | VEC_SVE_PRED;
1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1445 
1446 /* Return a set of flags describing the vector properties of mode MODE.
1447    Ignore modes that are not supported by the current target.  */
1448 static unsigned int
1449 aarch64_classify_vector_mode (machine_mode mode)
1450 {
1451   if (aarch64_advsimd_struct_mode_p (mode))
1452     return VEC_ADVSIMD | VEC_STRUCT;
1453 
1454   if (aarch64_sve_pred_mode_p (mode))
1455     return VEC_SVE_PRED;
1456 
1457   scalar_mode inner = GET_MODE_INNER (mode);
1458   if (VECTOR_MODE_P (mode)
1459       && (inner == QImode
1460 	  || inner == HImode
1461 	  || inner == HFmode
1462 	  || inner == SImode
1463 	  || inner == SFmode
1464 	  || inner == DImode
1465 	  || inner == DFmode))
1466     {
1467       if (TARGET_SVE)
1468 	{
1469 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1470 	    return VEC_SVE_DATA;
1471 	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1472 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1473 	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1474 	    return VEC_SVE_DATA | VEC_STRUCT;
1475 	}
1476 
1477       /* This includes V1DF but not V1DI (which doesn't exist).  */
1478       if (TARGET_SIMD
1479 	  && (known_eq (GET_MODE_BITSIZE (mode), 64)
1480 	      || known_eq (GET_MODE_BITSIZE (mode), 128)))
1481 	return VEC_ADVSIMD;
1482     }
1483 
1484   return 0;
1485 }
1486 
1487 /* Return true if MODE is any of the data vector modes, including
1488    structure modes.  */
1489 static bool
1490 aarch64_vector_data_mode_p (machine_mode mode)
1491 {
1492   return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1493 }
1494 
1495 /* Return true if MODE is an SVE data vector mode; either a single vector
1496    or a structure of vectors.  */
1497 static bool
1498 aarch64_sve_data_mode_p (machine_mode mode)
1499 {
1500   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1501 }
1502 
1503 /* Implement target hook TARGET_ARRAY_MODE.  */
1504 static opt_machine_mode
1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1506 {
1507   if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1508       && IN_RANGE (nelems, 2, 4))
1509     return mode_for_vector (GET_MODE_INNER (mode),
1510 			    GET_MODE_NUNITS (mode) * nelems);
1511 
1512   return opt_machine_mode ();
1513 }
1514 
1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
1516 static bool
1517 aarch64_array_mode_supported_p (machine_mode mode,
1518 				unsigned HOST_WIDE_INT nelems)
1519 {
1520   if (TARGET_SIMD
1521       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1522 	  || AARCH64_VALID_SIMD_DREG_MODE (mode))
1523       && (nelems >= 2 && nelems <= 4))
1524     return true;
1525 
1526   return false;
1527 }
1528 
1529 /* Return the SVE predicate mode to use for elements that have
1530    ELEM_NBYTES bytes, if such a mode exists.  */
1531 
1532 opt_machine_mode
1533 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1534 {
1535   if (TARGET_SVE)
1536     {
1537       if (elem_nbytes == 1)
1538 	return VNx16BImode;
1539       if (elem_nbytes == 2)
1540 	return VNx8BImode;
1541       if (elem_nbytes == 4)
1542 	return VNx4BImode;
1543       if (elem_nbytes == 8)
1544 	return VNx2BImode;
1545     }
1546   return opt_machine_mode ();
1547 }
1548 
1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
1550 
1551 static opt_machine_mode
1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1553 {
1554   if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1555     {
1556       unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1557       machine_mode pred_mode;
1558       if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1559 	return pred_mode;
1560     }
1561 
1562   return default_get_mask_mode (nunits, nbytes);
1563 }
1564 
1565 /* Implement TARGET_PREFERRED_ELSE_VALUE.  For binary operations,
1566    prefer to use the first arithmetic operand as the else value if
1567    the else value doesn't matter, since that exactly matches the SVE
1568    destructive merging form.  For ternary operations we could either
1569    pick the first operand and use FMAD-like instructions or the last
1570    operand and use FMLA-like instructions; the latter seems more
1571    natural.  */
1572 
1573 static tree
1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1575 {
1576   return nops == 3 ? ops[2] : ops[0];
1577 }
1578 
1579 /* Implement TARGET_HARD_REGNO_NREGS.  */
1580 
1581 static unsigned int
1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1583 {
1584   /* ??? Logically we should only need to provide a value when
1585      HARD_REGNO_MODE_OK says that the combination is valid,
1586      but at the moment we need to handle all modes.  Just ignore
1587      any runtime parts for registers that can't store them.  */
1588   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1589   switch (aarch64_regno_regclass (regno))
1590     {
1591     case FP_REGS:
1592     case FP_LO_REGS:
1593       if (aarch64_sve_data_mode_p (mode))
1594 	return exact_div (GET_MODE_SIZE (mode),
1595 			  BYTES_PER_SVE_VECTOR).to_constant ();
1596       return CEIL (lowest_size, UNITS_PER_VREG);
1597     case PR_REGS:
1598     case PR_LO_REGS:
1599     case PR_HI_REGS:
1600       return 1;
1601     default:
1602       return CEIL (lowest_size, UNITS_PER_WORD);
1603     }
1604   gcc_unreachable ();
1605 }
1606 
1607 /* Implement TARGET_HARD_REGNO_MODE_OK.  */
1608 
1609 static bool
1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1611 {
1612   if (GET_MODE_CLASS (mode) == MODE_CC)
1613     return regno == CC_REGNUM;
1614 
1615   if (regno == VG_REGNUM)
1616     /* This must have the same size as _Unwind_Word.  */
1617     return mode == DImode;
1618 
1619   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1620   if (vec_flags & VEC_SVE_PRED)
1621     return PR_REGNUM_P (regno);
1622 
1623   if (PR_REGNUM_P (regno))
1624     return 0;
1625 
1626   if (regno == SP_REGNUM)
1627     /* The purpose of comparing with ptr_mode is to support the
1628        global register variable associated with the stack pointer
1629        register via the syntax of asm ("wsp") in ILP32.  */
1630     return mode == Pmode || mode == ptr_mode;
1631 
1632   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1633     return mode == Pmode;
1634 
1635   if (GP_REGNUM_P (regno))
1636     {
1637       if (known_le (GET_MODE_SIZE (mode), 8))
1638 	return true;
1639       else if (known_le (GET_MODE_SIZE (mode), 16))
1640 	return (regno & 1) == 0;
1641     }
1642   else if (FP_REGNUM_P (regno))
1643     {
1644       if (vec_flags & VEC_STRUCT)
1645 	return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1646       else
1647 	return !VECTOR_MODE_P (mode) || vec_flags != 0;
1648     }
1649 
1650   return false;
1651 }
1652 
1653 /* Return true if this is a definition of a vectorized simd function.  */
1654 
1655 static bool
1656 aarch64_simd_decl_p (tree fndecl)
1657 {
1658   tree fntype;
1659 
1660   if (fndecl == NULL)
1661     return false;
1662   fntype = TREE_TYPE (fndecl);
1663   if (fntype == NULL)
1664     return false;
1665 
1666   /* Functions with the aarch64_vector_pcs attribute use the simd ABI.  */
1667   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL)
1668     return true;
1669 
1670   return false;
1671 }
1672 
1673 /* Return the mode a register save/restore should use.  DImode for integer
1674    registers, DFmode for FP registers in non-SIMD functions (they only save
1675    the bottom half of a 128 bit register), or TFmode for FP registers in
1676    SIMD functions.  */
1677 
1678 static machine_mode
1679 aarch64_reg_save_mode (tree fndecl, unsigned regno)
1680 {
1681   return GP_REGNUM_P (regno)
1682 	   ? E_DImode
1683 	   : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode);
1684 }
1685 
1686 /* Return true if the instruction is a call to a SIMD function, false
1687    if it is not a SIMD function or if we do not know anything about
1688    the function.  */
1689 
1690 static bool
1691 aarch64_simd_call_p (rtx_insn *insn)
1692 {
1693   rtx symbol;
1694   rtx call;
1695   tree fndecl;
1696 
1697   gcc_assert (CALL_P (insn));
1698   call = get_call_rtx_from (insn);
1699   symbol = XEXP (XEXP (call, 0), 0);
1700   if (GET_CODE (symbol) != SYMBOL_REF)
1701     return false;
1702   fndecl = SYMBOL_REF_DECL (symbol);
1703   if (!fndecl)
1704     return false;
1705 
1706   return aarch64_simd_decl_p (fndecl);
1707 }
1708 
1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS.  If INSN calls
1710    a function that uses the SIMD ABI, take advantage of the extra
1711    call-preserved registers that the ABI provides.  */
1712 
1713 void
1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn,
1715 					  HARD_REG_SET *return_set)
1716 {
1717   if (aarch64_simd_call_p (insn))
1718     {
1719       for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1720 	if (FP_SIMD_SAVED_REGNUM_P (regno))
1721 	  CLEAR_HARD_REG_BIT (*return_set, regno);
1722     }
1723 }
1724 
1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED.  The callee only saves
1726    the lower 64 bits of a 128-bit register.  Tell the compiler the callee
1727    clobbers the top 64 bits when restoring the bottom 64 bits.  */
1728 
1729 static bool
1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno,
1731 					machine_mode mode)
1732 {
1733   bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn);
1734   return FP_REGNUM_P (regno)
1735 	 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8);
1736 }
1737 
1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS.  */
1739 
1740 rtx_insn *
1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2)
1742 {
1743   gcc_assert (CALL_P (call_1) && CALL_P (call_2));
1744 
1745   if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2))
1746     return call_1;
1747   else
1748     return call_2;
1749 }
1750 
1751 /* Implement REGMODE_NATURAL_SIZE.  */
1752 poly_uint64
1753 aarch64_regmode_natural_size (machine_mode mode)
1754 {
1755   /* The natural size for SVE data modes is one SVE data vector,
1756      and similarly for predicates.  We can't independently modify
1757      anything smaller than that.  */
1758   /* ??? For now, only do this for variable-width SVE registers.
1759      Doing it for constant-sized registers breaks lower-subreg.c.  */
1760   /* ??? And once that's fixed, we should probably have similar
1761      code for Advanced SIMD.  */
1762   if (!aarch64_sve_vg.is_constant ())
1763     {
1764       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1765       if (vec_flags & VEC_SVE_PRED)
1766 	return BYTES_PER_SVE_PRED;
1767       if (vec_flags & VEC_SVE_DATA)
1768 	return BYTES_PER_SVE_VECTOR;
1769     }
1770   return UNITS_PER_WORD;
1771 }
1772 
1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1774 machine_mode
1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1776 				     machine_mode mode)
1777 {
1778   /* The predicate mode determines which bits are significant and
1779      which are "don't care".  Decreasing the number of lanes would
1780      lose data while increasing the number of lanes would make bits
1781      unnecessarily significant.  */
1782   if (PR_REGNUM_P (regno))
1783     return mode;
1784   if (known_ge (GET_MODE_SIZE (mode), 4))
1785     return mode;
1786   else
1787     return SImode;
1788 }
1789 
1790 /* Return true if I's bits are consecutive ones from the MSB.  */
1791 bool
1792 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1793 {
1794   return exact_log2 (-i) != HOST_WIDE_INT_M1;
1795 }
1796 
1797 /* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
1798    that strcpy from constants will be faster.  */
1799 
1800 static HOST_WIDE_INT
1801 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1802 {
1803   if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1804     return MAX (align, BITS_PER_WORD);
1805   return align;
1806 }
1807 
1808 /* Return true if calls to DECL should be treated as
1809    long-calls (ie called via a register).  */
1810 static bool
1811 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1812 {
1813   return false;
1814 }
1815 
1816 /* Return true if calls to symbol-ref SYM should be treated as
1817    long-calls (ie called via a register).  */
1818 bool
1819 aarch64_is_long_call_p (rtx sym)
1820 {
1821   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1822 }
1823 
1824 /* Return true if calls to symbol-ref SYM should not go through
1825    plt stubs.  */
1826 
1827 bool
1828 aarch64_is_noplt_call_p (rtx sym)
1829 {
1830   const_tree decl = SYMBOL_REF_DECL (sym);
1831 
1832   if (flag_pic
1833       && decl
1834       && (!flag_plt
1835 	  || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1836       && !targetm.binds_local_p (decl))
1837     return true;
1838 
1839   return false;
1840 }
1841 
1842 /* Return true if the offsets to a zero/sign-extract operation
1843    represent an expression that matches an extend operation.  The
1844    operands represent the paramters from
1845 
1846    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1847 bool
1848 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1849 				rtx extract_imm)
1850 {
1851   HOST_WIDE_INT mult_val, extract_val;
1852 
1853   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1854     return false;
1855 
1856   mult_val = INTVAL (mult_imm);
1857   extract_val = INTVAL (extract_imm);
1858 
1859   if (extract_val > 8
1860       && extract_val < GET_MODE_BITSIZE (mode)
1861       && exact_log2 (extract_val & ~7) > 0
1862       && (extract_val & 7) <= 4
1863       && mult_val == (1 << (extract_val & 7)))
1864     return true;
1865 
1866   return false;
1867 }
1868 
1869 /* Emit an insn that's a simple single-set.  Both the operands must be
1870    known to be valid.  */
1871 inline static rtx_insn *
1872 emit_set_insn (rtx x, rtx y)
1873 {
1874   return emit_insn (gen_rtx_SET (x, y));
1875 }
1876 
1877 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1878    return the rtx for register 0 in the proper mode.  */
1879 rtx
1880 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1881 {
1882   machine_mode mode = SELECT_CC_MODE (code, x, y);
1883   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1884 
1885   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1886   return cc_reg;
1887 }
1888 
1889 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode.  */
1890 
1891 static rtx
1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
1893                                   machine_mode y_mode)
1894 {
1895   if (y_mode == E_QImode || y_mode == E_HImode)
1896     {
1897       if (CONST_INT_P (y))
1898 	y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
1899       else
1900 	{
1901 	  rtx t, cc_reg;
1902 	  machine_mode cc_mode;
1903 
1904 	  t = gen_rtx_ZERO_EXTEND (SImode, y);
1905 	  t = gen_rtx_COMPARE (CC_SWPmode, t, x);
1906 	  cc_mode = CC_SWPmode;
1907 	  cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
1908 	  emit_set_insn (cc_reg, t);
1909 	  return cc_reg;
1910 	}
1911     }
1912 
1913   if (!aarch64_plus_operand (y, y_mode))
1914     y = force_reg (y_mode, y);
1915 
1916   return aarch64_gen_compare_reg (code, x, y);
1917 }
1918 
1919 /* Build the SYMBOL_REF for __tls_get_addr.  */
1920 
1921 static GTY(()) rtx tls_get_addr_libfunc;
1922 
1923 rtx
1924 aarch64_tls_get_addr (void)
1925 {
1926   if (!tls_get_addr_libfunc)
1927     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1928   return tls_get_addr_libfunc;
1929 }
1930 
1931 /* Return the TLS model to use for ADDR.  */
1932 
1933 static enum tls_model
1934 tls_symbolic_operand_type (rtx addr)
1935 {
1936   enum tls_model tls_kind = TLS_MODEL_NONE;
1937   if (GET_CODE (addr) == CONST)
1938     {
1939       poly_int64 addend;
1940       rtx sym = strip_offset (addr, &addend);
1941       if (GET_CODE (sym) == SYMBOL_REF)
1942 	tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1943     }
1944   else if (GET_CODE (addr) == SYMBOL_REF)
1945     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1946 
1947   return tls_kind;
1948 }
1949 
1950 /* We'll allow lo_sum's in addresses in our legitimate addresses
1951    so that combine would take care of combining addresses where
1952    necessary, but for generation purposes, we'll generate the address
1953    as :
1954    RTL                               Absolute
1955    tmp = hi (symbol_ref);            adrp  x1, foo
1956    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1957                                      nop
1958 
1959    PIC                               TLS
1960    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1961    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1962                                      bl   __tls_get_addr
1963                                      nop
1964 
1965    Load TLS symbol, depending on TLS mechanism and TLS access model.
1966 
1967    Global Dynamic - Traditional TLS:
1968    adrp tmp, :tlsgd:imm
1969    add  dest, tmp, #:tlsgd_lo12:imm
1970    bl   __tls_get_addr
1971 
1972    Global Dynamic - TLS Descriptors:
1973    adrp dest, :tlsdesc:imm
1974    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1975    add  dest, dest, #:tlsdesc_lo12:imm
1976    blr  tmp
1977    mrs  tp, tpidr_el0
1978    add  dest, dest, tp
1979 
1980    Initial Exec:
1981    mrs  tp, tpidr_el0
1982    adrp tmp, :gottprel:imm
1983    ldr  dest, [tmp, #:gottprel_lo12:imm]
1984    add  dest, dest, tp
1985 
1986    Local Exec:
1987    mrs  tp, tpidr_el0
1988    add  t0, tp, #:tprel_hi12:imm, lsl #12
1989    add  t0, t0, #:tprel_lo12_nc:imm
1990 */
1991 
1992 static void
1993 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1994 				   enum aarch64_symbol_type type)
1995 {
1996   switch (type)
1997     {
1998     case SYMBOL_SMALL_ABSOLUTE:
1999       {
2000 	/* In ILP32, the mode of dest can be either SImode or DImode.  */
2001 	rtx tmp_reg = dest;
2002 	machine_mode mode = GET_MODE (dest);
2003 
2004 	gcc_assert (mode == Pmode || mode == ptr_mode);
2005 
2006 	if (can_create_pseudo_p ())
2007 	  tmp_reg = gen_reg_rtx (mode);
2008 
2009 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2010 	emit_insn (gen_add_losym (dest, tmp_reg, imm));
2011 	return;
2012       }
2013 
2014     case SYMBOL_TINY_ABSOLUTE:
2015       emit_insn (gen_rtx_SET (dest, imm));
2016       return;
2017 
2018     case SYMBOL_SMALL_GOT_28K:
2019       {
2020 	machine_mode mode = GET_MODE (dest);
2021 	rtx gp_rtx = pic_offset_table_rtx;
2022 	rtx insn;
2023 	rtx mem;
2024 
2025 	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2026 	   here before rtl expand.  Tree IVOPT will generate rtl pattern to
2027 	   decide rtx costs, in which case pic_offset_table_rtx is not
2028 	   initialized.  For that case no need to generate the first adrp
2029 	   instruction as the final cost for global variable access is
2030 	   one instruction.  */
2031 	if (gp_rtx != NULL)
2032 	  {
2033 	    /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
2034 	       using the page base as GOT base, the first page may be wasted,
2035 	       in the worst scenario, there is only 28K space for GOT).
2036 
2037 	       The generate instruction sequence for accessing global variable
2038 	       is:
2039 
2040 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2041 
2042 	       Only one instruction needed. But we must initialize
2043 	       pic_offset_table_rtx properly.  We generate initialize insn for
2044 	       every global access, and allow CSE to remove all redundant.
2045 
2046 	       The final instruction sequences will look like the following
2047 	       for multiply global variables access.
2048 
2049 		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2050 
2051 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2052 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2053 		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2054 		 ...  */
2055 
2056 	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2057 	    crtl->uses_pic_offset_table = 1;
2058 	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2059 
2060 	    if (mode != GET_MODE (gp_rtx))
2061              gp_rtx = gen_lowpart (mode, gp_rtx);
2062 
2063 	  }
2064 
2065 	if (mode == ptr_mode)
2066 	  {
2067 	    if (mode == DImode)
2068 	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2069 	    else
2070 	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2071 
2072 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
2073 	  }
2074 	else
2075 	  {
2076 	    gcc_assert (mode == Pmode);
2077 
2078 	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2079 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2080 	  }
2081 
2082 	/* The operand is expected to be MEM.  Whenever the related insn
2083 	   pattern changed, above code which calculate mem should be
2084 	   updated.  */
2085 	gcc_assert (GET_CODE (mem) == MEM);
2086 	MEM_READONLY_P (mem) = 1;
2087 	MEM_NOTRAP_P (mem) = 1;
2088 	emit_insn (insn);
2089 	return;
2090       }
2091 
2092     case SYMBOL_SMALL_GOT_4G:
2093       {
2094 	/* In ILP32, the mode of dest can be either SImode or DImode,
2095 	   while the got entry is always of SImode size.  The mode of
2096 	   dest depends on how dest is used: if dest is assigned to a
2097 	   pointer (e.g. in the memory), it has SImode; it may have
2098 	   DImode if dest is dereferenced to access the memeory.
2099 	   This is why we have to handle three different ldr_got_small
2100 	   patterns here (two patterns for ILP32).  */
2101 
2102 	rtx insn;
2103 	rtx mem;
2104 	rtx tmp_reg = dest;
2105 	machine_mode mode = GET_MODE (dest);
2106 
2107 	if (can_create_pseudo_p ())
2108 	  tmp_reg = gen_reg_rtx (mode);
2109 
2110 	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2111 	if (mode == ptr_mode)
2112 	  {
2113 	    if (mode == DImode)
2114 	      insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
2115 	    else
2116 	      insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
2117 
2118 	    mem = XVECEXP (SET_SRC (insn), 0, 0);
2119 	  }
2120 	else
2121 	  {
2122 	    gcc_assert (mode == Pmode);
2123 
2124 	    insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
2125 	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2126 	  }
2127 
2128 	gcc_assert (GET_CODE (mem) == MEM);
2129 	MEM_READONLY_P (mem) = 1;
2130 	MEM_NOTRAP_P (mem) = 1;
2131 	emit_insn (insn);
2132 	return;
2133       }
2134 
2135     case SYMBOL_SMALL_TLSGD:
2136       {
2137 	rtx_insn *insns;
2138 	machine_mode mode = GET_MODE (dest);
2139 	rtx result = gen_rtx_REG (mode, R0_REGNUM);
2140 
2141 	start_sequence ();
2142 	if (TARGET_ILP32)
2143 	  aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2144 	else
2145 	  aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2146 	insns = get_insns ();
2147 	end_sequence ();
2148 
2149 	RTL_CONST_CALL_P (insns) = 1;
2150 	emit_libcall_block (insns, dest, result, imm);
2151 	return;
2152       }
2153 
2154     case SYMBOL_SMALL_TLSDESC:
2155       {
2156 	machine_mode mode = GET_MODE (dest);
2157 	rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2158 	rtx tp;
2159 
2160 	gcc_assert (mode == Pmode || mode == ptr_mode);
2161 
2162 	/* In ILP32, the got entry is always of SImode size.  Unlike
2163 	   small GOT, the dest is fixed at reg 0.  */
2164 	if (TARGET_ILP32)
2165 	  emit_insn (gen_tlsdesc_small_si (imm));
2166 	else
2167 	  emit_insn (gen_tlsdesc_small_di (imm));
2168 	tp = aarch64_load_tp (NULL);
2169 
2170 	if (mode != Pmode)
2171 	  tp = gen_lowpart (mode, tp);
2172 
2173 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2174 	if (REG_P (dest))
2175 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2176 	return;
2177       }
2178 
2179     case SYMBOL_SMALL_TLSIE:
2180       {
2181 	/* In ILP32, the mode of dest can be either SImode or DImode,
2182 	   while the got entry is always of SImode size.  The mode of
2183 	   dest depends on how dest is used: if dest is assigned to a
2184 	   pointer (e.g. in the memory), it has SImode; it may have
2185 	   DImode if dest is dereferenced to access the memeory.
2186 	   This is why we have to handle three different tlsie_small
2187 	   patterns here (two patterns for ILP32).  */
2188 	machine_mode mode = GET_MODE (dest);
2189 	rtx tmp_reg = gen_reg_rtx (mode);
2190 	rtx tp = aarch64_load_tp (NULL);
2191 
2192 	if (mode == ptr_mode)
2193 	  {
2194 	    if (mode == DImode)
2195 	      emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2196 	    else
2197 	      {
2198 		emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2199 		tp = gen_lowpart (mode, tp);
2200 	      }
2201 	  }
2202 	else
2203 	  {
2204 	    gcc_assert (mode == Pmode);
2205 	    emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2206 	  }
2207 
2208 	emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2209 	if (REG_P (dest))
2210 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2211 	return;
2212       }
2213 
2214     case SYMBOL_TLSLE12:
2215     case SYMBOL_TLSLE24:
2216     case SYMBOL_TLSLE32:
2217     case SYMBOL_TLSLE48:
2218       {
2219 	machine_mode mode = GET_MODE (dest);
2220 	rtx tp = aarch64_load_tp (NULL);
2221 
2222 	if (mode != Pmode)
2223 	  tp = gen_lowpart (mode, tp);
2224 
2225 	switch (type)
2226 	  {
2227 	  case SYMBOL_TLSLE12:
2228 	    emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
2229 			(dest, tp, imm));
2230 	    break;
2231 	  case SYMBOL_TLSLE24:
2232 	    emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
2233 			(dest, tp, imm));
2234 	  break;
2235 	  case SYMBOL_TLSLE32:
2236 	    emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
2237 			(dest, imm));
2238 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2239 			(dest, dest, tp));
2240 	  break;
2241 	  case SYMBOL_TLSLE48:
2242 	    emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
2243 			(dest, imm));
2244 	    emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
2245 			(dest, dest, tp));
2246 	    break;
2247 	  default:
2248 	    gcc_unreachable ();
2249 	  }
2250 
2251 	if (REG_P (dest))
2252 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2253 	return;
2254       }
2255 
2256     case SYMBOL_TINY_GOT:
2257       emit_insn (gen_ldr_got_tiny (dest, imm));
2258       return;
2259 
2260     case SYMBOL_TINY_TLSIE:
2261       {
2262 	machine_mode mode = GET_MODE (dest);
2263 	rtx tp = aarch64_load_tp (NULL);
2264 
2265 	if (mode == ptr_mode)
2266 	  {
2267 	    if (mode == DImode)
2268 	      emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
2269 	    else
2270 	      {
2271 		tp = gen_lowpart (mode, tp);
2272 		emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
2273 	      }
2274 	  }
2275 	else
2276 	  {
2277 	    gcc_assert (mode == Pmode);
2278 	    emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
2279 	  }
2280 
2281 	if (REG_P (dest))
2282 	  set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2283 	return;
2284       }
2285 
2286     default:
2287       gcc_unreachable ();
2288     }
2289 }
2290 
2291 /* Emit a move from SRC to DEST.  Assume that the move expanders can
2292    handle all moves if !can_create_pseudo_p ().  The distinction is
2293    important because, unlike emit_move_insn, the move expanders know
2294    how to force Pmode objects into the constant pool even when the
2295    constant pool address is not itself legitimate.  */
2296 static rtx
2297 aarch64_emit_move (rtx dest, rtx src)
2298 {
2299   return (can_create_pseudo_p ()
2300 	  ? emit_move_insn (dest, src)
2301 	  : emit_move_insn_1 (dest, src));
2302 }
2303 
2304 /* Apply UNOPTAB to OP and store the result in DEST.  */
2305 
2306 static void
2307 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2308 {
2309   rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2310   if (dest != tmp)
2311     emit_move_insn (dest, tmp);
2312 }
2313 
2314 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST.  */
2315 
2316 static void
2317 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2318 {
2319   rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2320 			  OPTAB_DIRECT);
2321   if (dest != tmp)
2322     emit_move_insn (dest, tmp);
2323 }
2324 
2325 /* Split a 128-bit move operation into two 64-bit move operations,
2326    taking care to handle partial overlap of register to register
2327    copies.  Special cases are needed when moving between GP regs and
2328    FP regs.  SRC can be a register, constant or memory; DST a register
2329    or memory.  If either operand is memory it must not have any side
2330    effects.  */
2331 void
2332 aarch64_split_128bit_move (rtx dst, rtx src)
2333 {
2334   rtx dst_lo, dst_hi;
2335   rtx src_lo, src_hi;
2336 
2337   machine_mode mode = GET_MODE (dst);
2338 
2339   gcc_assert (mode == TImode || mode == TFmode);
2340   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2341   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2342 
2343   if (REG_P (dst) && REG_P (src))
2344     {
2345       int src_regno = REGNO (src);
2346       int dst_regno = REGNO (dst);
2347 
2348       /* Handle FP <-> GP regs.  */
2349       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2350 	{
2351 	  src_lo = gen_lowpart (word_mode, src);
2352 	  src_hi = gen_highpart (word_mode, src);
2353 
2354 	  emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2355 	  emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2356 	  return;
2357 	}
2358       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2359 	{
2360 	  dst_lo = gen_lowpart (word_mode, dst);
2361 	  dst_hi = gen_highpart (word_mode, dst);
2362 
2363 	  emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2364 	  emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2365 	  return;
2366 	}
2367     }
2368 
2369   dst_lo = gen_lowpart (word_mode, dst);
2370   dst_hi = gen_highpart (word_mode, dst);
2371   src_lo = gen_lowpart (word_mode, src);
2372   src_hi = gen_highpart_mode (word_mode, mode, src);
2373 
2374   /* At most one pairing may overlap.  */
2375   if (reg_overlap_mentioned_p (dst_lo, src_hi))
2376     {
2377       aarch64_emit_move (dst_hi, src_hi);
2378       aarch64_emit_move (dst_lo, src_lo);
2379     }
2380   else
2381     {
2382       aarch64_emit_move (dst_lo, src_lo);
2383       aarch64_emit_move (dst_hi, src_hi);
2384     }
2385 }
2386 
2387 bool
2388 aarch64_split_128bit_move_p (rtx dst, rtx src)
2389 {
2390   return (! REG_P (src)
2391 	  || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2392 }
2393 
2394 /* Split a complex SIMD combine.  */
2395 
2396 void
2397 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2398 {
2399   machine_mode src_mode = GET_MODE (src1);
2400   machine_mode dst_mode = GET_MODE (dst);
2401 
2402   gcc_assert (VECTOR_MODE_P (dst_mode));
2403   gcc_assert (register_operand (dst, dst_mode)
2404 	      && register_operand (src1, src_mode)
2405 	      && register_operand (src2, src_mode));
2406 
2407   emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2408   return;
2409 }
2410 
2411 /* Split a complex SIMD move.  */
2412 
2413 void
2414 aarch64_split_simd_move (rtx dst, rtx src)
2415 {
2416   machine_mode src_mode = GET_MODE (src);
2417   machine_mode dst_mode = GET_MODE (dst);
2418 
2419   gcc_assert (VECTOR_MODE_P (dst_mode));
2420 
2421   if (REG_P (dst) && REG_P (src))
2422     {
2423       gcc_assert (VECTOR_MODE_P (src_mode));
2424       emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2425     }
2426 }
2427 
2428 bool
2429 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2430 			      machine_mode ymode, rtx y)
2431 {
2432   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2433   gcc_assert (r != NULL);
2434   return rtx_equal_p (x, r);
2435 }
2436 
2437 
2438 static rtx
2439 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2440 {
2441   if (can_create_pseudo_p ())
2442     return force_reg (mode, value);
2443   else
2444     {
2445       gcc_assert (x);
2446       aarch64_emit_move (x, value);
2447       return x;
2448     }
2449 }
2450 
2451 /* Return true if we can move VALUE into a register using a single
2452    CNT[BHWD] instruction.  */
2453 
2454 static bool
2455 aarch64_sve_cnt_immediate_p (poly_int64 value)
2456 {
2457   HOST_WIDE_INT factor = value.coeffs[0];
2458   /* The coefficient must be [1, 16] * {2, 4, 8, 16}.  */
2459   return (value.coeffs[1] == factor
2460 	  && IN_RANGE (factor, 2, 16 * 16)
2461 	  && (factor & 1) == 0
2462 	  && factor <= 16 * (factor & -factor));
2463 }
2464 
2465 /* Likewise for rtx X.  */
2466 
2467 bool
2468 aarch64_sve_cnt_immediate_p (rtx x)
2469 {
2470   poly_int64 value;
2471   return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2472 }
2473 
2474 /* Return the asm string for an instruction with a CNT-like vector size
2475    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2476    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2477    first part of the operands template (the part that comes before the
2478    vector size itself).  FACTOR is the number of quadwords.
2479    NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2480    If it is zero, we can use any element size.  */
2481 
2482 static char *
2483 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2484 				  unsigned int factor,
2485 				  unsigned int nelts_per_vq)
2486 {
2487   static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2488 
2489   if (nelts_per_vq == 0)
2490     /* There is some overlap in the ranges of the four CNT instructions.
2491        Here we always use the smallest possible element size, so that the
2492        multiplier is 1 whereever possible.  */
2493     nelts_per_vq = factor & -factor;
2494   int shift = std::min (exact_log2 (nelts_per_vq), 4);
2495   gcc_assert (IN_RANGE (shift, 1, 4));
2496   char suffix = "dwhb"[shift - 1];
2497 
2498   factor >>= shift;
2499   unsigned int written;
2500   if (factor == 1)
2501     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2502 			prefix, suffix, operands);
2503   else
2504     written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2505 			prefix, suffix, operands, factor);
2506   gcc_assert (written < sizeof (buffer));
2507   return buffer;
2508 }
2509 
2510 /* Return the asm string for an instruction with a CNT-like vector size
2511    operand (a vector pattern followed by a multiplier in the range [1, 16]).
2512    PREFIX is the mnemonic without the size suffix and OPERANDS is the
2513    first part of the operands template (the part that comes before the
2514    vector size itself).  X is the value of the vector size operand,
2515    as a polynomial integer rtx.  */
2516 
2517 char *
2518 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2519 				  rtx x)
2520 {
2521   poly_int64 value = rtx_to_poly_int64 (x);
2522   gcc_assert (aarch64_sve_cnt_immediate_p (value));
2523   return aarch64_output_sve_cnt_immediate (prefix, operands,
2524 					   value.coeffs[1], 0);
2525 }
2526 
2527 /* Return true if we can add VALUE to a register using a single ADDVL
2528    or ADDPL instruction.  */
2529 
2530 static bool
2531 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2532 {
2533   HOST_WIDE_INT factor = value.coeffs[0];
2534   if (factor == 0 || value.coeffs[1] != factor)
2535     return false;
2536   /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2537      and a value of 16 is one vector width.  */
2538   return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2539 	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2540 }
2541 
2542 /* Likewise for rtx X.  */
2543 
2544 bool
2545 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2546 {
2547   poly_int64 value;
2548   return (poly_int_rtx_p (x, &value)
2549 	  && aarch64_sve_addvl_addpl_immediate_p (value));
2550 }
2551 
2552 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2553    and storing the result in operand 0.  */
2554 
2555 char *
2556 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2557 {
2558   static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2559   poly_int64 offset_value = rtx_to_poly_int64 (offset);
2560   gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2561 
2562   /* Use INC or DEC if possible.  */
2563   if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2564     {
2565       if (aarch64_sve_cnt_immediate_p (offset_value))
2566 	return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2567 						 offset_value.coeffs[1], 0);
2568       if (aarch64_sve_cnt_immediate_p (-offset_value))
2569 	return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2570 						 -offset_value.coeffs[1], 0);
2571     }
2572 
2573   int factor = offset_value.coeffs[1];
2574   if ((factor & 15) == 0)
2575     snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2576   else
2577     snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2578   return buffer;
2579 }
2580 
2581 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2582    instruction.  If it is, store the number of elements in each vector
2583    quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2584    factor in *FACTOR_OUT (if nonnull).  */
2585 
2586 bool
2587 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2588 				 unsigned int *nelts_per_vq_out)
2589 {
2590   rtx elt;
2591   poly_int64 value;
2592 
2593   if (!const_vec_duplicate_p (x, &elt)
2594       || !poly_int_rtx_p (elt, &value))
2595     return false;
2596 
2597   unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2598   if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2599     /* There's no vector INCB.  */
2600     return false;
2601 
2602   HOST_WIDE_INT factor = value.coeffs[0];
2603   if (value.coeffs[1] != factor)
2604     return false;
2605 
2606   /* The coefficient must be [1, 16] * NELTS_PER_VQ.  */
2607   if ((factor % nelts_per_vq) != 0
2608       || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2609     return false;
2610 
2611   if (factor_out)
2612     *factor_out = factor;
2613   if (nelts_per_vq_out)
2614     *nelts_per_vq_out = nelts_per_vq;
2615   return true;
2616 }
2617 
2618 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2619    instruction.  */
2620 
2621 bool
2622 aarch64_sve_inc_dec_immediate_p (rtx x)
2623 {
2624   return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2625 }
2626 
2627 /* Return the asm template for an SVE vector INC or DEC instruction.
2628    OPERANDS gives the operands before the vector count and X is the
2629    value of the vector count operand itself.  */
2630 
2631 char *
2632 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2633 {
2634   int factor;
2635   unsigned int nelts_per_vq;
2636   if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2637     gcc_unreachable ();
2638   if (factor < 0)
2639     return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2640 					     nelts_per_vq);
2641   else
2642     return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2643 					     nelts_per_vq);
2644 }
2645 
2646 static int
2647 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2648 				scalar_int_mode mode)
2649 {
2650   int i;
2651   unsigned HOST_WIDE_INT val, val2, mask;
2652   int one_match, zero_match;
2653   int num_insns;
2654 
2655   val = INTVAL (imm);
2656 
2657   if (aarch64_move_imm (val, mode))
2658     {
2659       if (generate)
2660 	emit_insn (gen_rtx_SET (dest, imm));
2661       return 1;
2662     }
2663 
2664   /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2665      (with XXXX non-zero). In that case check to see if the move can be done in
2666      a smaller mode.  */
2667   val2 = val & 0xffffffff;
2668   if (mode == DImode
2669       && aarch64_move_imm (val2, SImode)
2670       && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2671     {
2672       if (generate)
2673 	emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2674 
2675       /* Check if we have to emit a second instruction by checking to see
2676          if any of the upper 32 bits of the original DI mode value is set.  */
2677       if (val == val2)
2678 	return 1;
2679 
2680       i = (val >> 48) ? 48 : 32;
2681 
2682       if (generate)
2683 	 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2684 				    GEN_INT ((val >> i) & 0xffff)));
2685 
2686       return 2;
2687     }
2688 
2689   if ((val >> 32) == 0 || mode == SImode)
2690     {
2691       if (generate)
2692 	{
2693 	  emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2694 	  if (mode == SImode)
2695 	    emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2696 				       GEN_INT ((val >> 16) & 0xffff)));
2697 	  else
2698 	    emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2699 				       GEN_INT ((val >> 16) & 0xffff)));
2700 	}
2701       return 2;
2702     }
2703 
2704   /* Remaining cases are all for DImode.  */
2705 
2706   mask = 0xffff;
2707   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2708     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2709   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2710     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2711 
2712   if (zero_match != 2 && one_match != 2)
2713     {
2714       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2715 	 For a 64-bit bitmask try whether changing 16 bits to all ones or
2716 	 zeroes creates a valid bitmask.  To check any repeated bitmask,
2717 	 try using 16 bits from the other 32-bit half of val.  */
2718 
2719       for (i = 0; i < 64; i += 16, mask <<= 16)
2720 	{
2721 	  val2 = val & ~mask;
2722 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2723 	    break;
2724 	  val2 = val | mask;
2725 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2726 	    break;
2727 	  val2 = val2 & ~mask;
2728 	  val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2729 	  if (val2 != val && aarch64_bitmask_imm (val2, mode))
2730 	    break;
2731 	}
2732       if (i != 64)
2733 	{
2734 	  if (generate)
2735 	    {
2736 	      emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2737 	      emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2738 					 GEN_INT ((val >> i) & 0xffff)));
2739 	    }
2740 	  return 2;
2741 	}
2742     }
2743 
2744   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2745      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
2746      otherwise skip zero bits.  */
2747 
2748   num_insns = 1;
2749   mask = 0xffff;
2750   val2 = one_match > zero_match ? ~val : val;
2751   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2752 
2753   if (generate)
2754     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2755 					   ? (val | ~(mask << i))
2756 					   : (val & (mask << i)))));
2757   for (i += 16; i < 64; i += 16)
2758     {
2759       if ((val2 & (mask << i)) == 0)
2760 	continue;
2761       if (generate)
2762 	emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2763 				   GEN_INT ((val >> i) & 0xffff)));
2764       num_insns ++;
2765     }
2766 
2767   return num_insns;
2768 }
2769 
2770 /* Return whether imm is a 128-bit immediate which is simple enough to
2771    expand inline.  */
2772 bool
2773 aarch64_mov128_immediate (rtx imm)
2774 {
2775   if (GET_CODE (imm) == CONST_INT)
2776     return true;
2777 
2778   gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2779 
2780   rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2781   rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2782 
2783   return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2784 	 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2785 }
2786 
2787 
2788 /* Return the number of temporary registers that aarch64_add_offset_1
2789    would need to add OFFSET to a register.  */
2790 
2791 static unsigned int
2792 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2793 {
2794   return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2795 }
2796 
2797 /* A subroutine of aarch64_add_offset.  Set DEST to SRC + OFFSET for
2798    a non-polynomial OFFSET.  MODE is the mode of the addition.
2799    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2800    be set and CFA adjustments added to the generated instructions.
2801 
2802    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2803    temporary if register allocation is already complete.  This temporary
2804    register may overlap DEST but must not overlap SRC.  If TEMP1 is known
2805    to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2806    the immediate again.
2807 
2808    Since this function may be used to adjust the stack pointer, we must
2809    ensure that it cannot cause transient stack deallocation (for example
2810    by first incrementing SP and then decrementing when adjusting by a
2811    large immediate).  */
2812 
2813 static void
2814 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2815 		      rtx src, HOST_WIDE_INT offset, rtx temp1,
2816 		      bool frame_related_p, bool emit_move_imm)
2817 {
2818   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2819   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2820 
2821   HOST_WIDE_INT moffset = abs_hwi (offset);
2822   rtx_insn *insn;
2823 
2824   if (!moffset)
2825     {
2826       if (!rtx_equal_p (dest, src))
2827 	{
2828 	  insn = emit_insn (gen_rtx_SET (dest, src));
2829 	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
2830 	}
2831       return;
2832     }
2833 
2834   /* Single instruction adjustment.  */
2835   if (aarch64_uimm12_shift (moffset))
2836     {
2837       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2838       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2839       return;
2840     }
2841 
2842   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2843      and either:
2844 
2845      a) the offset cannot be loaded by a 16-bit move or
2846      b) there is no spare register into which we can move it.  */
2847   if (moffset < 0x1000000
2848       && ((!temp1 && !can_create_pseudo_p ())
2849 	  || !aarch64_move_imm (moffset, mode)))
2850     {
2851       HOST_WIDE_INT low_off = moffset & 0xfff;
2852 
2853       low_off = offset < 0 ? -low_off : low_off;
2854       insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2855       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2856       insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2857       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2858       return;
2859     }
2860 
2861   /* Emit a move immediate if required and an addition/subtraction.  */
2862   if (emit_move_imm)
2863     {
2864       gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2865       temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2866     }
2867   insn = emit_insn (offset < 0
2868 		    ? gen_sub3_insn (dest, src, temp1)
2869 		    : gen_add3_insn (dest, src, temp1));
2870   if (frame_related_p)
2871     {
2872       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2873       rtx adj = plus_constant (mode, src, offset);
2874       add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2875     }
2876 }
2877 
2878 /* Return the number of temporary registers that aarch64_add_offset
2879    would need to move OFFSET into a register or add OFFSET to a register;
2880    ADD_P is true if we want the latter rather than the former.  */
2881 
2882 static unsigned int
2883 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2884 {
2885   /* This follows the same structure as aarch64_add_offset.  */
2886   if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2887     return 0;
2888 
2889   unsigned int count = 0;
2890   HOST_WIDE_INT factor = offset.coeffs[1];
2891   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2892   poly_int64 poly_offset (factor, factor);
2893   if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2894     /* Need one register for the ADDVL/ADDPL result.  */
2895     count += 1;
2896   else if (factor != 0)
2897     {
2898       factor = abs (factor);
2899       if (factor > 16 * (factor & -factor))
2900 	/* Need one register for the CNT result and one for the multiplication
2901 	   factor.  If necessary, the second temporary can be reused for the
2902 	   constant part of the offset.  */
2903 	return 2;
2904       /* Need one register for the CNT result (which might then
2905 	 be shifted).  */
2906       count += 1;
2907     }
2908   return count + aarch64_add_offset_1_temporaries (constant);
2909 }
2910 
2911 /* If X can be represented as a poly_int64, return the number
2912    of temporaries that are required to add it to a register.
2913    Return -1 otherwise.  */
2914 
2915 int
2916 aarch64_add_offset_temporaries (rtx x)
2917 {
2918   poly_int64 offset;
2919   if (!poly_int_rtx_p (x, &offset))
2920     return -1;
2921   return aarch64_offset_temporaries (true, offset);
2922 }
2923 
2924 /* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
2925    FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2926    be set and CFA adjustments added to the generated instructions.
2927 
2928    TEMP1, if nonnull, is a register of mode MODE that can be used as a
2929    temporary if register allocation is already complete.  This temporary
2930    register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2931    If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2932    false to avoid emitting the immediate again.
2933 
2934    TEMP2, if nonnull, is a second temporary register that doesn't
2935    overlap either DEST or REG.
2936 
2937    Since this function may be used to adjust the stack pointer, we must
2938    ensure that it cannot cause transient stack deallocation (for example
2939    by first incrementing SP and then decrementing when adjusting by a
2940    large immediate).  */
2941 
2942 static void
2943 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2944 		    poly_int64 offset, rtx temp1, rtx temp2,
2945 		    bool frame_related_p, bool emit_move_imm = true)
2946 {
2947   gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2948   gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2949   gcc_assert (temp1 == NULL_RTX
2950 	      || !frame_related_p
2951 	      || !reg_overlap_mentioned_p (temp1, dest));
2952   gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2953 
2954   /* Try using ADDVL or ADDPL to add the whole value.  */
2955   if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2956     {
2957       rtx offset_rtx = gen_int_mode (offset, mode);
2958       rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2959       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2960       return;
2961     }
2962 
2963   /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2964      SVE vector register, over and above the minimum size of 128 bits.
2965      This is equivalent to half the value returned by CNTD with a
2966      vector shape of ALL.  */
2967   HOST_WIDE_INT factor = offset.coeffs[1];
2968   HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2969 
2970   /* Try using ADDVL or ADDPL to add the VG-based part.  */
2971   poly_int64 poly_offset (factor, factor);
2972   if (src != const0_rtx
2973       && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2974     {
2975       rtx offset_rtx = gen_int_mode (poly_offset, mode);
2976       if (frame_related_p)
2977 	{
2978 	  rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2979 	  RTX_FRAME_RELATED_P (insn) = true;
2980 	  src = dest;
2981 	}
2982       else
2983 	{
2984 	  rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2985 	  src = aarch64_force_temporary (mode, temp1, addr);
2986 	  temp1 = temp2;
2987 	  temp2 = NULL_RTX;
2988 	}
2989     }
2990   /* Otherwise use a CNT-based sequence.  */
2991   else if (factor != 0)
2992     {
2993       /* Use a subtraction if we have a negative factor.  */
2994       rtx_code code = PLUS;
2995       if (factor < 0)
2996 	{
2997 	  factor = -factor;
2998 	  code = MINUS;
2999 	}
3000 
3001       /* Calculate CNTD * FACTOR / 2.  First try to fold the division
3002 	 into the multiplication.  */
3003       rtx val;
3004       int shift = 0;
3005       if (factor & 1)
3006 	/* Use a right shift by 1.  */
3007 	shift = -1;
3008       else
3009 	factor /= 2;
3010       HOST_WIDE_INT low_bit = factor & -factor;
3011       if (factor <= 16 * low_bit)
3012 	{
3013 	  if (factor > 16 * 8)
3014 	    {
3015 	      /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
3016 		 the value with the minimum multiplier and shift it into
3017 		 position.  */
3018 	      int extra_shift = exact_log2 (low_bit);
3019 	      shift += extra_shift;
3020 	      factor >>= extra_shift;
3021 	    }
3022 	  val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
3023 	}
3024       else
3025 	{
3026 	  /* Use CNTD, then multiply it by FACTOR.  */
3027 	  val = gen_int_mode (poly_int64 (2, 2), mode);
3028 	  val = aarch64_force_temporary (mode, temp1, val);
3029 
3030 	  /* Go back to using a negative multiplication factor if we have
3031 	     no register from which to subtract.  */
3032 	  if (code == MINUS && src == const0_rtx)
3033 	    {
3034 	      factor = -factor;
3035 	      code = PLUS;
3036 	    }
3037 	  rtx coeff1 = gen_int_mode (factor, mode);
3038 	  coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
3039 	  val = gen_rtx_MULT (mode, val, coeff1);
3040 	}
3041 
3042       if (shift > 0)
3043 	{
3044 	  /* Multiply by 1 << SHIFT.  */
3045 	  val = aarch64_force_temporary (mode, temp1, val);
3046 	  val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
3047 	}
3048       else if (shift == -1)
3049 	{
3050 	  /* Divide by 2.  */
3051 	  val = aarch64_force_temporary (mode, temp1, val);
3052 	  val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
3053 	}
3054 
3055       /* Calculate SRC +/- CNTD * FACTOR / 2.  */
3056       if (src != const0_rtx)
3057 	{
3058 	  val = aarch64_force_temporary (mode, temp1, val);
3059 	  val = gen_rtx_fmt_ee (code, mode, src, val);
3060 	}
3061       else if (code == MINUS)
3062 	{
3063 	  val = aarch64_force_temporary (mode, temp1, val);
3064 	  val = gen_rtx_NEG (mode, val);
3065 	}
3066 
3067       if (constant == 0 || frame_related_p)
3068 	{
3069 	  rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
3070 	  if (frame_related_p)
3071 	    {
3072 	      RTX_FRAME_RELATED_P (insn) = true;
3073 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
3074 			    gen_rtx_SET (dest, plus_constant (Pmode, src,
3075 							      poly_offset)));
3076 	    }
3077 	  src = dest;
3078 	  if (constant == 0)
3079 	    return;
3080 	}
3081       else
3082 	{
3083 	  src = aarch64_force_temporary (mode, temp1, val);
3084 	  temp1 = temp2;
3085 	  temp2 = NULL_RTX;
3086 	}
3087 
3088       emit_move_imm = true;
3089     }
3090 
3091   aarch64_add_offset_1 (mode, dest, src, constant, temp1,
3092 			frame_related_p, emit_move_imm);
3093 }
3094 
3095 /* Like aarch64_add_offset, but the offset is given as an rtx rather
3096    than a poly_int64.  */
3097 
3098 void
3099 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
3100 			  rtx offset_rtx, rtx temp1, rtx temp2)
3101 {
3102   aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
3103 		      temp1, temp2, false);
3104 }
3105 
3106 /* Add DELTA to the stack pointer, marking the instructions frame-related.
3107    TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
3108    if TEMP1 already contains abs (DELTA).  */
3109 
3110 static inline void
3111 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
3112 {
3113   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
3114 		      temp1, temp2, true, emit_move_imm);
3115 }
3116 
3117 /* Subtract DELTA from the stack pointer, marking the instructions
3118    frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
3119    if nonnull.  */
3120 
3121 static inline void
3122 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
3123 		bool emit_move_imm = true)
3124 {
3125   aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
3126 		      temp1, temp2, frame_related_p, emit_move_imm);
3127 }
3128 
3129 /* Set DEST to (vec_series BASE STEP).  */
3130 
3131 static void
3132 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
3133 {
3134   machine_mode mode = GET_MODE (dest);
3135   scalar_mode inner = GET_MODE_INNER (mode);
3136 
3137   /* Each operand can be a register or an immediate in the range [-16, 15].  */
3138   if (!aarch64_sve_index_immediate_p (base))
3139     base = force_reg (inner, base);
3140   if (!aarch64_sve_index_immediate_p (step))
3141     step = force_reg (inner, step);
3142 
3143   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
3144 }
3145 
3146 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
3147    integer of mode INT_MODE.  Return true on success.  */
3148 
3149 static bool
3150 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
3151 				      rtx src)
3152 {
3153   /* If the constant is smaller than 128 bits, we can do the move
3154      using a vector of SRC_MODEs.  */
3155   if (src_mode != TImode)
3156     {
3157       poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
3158 				     GET_MODE_SIZE (src_mode));
3159       machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
3160       emit_move_insn (gen_lowpart (dup_mode, dest),
3161 		      gen_const_vec_duplicate (dup_mode, src));
3162       return true;
3163     }
3164 
3165   /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
3166   src = force_const_mem (src_mode, src);
3167   if (!src)
3168     return false;
3169 
3170   /* Make sure that the address is legitimate.  */
3171   if (!aarch64_sve_ld1r_operand_p (src))
3172     {
3173       rtx addr = force_reg (Pmode, XEXP (src, 0));
3174       src = replace_equiv_address (src, addr);
3175     }
3176 
3177   machine_mode mode = GET_MODE (dest);
3178   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
3179   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
3180   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3181   src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
3182   emit_insn (gen_rtx_SET (dest, src));
3183   return true;
3184 }
3185 
3186 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
3187    isn't a simple duplicate or series.  */
3188 
3189 static void
3190 aarch64_expand_sve_const_vector (rtx dest, rtx src)
3191 {
3192   machine_mode mode = GET_MODE (src);
3193   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
3194   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
3195   gcc_assert (npatterns > 1);
3196 
3197   if (nelts_per_pattern == 1)
3198     {
3199       /* The constant is a repeating seqeuence of at least two elements,
3200 	 where the repeating elements occupy no more than 128 bits.
3201 	 Get an integer representation of the replicated value.  */
3202       scalar_int_mode int_mode;
3203       if (BYTES_BIG_ENDIAN)
3204 	/* For now, always use LD1RQ to load the value on big-endian
3205 	   targets, since the handling of smaller integers includes a
3206 	   subreg that is semantically an element reverse.  */
3207 	int_mode = TImode;
3208       else
3209 	{
3210 	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
3211 	  gcc_assert (int_bits <= 128);
3212 	  int_mode = int_mode_for_size (int_bits, 0).require ();
3213 	}
3214       rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
3215       if (int_value
3216 	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
3217 	return;
3218     }
3219 
3220   /* Expand each pattern individually.  */
3221   rtx_vector_builder builder;
3222   auto_vec<rtx, 16> vectors (npatterns);
3223   for (unsigned int i = 0; i < npatterns; ++i)
3224     {
3225       builder.new_vector (mode, 1, nelts_per_pattern);
3226       for (unsigned int j = 0; j < nelts_per_pattern; ++j)
3227 	builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
3228       vectors.quick_push (force_reg (mode, builder.build ()));
3229     }
3230 
3231   /* Use permutes to interleave the separate vectors.  */
3232   while (npatterns > 1)
3233     {
3234       npatterns /= 2;
3235       for (unsigned int i = 0; i < npatterns; ++i)
3236 	{
3237 	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
3238 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
3239 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
3240 	  vectors[i] = tmp;
3241 	}
3242     }
3243   gcc_assert (vectors[0] == dest);
3244 }
3245 
3246 /* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
3247    is a pattern that can be used to set DEST to a replicated scalar
3248    element.  */
3249 
3250 void
3251 aarch64_expand_mov_immediate (rtx dest, rtx imm,
3252 			      rtx (*gen_vec_duplicate) (rtx, rtx))
3253 {
3254   machine_mode mode = GET_MODE (dest);
3255 
3256   /* Check on what type of symbol it is.  */
3257   scalar_int_mode int_mode;
3258   if ((GET_CODE (imm) == SYMBOL_REF
3259        || GET_CODE (imm) == LABEL_REF
3260        || GET_CODE (imm) == CONST
3261        || GET_CODE (imm) == CONST_POLY_INT)
3262       && is_a <scalar_int_mode> (mode, &int_mode))
3263     {
3264       rtx mem;
3265       poly_int64 offset;
3266       HOST_WIDE_INT const_offset;
3267       enum aarch64_symbol_type sty;
3268 
3269       /* If we have (const (plus symbol offset)), separate out the offset
3270 	 before we start classifying the symbol.  */
3271       rtx base = strip_offset (imm, &offset);
3272 
3273       /* We must always add an offset involving VL separately, rather than
3274 	 folding it into the relocation.  */
3275       if (!offset.is_constant (&const_offset))
3276 	{
3277 	  if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
3278 	    emit_insn (gen_rtx_SET (dest, imm));
3279 	  else
3280 	    {
3281 	      /* Do arithmetic on 32-bit values if the result is smaller
3282 		 than that.  */
3283 	      if (partial_subreg_p (int_mode, SImode))
3284 		{
3285 		  /* It is invalid to do symbol calculations in modes
3286 		     narrower than SImode.  */
3287 		  gcc_assert (base == const0_rtx);
3288 		  dest = gen_lowpart (SImode, dest);
3289 		  int_mode = SImode;
3290 		}
3291 	      if (base != const0_rtx)
3292 		{
3293 		  base = aarch64_force_temporary (int_mode, dest, base);
3294 		  aarch64_add_offset (int_mode, dest, base, offset,
3295 				      NULL_RTX, NULL_RTX, false);
3296 		}
3297 	      else
3298 		aarch64_add_offset (int_mode, dest, base, offset,
3299 				    dest, NULL_RTX, false);
3300 	    }
3301 	  return;
3302 	}
3303 
3304       sty = aarch64_classify_symbol (base, const_offset);
3305       switch (sty)
3306 	{
3307 	case SYMBOL_FORCE_TO_MEM:
3308 	  if (const_offset != 0
3309 	      && targetm.cannot_force_const_mem (int_mode, imm))
3310 	    {
3311 	      gcc_assert (can_create_pseudo_p ());
3312 	      base = aarch64_force_temporary (int_mode, dest, base);
3313 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3314 				  NULL_RTX, NULL_RTX, false);
3315 	      return;
3316 	    }
3317 
3318 	  mem = force_const_mem (ptr_mode, imm);
3319 	  gcc_assert (mem);
3320 
3321 	  /* If we aren't generating PC relative literals, then
3322 	     we need to expand the literal pool access carefully.
3323 	     This is something that needs to be done in a number
3324 	     of places, so could well live as a separate function.  */
3325 	  if (!aarch64_pcrelative_literal_loads)
3326 	    {
3327 	      gcc_assert (can_create_pseudo_p ());
3328 	      base = gen_reg_rtx (ptr_mode);
3329 	      aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3330 	      if (ptr_mode != Pmode)
3331 		base = convert_memory_address (Pmode, base);
3332 	      mem = gen_rtx_MEM (ptr_mode, base);
3333 	    }
3334 
3335 	  if (int_mode != ptr_mode)
3336 	    mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3337 
3338 	  emit_insn (gen_rtx_SET (dest, mem));
3339 
3340 	  return;
3341 
3342         case SYMBOL_SMALL_TLSGD:
3343         case SYMBOL_SMALL_TLSDESC:
3344 	case SYMBOL_SMALL_TLSIE:
3345 	case SYMBOL_SMALL_GOT_28K:
3346 	case SYMBOL_SMALL_GOT_4G:
3347 	case SYMBOL_TINY_GOT:
3348 	case SYMBOL_TINY_TLSIE:
3349 	  if (const_offset != 0)
3350 	    {
3351 	      gcc_assert(can_create_pseudo_p ());
3352 	      base = aarch64_force_temporary (int_mode, dest, base);
3353 	      aarch64_add_offset (int_mode, dest, base, const_offset,
3354 				  NULL_RTX, NULL_RTX, false);
3355 	      return;
3356 	    }
3357 	  /* FALLTHRU */
3358 
3359 	case SYMBOL_SMALL_ABSOLUTE:
3360 	case SYMBOL_TINY_ABSOLUTE:
3361 	case SYMBOL_TLSLE12:
3362 	case SYMBOL_TLSLE24:
3363 	case SYMBOL_TLSLE32:
3364 	case SYMBOL_TLSLE48:
3365 	  aarch64_load_symref_appropriately (dest, imm, sty);
3366 	  return;
3367 
3368 	default:
3369 	  gcc_unreachable ();
3370 	}
3371     }
3372 
3373   if (!CONST_INT_P (imm))
3374     {
3375       rtx base, step, value;
3376       if (GET_CODE (imm) == HIGH
3377 	  || aarch64_simd_valid_immediate (imm, NULL))
3378 	emit_insn (gen_rtx_SET (dest, imm));
3379       else if (const_vec_series_p (imm, &base, &step))
3380 	aarch64_expand_vec_series (dest, base, step);
3381       else if (const_vec_duplicate_p (imm, &value))
3382 	{
3383 	  /* If the constant is out of range of an SVE vector move,
3384 	     load it from memory if we can, otherwise move it into
3385 	     a register and use a DUP.  */
3386 	  scalar_mode inner_mode = GET_MODE_INNER (mode);
3387 	  rtx op = force_const_mem (inner_mode, value);
3388 	  if (!op)
3389 	    op = force_reg (inner_mode, value);
3390 	  else if (!aarch64_sve_ld1r_operand_p (op))
3391 	    {
3392 	      rtx addr = force_reg (Pmode, XEXP (op, 0));
3393 	      op = replace_equiv_address (op, addr);
3394 	    }
3395 	  emit_insn (gen_vec_duplicate (dest, op));
3396 	}
3397       else if (GET_CODE (imm) == CONST_VECTOR
3398 	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3399 	aarch64_expand_sve_const_vector (dest, imm);
3400       else
3401 	{
3402 	  rtx mem = force_const_mem (mode, imm);
3403 	  gcc_assert (mem);
3404 	  emit_move_insn (dest, mem);
3405 	}
3406 
3407       return;
3408     }
3409 
3410   aarch64_internal_mov_immediate (dest, imm, true,
3411 				  as_a <scalar_int_mode> (mode));
3412 }
3413 
3414 /* Emit an SVE predicated move from SRC to DEST.  PRED is a predicate
3415    that is known to contain PTRUE.  */
3416 
3417 void
3418 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3419 {
3420   expand_operand ops[3];
3421   machine_mode mode = GET_MODE (dest);
3422   create_output_operand (&ops[0], dest, mode);
3423   create_input_operand (&ops[1], pred, GET_MODE(pred));
3424   create_input_operand (&ops[2], src, mode);
3425   expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
3426 }
3427 
3428 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3429    operand is in memory.  In this case we need to use the predicated LD1
3430    and ST1 instead of LDR and STR, both for correctness on big-endian
3431    targets and because LD1 and ST1 support a wider range of addressing modes.
3432    PRED_MODE is the mode of the predicate.
3433 
3434    See the comment at the head of aarch64-sve.md for details about the
3435    big-endian handling.  */
3436 
3437 void
3438 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3439 {
3440   machine_mode mode = GET_MODE (dest);
3441   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3442   if (!register_operand (src, mode)
3443       && !register_operand (dest, mode))
3444     {
3445       rtx tmp = gen_reg_rtx (mode);
3446       if (MEM_P (src))
3447 	aarch64_emit_sve_pred_move (tmp, ptrue, src);
3448       else
3449 	emit_move_insn (tmp, src);
3450       src = tmp;
3451     }
3452   aarch64_emit_sve_pred_move (dest, ptrue, src);
3453 }
3454 
3455 /* Called only on big-endian targets.  See whether an SVE vector move
3456    from SRC to DEST is effectively a REV[BHW] instruction, because at
3457    least one operand is a subreg of an SVE vector that has wider or
3458    narrower elements.  Return true and emit the instruction if so.
3459 
3460    For example:
3461 
3462      (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3463 
3464    represents a VIEW_CONVERT between the following vectors, viewed
3465    in memory order:
3466 
3467      R2: { [0].high, [0].low,  [1].high, [1].low, ... }
3468      R1: { [0],      [1],      [2],      [3],     ... }
3469 
3470    The high part of lane X in R2 should therefore correspond to lane X*2
3471    of R1, but the register representations are:
3472 
3473          msb                                      lsb
3474      R2: ...... [1].high  [1].low   [0].high  [0].low
3475      R1: ...... [3]       [2]       [1]       [0]
3476 
3477    where the low part of lane X in R2 corresponds to lane X*2 in R1.
3478    We therefore need a reverse operation to swap the high and low values
3479    around.
3480 
3481    This is purely an optimization.  Without it we would spill the
3482    subreg operand to the stack in one mode and reload it in the
3483    other mode, which has the same effect as the REV.  */
3484 
3485 bool
3486 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3487 {
3488   gcc_assert (BYTES_BIG_ENDIAN);
3489   if (GET_CODE (dest) == SUBREG)
3490     dest = SUBREG_REG (dest);
3491   if (GET_CODE (src) == SUBREG)
3492     src = SUBREG_REG (src);
3493 
3494   /* The optimization handles two single SVE REGs with different element
3495      sizes.  */
3496   if (!REG_P (dest)
3497       || !REG_P (src)
3498       || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3499       || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3500       || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3501 	  == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3502     return false;
3503 
3504   /* Generate *aarch64_sve_mov<mode>_subreg_be.  */
3505   rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3506   rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3507 			       UNSPEC_REV_SUBREG);
3508   emit_insn (gen_rtx_SET (dest, unspec));
3509   return true;
3510 }
3511 
3512 /* Return a copy of X with mode MODE, without changing its other
3513    attributes.  Unlike gen_lowpart, this doesn't care whether the
3514    mode change is valid.  */
3515 
3516 static rtx
3517 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3518 {
3519   if (GET_MODE (x) == mode)
3520     return x;
3521 
3522   x = shallow_copy_rtx (x);
3523   set_mode_and_regno (x, mode, REGNO (x));
3524   return x;
3525 }
3526 
3527 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3528    operands.  */
3529 
3530 void
3531 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3532 {
3533   /* Decide which REV operation we need.  The mode with narrower elements
3534      determines the mode of the operands and the mode with the wider
3535      elements determines the reverse width.  */
3536   machine_mode mode_with_wider_elts = GET_MODE (dest);
3537   machine_mode mode_with_narrower_elts = GET_MODE (src);
3538   if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3539       < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3540     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3541 
3542   unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3543   unsigned int unspec;
3544   if (wider_bytes == 8)
3545     unspec = UNSPEC_REV64;
3546   else if (wider_bytes == 4)
3547     unspec = UNSPEC_REV32;
3548   else if (wider_bytes == 2)
3549     unspec = UNSPEC_REV16;
3550   else
3551     gcc_unreachable ();
3552   machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3553 
3554   /* Emit:
3555 
3556        (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3557 			 UNSPEC_MERGE_PTRUE))
3558 
3559      with the appropriate modes.  */
3560   ptrue = gen_lowpart (pred_mode, ptrue);
3561   dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3562   src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3563   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3564   src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3565 			UNSPEC_MERGE_PTRUE);
3566   emit_insn (gen_rtx_SET (dest, src));
3567 }
3568 
3569 static bool
3570 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3571 				 tree exp ATTRIBUTE_UNUSED)
3572 {
3573   if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl))
3574     return false;
3575 
3576   return true;
3577 }
3578 
3579 /* Implement TARGET_PASS_BY_REFERENCE.  */
3580 
3581 static bool
3582 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3583 			   machine_mode mode,
3584 			   const_tree type,
3585 			   bool named ATTRIBUTE_UNUSED)
3586 {
3587   HOST_WIDE_INT size;
3588   machine_mode dummymode;
3589   int nregs;
3590 
3591   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
3592   if (mode == BLKmode && type)
3593     size = int_size_in_bytes (type);
3594   else
3595     /* No frontends can create types with variable-sized modes, so we
3596        shouldn't be asked to pass or return them.  */
3597     size = GET_MODE_SIZE (mode).to_constant ();
3598 
3599   /* Aggregates are passed by reference based on their size.  */
3600   if (type && AGGREGATE_TYPE_P (type))
3601     {
3602       size = int_size_in_bytes (type);
3603     }
3604 
3605   /* Variable sized arguments are always returned by reference.  */
3606   if (size < 0)
3607     return true;
3608 
3609   /* Can this be a candidate to be passed in fp/simd register(s)?  */
3610   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3611 					       &dummymode, &nregs,
3612 					       NULL))
3613     return false;
3614 
3615   /* Arguments which are variable sized or larger than 2 registers are
3616      passed by reference unless they are a homogenous floating point
3617      aggregate.  */
3618   return size > 2 * UNITS_PER_WORD;
3619 }
3620 
3621 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
3622 static bool
3623 aarch64_return_in_msb (const_tree valtype)
3624 {
3625   machine_mode dummy_mode;
3626   int dummy_int;
3627 
3628   /* Never happens in little-endian mode.  */
3629   if (!BYTES_BIG_ENDIAN)
3630     return false;
3631 
3632   /* Only composite types smaller than or equal to 16 bytes can
3633      be potentially returned in registers.  */
3634   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3635       || int_size_in_bytes (valtype) <= 0
3636       || int_size_in_bytes (valtype) > 16)
3637     return false;
3638 
3639   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3640      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3641      is always passed/returned in the least significant bits of fp/simd
3642      register(s).  */
3643   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3644 					       &dummy_mode, &dummy_int, NULL))
3645     return false;
3646 
3647   return true;
3648 }
3649 
3650 /* Implement TARGET_FUNCTION_VALUE.
3651    Define how to find the value returned by a function.  */
3652 
3653 static rtx
3654 aarch64_function_value (const_tree type, const_tree func,
3655 			bool outgoing ATTRIBUTE_UNUSED)
3656 {
3657   machine_mode mode;
3658   int unsignedp;
3659   int count;
3660   machine_mode ag_mode;
3661 
3662   mode = TYPE_MODE (type);
3663   if (INTEGRAL_TYPE_P (type))
3664     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3665 
3666   if (aarch64_return_in_msb (type))
3667     {
3668       HOST_WIDE_INT size = int_size_in_bytes (type);
3669 
3670       if (size % UNITS_PER_WORD != 0)
3671 	{
3672 	  size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3673 	  mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3674 	}
3675     }
3676 
3677   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3678 					       &ag_mode, &count, NULL))
3679     {
3680       if (!aarch64_composite_type_p (type, mode))
3681 	{
3682 	  gcc_assert (count == 1 && mode == ag_mode);
3683 	  return gen_rtx_REG (mode, V0_REGNUM);
3684 	}
3685       else
3686 	{
3687 	  int i;
3688 	  rtx par;
3689 
3690 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3691 	  for (i = 0; i < count; i++)
3692 	    {
3693 	      rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3694 	      rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3695 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3696 	      XVECEXP (par, 0, i) = tmp;
3697 	    }
3698 	  return par;
3699 	}
3700     }
3701   else
3702     return gen_rtx_REG (mode, R0_REGNUM);
3703 }
3704 
3705 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3706    Return true if REGNO is the number of a hard register in which the values
3707    of called function may come back.  */
3708 
3709 static bool
3710 aarch64_function_value_regno_p (const unsigned int regno)
3711 {
3712   /* Maximum of 16 bytes can be returned in the general registers.  Examples
3713      of 16-byte return values are: 128-bit integers and 16-byte small
3714      structures (excluding homogeneous floating-point aggregates).  */
3715   if (regno == R0_REGNUM || regno == R1_REGNUM)
3716     return true;
3717 
3718   /* Up to four fp/simd registers can return a function value, e.g. a
3719      homogeneous floating-point aggregate having four members.  */
3720   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3721     return TARGET_FLOAT;
3722 
3723   return false;
3724 }
3725 
3726 /* Implement TARGET_RETURN_IN_MEMORY.
3727 
3728    If the type T of the result of a function is such that
3729      void func (T arg)
3730    would require that arg be passed as a value in a register (or set of
3731    registers) according to the parameter passing rules, then the result
3732    is returned in the same registers as would be used for such an
3733    argument.  */
3734 
3735 static bool
3736 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3737 {
3738   HOST_WIDE_INT size;
3739   machine_mode ag_mode;
3740   int count;
3741 
3742   if (!AGGREGATE_TYPE_P (type)
3743       && TREE_CODE (type) != COMPLEX_TYPE
3744       && TREE_CODE (type) != VECTOR_TYPE)
3745     /* Simple scalar types always returned in registers.  */
3746     return false;
3747 
3748   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3749 					       type,
3750 					       &ag_mode,
3751 					       &count,
3752 					       NULL))
3753     return false;
3754 
3755   /* Types larger than 2 registers returned in memory.  */
3756   size = int_size_in_bytes (type);
3757   return (size < 0 || size > 2 * UNITS_PER_WORD);
3758 }
3759 
3760 static bool
3761 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3762 			       const_tree type, int *nregs)
3763 {
3764   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3765   return aarch64_vfp_is_call_or_return_candidate (mode,
3766 						  type,
3767 						  &pcum->aapcs_vfp_rmode,
3768 						  nregs,
3769 						  NULL);
3770 }
3771 
3772 /* Given MODE and TYPE of a function argument, return the alignment in
3773    bits.  The idea is to suppress any stronger alignment requested by
3774    the user and opt for the natural alignment (specified in AAPCS64 \S
3775    4.1).  ABI_BREAK is set to true if the alignment was incorrectly
3776    calculated in versions of GCC prior to GCC-9.  This is a helper
3777    function for local use only.  */
3778 
3779 static unsigned int
3780 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
3781 				bool *abi_break)
3782 {
3783   *abi_break = false;
3784   if (!type)
3785     return GET_MODE_ALIGNMENT (mode);
3786 
3787   if (integer_zerop (TYPE_SIZE (type)))
3788     return 0;
3789 
3790   gcc_assert (TYPE_MODE (type) == mode);
3791 
3792   if (!AGGREGATE_TYPE_P (type))
3793     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3794 
3795   if (TREE_CODE (type) == ARRAY_TYPE)
3796     return TYPE_ALIGN (TREE_TYPE (type));
3797 
3798   unsigned int alignment = 0;
3799   unsigned int bitfield_alignment = 0;
3800   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3801     if (TREE_CODE (field) == FIELD_DECL)
3802       {
3803 	alignment = std::max (alignment, DECL_ALIGN (field));
3804 	if (DECL_BIT_FIELD_TYPE (field))
3805 	  bitfield_alignment
3806 	    = std::max (bitfield_alignment,
3807 			TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
3808       }
3809 
3810   if (bitfield_alignment > alignment)
3811     {
3812       *abi_break = true;
3813       return bitfield_alignment;
3814     }
3815 
3816   return alignment;
3817 }
3818 
3819 /* Layout a function argument according to the AAPCS64 rules.  The rule
3820    numbers refer to the rule numbers in the AAPCS64.  */
3821 
3822 static void
3823 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3824 		    const_tree type,
3825 		    bool named ATTRIBUTE_UNUSED)
3826 {
3827   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3828   int ncrn, nvrn, nregs;
3829   bool allocate_ncrn, allocate_nvrn;
3830   HOST_WIDE_INT size;
3831   bool abi_break;
3832 
3833   /* We need to do this once per argument.  */
3834   if (pcum->aapcs_arg_processed)
3835     return;
3836 
3837   pcum->aapcs_arg_processed = true;
3838 
3839   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
3840   if (type)
3841     size = int_size_in_bytes (type);
3842   else
3843     /* No frontends can create types with variable-sized modes, so we
3844        shouldn't be asked to pass or return them.  */
3845     size = GET_MODE_SIZE (mode).to_constant ();
3846   size = ROUND_UP (size, UNITS_PER_WORD);
3847 
3848   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3849   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3850 						 mode,
3851 						 type,
3852 						 &nregs);
3853 
3854   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3855      The following code thus handles passing by SIMD/FP registers first.  */
3856 
3857   nvrn = pcum->aapcs_nvrn;
3858 
3859   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
3860      and homogenous short-vector aggregates (HVA).  */
3861   if (allocate_nvrn)
3862     {
3863       if (!TARGET_FLOAT)
3864 	aarch64_err_no_fpadvsimd (mode);
3865 
3866       if (nvrn + nregs <= NUM_FP_ARG_REGS)
3867 	{
3868 	  pcum->aapcs_nextnvrn = nvrn + nregs;
3869 	  if (!aarch64_composite_type_p (type, mode))
3870 	    {
3871 	      gcc_assert (nregs == 1);
3872 	      pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3873 	    }
3874 	  else
3875 	    {
3876 	      rtx par;
3877 	      int i;
3878 	      par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3879 	      for (i = 0; i < nregs; i++)
3880 		{
3881 		  rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3882 					 V0_REGNUM + nvrn + i);
3883 		  rtx offset = gen_int_mode
3884 		    (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3885 		  tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3886 		  XVECEXP (par, 0, i) = tmp;
3887 		}
3888 	      pcum->aapcs_reg = par;
3889 	    }
3890 	  return;
3891 	}
3892       else
3893 	{
3894 	  /* C.3 NSRN is set to 8.  */
3895 	  pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3896 	  goto on_stack;
3897 	}
3898     }
3899 
3900   ncrn = pcum->aapcs_ncrn;
3901   nregs = size / UNITS_PER_WORD;
3902 
3903   /* C6 - C9.  though the sign and zero extension semantics are
3904      handled elsewhere.  This is the case where the argument fits
3905      entirely general registers.  */
3906   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3907     {
3908       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3909 
3910       /* C.8 if the argument has an alignment of 16 then the NGRN is
3911 	 rounded up to the next even number.  */
3912       if (nregs == 2
3913 	  && ncrn % 2
3914 	  /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3915 	     comparison is there because for > 16 * BITS_PER_UNIT
3916 	     alignment nregs should be > 2 and therefore it should be
3917 	     passed by reference rather than value.  */
3918 	  && (aarch64_function_arg_alignment (mode, type, &abi_break)
3919 	      == 16 * BITS_PER_UNIT))
3920 	{
3921 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3922 	    inform (input_location, "parameter passing for argument of type "
3923 		    "%qT changed in GCC 9.1", type);
3924 	  ++ncrn;
3925 	  gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3926 	}
3927 
3928       /* NREGS can be 0 when e.g. an empty structure is to be passed.
3929 	 A reg is still generated for it, but the caller should be smart
3930 	 enough not to use it.  */
3931       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3932 	pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3933       else
3934 	{
3935 	  rtx par;
3936 	  int i;
3937 
3938 	  par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3939 	  for (i = 0; i < nregs; i++)
3940 	    {
3941 	      rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3942 	      tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3943 				       GEN_INT (i * UNITS_PER_WORD));
3944 	      XVECEXP (par, 0, i) = tmp;
3945 	    }
3946 	  pcum->aapcs_reg = par;
3947 	}
3948 
3949       pcum->aapcs_nextncrn = ncrn + nregs;
3950       return;
3951     }
3952 
3953   /* C.11  */
3954   pcum->aapcs_nextncrn = NUM_ARG_REGS;
3955 
3956   /* The argument is passed on stack; record the needed number of words for
3957      this argument and align the total size if necessary.  */
3958 on_stack:
3959   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3960 
3961   if (aarch64_function_arg_alignment (mode, type, &abi_break)
3962       == 16 * BITS_PER_UNIT)
3963     {
3964       int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
3965       if (pcum->aapcs_stack_size != new_size)
3966 	{
3967 	  if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
3968 	    inform (input_location, "parameter passing for argument of type "
3969 		    "%qT changed in GCC 9.1", type);
3970 	  pcum->aapcs_stack_size = new_size;
3971 	}
3972     }
3973   return;
3974 }
3975 
3976 /* Implement TARGET_FUNCTION_ARG.  */
3977 
3978 static rtx
3979 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3980 		      const_tree type, bool named)
3981 {
3982   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3983   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3984 
3985   if (mode == VOIDmode)
3986     return NULL_RTX;
3987 
3988   aarch64_layout_arg (pcum_v, mode, type, named);
3989   return pcum->aapcs_reg;
3990 }
3991 
3992 void
3993 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3994 			   const_tree fntype ATTRIBUTE_UNUSED,
3995 			   rtx libname ATTRIBUTE_UNUSED,
3996 			   const_tree fndecl ATTRIBUTE_UNUSED,
3997 			   unsigned n_named ATTRIBUTE_UNUSED)
3998 {
3999   pcum->aapcs_ncrn = 0;
4000   pcum->aapcs_nvrn = 0;
4001   pcum->aapcs_nextncrn = 0;
4002   pcum->aapcs_nextnvrn = 0;
4003   pcum->pcs_variant = ARM_PCS_AAPCS64;
4004   pcum->aapcs_reg = NULL_RTX;
4005   pcum->aapcs_arg_processed = false;
4006   pcum->aapcs_stack_words = 0;
4007   pcum->aapcs_stack_size = 0;
4008 
4009   if (!TARGET_FLOAT
4010       && fndecl && TREE_PUBLIC (fndecl)
4011       && fntype && fntype != error_mark_node)
4012     {
4013       const_tree type = TREE_TYPE (fntype);
4014       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
4015       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
4016       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
4017 						   &mode, &nregs, NULL))
4018 	aarch64_err_no_fpadvsimd (TYPE_MODE (type));
4019     }
4020   return;
4021 }
4022 
4023 static void
4024 aarch64_function_arg_advance (cumulative_args_t pcum_v,
4025 			      machine_mode mode,
4026 			      const_tree type,
4027 			      bool named)
4028 {
4029   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
4030   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
4031     {
4032       aarch64_layout_arg (pcum_v, mode, type, named);
4033       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
4034 		  != (pcum->aapcs_stack_words != 0));
4035       pcum->aapcs_arg_processed = false;
4036       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
4037       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
4038       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
4039       pcum->aapcs_stack_words = 0;
4040       pcum->aapcs_reg = NULL_RTX;
4041     }
4042 }
4043 
4044 bool
4045 aarch64_function_arg_regno_p (unsigned regno)
4046 {
4047   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
4048 	  || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
4049 }
4050 
4051 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
4052    PARM_BOUNDARY bits of alignment, but will be given anything up
4053    to STACK_BOUNDARY bits if the type requires it.  This makes sure
4054    that both before and after the layout of each argument, the Next
4055    Stacked Argument Address (NSAA) will have a minimum alignment of
4056    8 bytes.  */
4057 
4058 static unsigned int
4059 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
4060 {
4061   bool abi_break;
4062   unsigned int alignment = aarch64_function_arg_alignment (mode, type,
4063 							   &abi_break);
4064   if (abi_break & warn_psabi)
4065     inform (input_location, "parameter passing for argument of type "
4066 	    "%qT changed in GCC 9.1", type);
4067 
4068   return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
4069 }
4070 
4071 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE.  */
4072 
4073 static fixed_size_mode
4074 aarch64_get_reg_raw_mode (int regno)
4075 {
4076   if (TARGET_SVE && FP_REGNUM_P (regno))
4077     /* Don't use the SVE part of the register for __builtin_apply and
4078        __builtin_return.  The SVE registers aren't used by the normal PCS,
4079        so using them there would be a waste of time.  The PCS extensions
4080        for SVE types are fundamentally incompatible with the
4081        __builtin_return/__builtin_apply interface.  */
4082     return as_a <fixed_size_mode> (V16QImode);
4083   return default_get_reg_raw_mode (regno);
4084 }
4085 
4086 /* Implement TARGET_FUNCTION_ARG_PADDING.
4087 
4088    Small aggregate types are placed in the lowest memory address.
4089 
4090    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
4091 
4092 static pad_direction
4093 aarch64_function_arg_padding (machine_mode mode, const_tree type)
4094 {
4095   /* On little-endian targets, the least significant byte of every stack
4096      argument is passed at the lowest byte address of the stack slot.  */
4097   if (!BYTES_BIG_ENDIAN)
4098     return PAD_UPWARD;
4099 
4100   /* Otherwise, integral, floating-point and pointer types are padded downward:
4101      the least significant byte of a stack argument is passed at the highest
4102      byte address of the stack slot.  */
4103   if (type
4104       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
4105 	 || POINTER_TYPE_P (type))
4106       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
4107     return PAD_DOWNWARD;
4108 
4109   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
4110   return PAD_UPWARD;
4111 }
4112 
4113 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
4114 
4115    It specifies padding for the last (may also be the only)
4116    element of a block move between registers and memory.  If
4117    assuming the block is in the memory, padding upward means that
4118    the last element is padded after its highest significant byte,
4119    while in downward padding, the last element is padded at the
4120    its least significant byte side.
4121 
4122    Small aggregates and small complex types are always padded
4123    upwards.
4124 
4125    We don't need to worry about homogeneous floating-point or
4126    short-vector aggregates; their move is not affected by the
4127    padding direction determined here.  Regardless of endianness,
4128    each element of such an aggregate is put in the least
4129    significant bits of a fp/simd register.
4130 
4131    Return !BYTES_BIG_ENDIAN if the least significant byte of the
4132    register has useful data, and return the opposite if the most
4133    significant byte does.  */
4134 
4135 bool
4136 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
4137 		     bool first ATTRIBUTE_UNUSED)
4138 {
4139 
4140   /* Small composite types are always padded upward.  */
4141   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
4142     {
4143       HOST_WIDE_INT size;
4144       if (type)
4145 	size = int_size_in_bytes (type);
4146       else
4147 	/* No frontends can create types with variable-sized modes, so we
4148 	   shouldn't be asked to pass or return them.  */
4149 	size = GET_MODE_SIZE (mode).to_constant ();
4150       if (size < 2 * UNITS_PER_WORD)
4151 	return true;
4152     }
4153 
4154   /* Otherwise, use the default padding.  */
4155   return !BYTES_BIG_ENDIAN;
4156 }
4157 
4158 static scalar_int_mode
4159 aarch64_libgcc_cmp_return_mode (void)
4160 {
4161   return SImode;
4162 }
4163 
4164 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
4165 
4166 /* We use the 12-bit shifted immediate arithmetic instructions so values
4167    must be multiple of (1 << 12), i.e. 4096.  */
4168 #define ARITH_FACTOR 4096
4169 
4170 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
4171 #error Cannot use simple address calculation for stack probing
4172 #endif
4173 
4174 /* The pair of scratch registers used for stack probing.  */
4175 #define PROBE_STACK_FIRST_REG  R9_REGNUM
4176 #define PROBE_STACK_SECOND_REG R10_REGNUM
4177 
4178 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
4179    inclusive.  These are offsets from the current stack pointer.  */
4180 
4181 static void
4182 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
4183 {
4184   HOST_WIDE_INT size;
4185   if (!poly_size.is_constant (&size))
4186     {
4187       sorry ("stack probes for SVE frames");
4188       return;
4189     }
4190 
4191   rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
4192 
4193   /* See the same assertion on PROBE_INTERVAL above.  */
4194   gcc_assert ((first % ARITH_FACTOR) == 0);
4195 
4196   /* See if we have a constant small number of probes to generate.  If so,
4197      that's the easy case.  */
4198   if (size <= PROBE_INTERVAL)
4199     {
4200       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
4201 
4202       emit_set_insn (reg1,
4203 		     plus_constant (Pmode,
4204 				    stack_pointer_rtx, -(first + base)));
4205       emit_stack_probe (plus_constant (Pmode, reg1, base - size));
4206     }
4207 
4208   /* The run-time loop is made up of 8 insns in the generic case while the
4209      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
4210   else if (size <= 4 * PROBE_INTERVAL)
4211     {
4212       HOST_WIDE_INT i, rem;
4213 
4214       emit_set_insn (reg1,
4215 		     plus_constant (Pmode,
4216 				    stack_pointer_rtx,
4217 				    -(first + PROBE_INTERVAL)));
4218       emit_stack_probe (reg1);
4219 
4220       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
4221 	 it exceeds SIZE.  If only two probes are needed, this will not
4222 	 generate any code.  Then probe at FIRST + SIZE.  */
4223       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
4224 	{
4225 	  emit_set_insn (reg1,
4226 			 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
4227 	  emit_stack_probe (reg1);
4228 	}
4229 
4230       rem = size - (i - PROBE_INTERVAL);
4231       if (rem > 256)
4232 	{
4233 	  const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4234 
4235 	  emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
4236 	  emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
4237 	}
4238       else
4239 	emit_stack_probe (plus_constant (Pmode, reg1, -rem));
4240     }
4241 
4242   /* Otherwise, do the same as above, but in a loop.  Note that we must be
4243      extra careful with variables wrapping around because we might be at
4244      the very top (or the very bottom) of the address space and we have
4245      to be able to handle this case properly; in particular, we use an
4246      equality test for the loop condition.  */
4247   else
4248     {
4249       rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
4250 
4251       /* Step 1: round SIZE to the previous multiple of the interval.  */
4252 
4253       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
4254 
4255 
4256       /* Step 2: compute initial and final value of the loop counter.  */
4257 
4258       /* TEST_ADDR = SP + FIRST.  */
4259       emit_set_insn (reg1,
4260 		     plus_constant (Pmode, stack_pointer_rtx, -first));
4261 
4262       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
4263       HOST_WIDE_INT adjustment = - (first + rounded_size);
4264       if (! aarch64_uimm12_shift (adjustment))
4265 	{
4266 	  aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
4267 					  true, Pmode);
4268 	  emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
4269 	}
4270       else
4271 	emit_set_insn (reg2,
4272 		       plus_constant (Pmode, stack_pointer_rtx, adjustment));
4273 
4274       /* Step 3: the loop
4275 
4276 	 do
4277 	   {
4278 	     TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
4279 	     probe at TEST_ADDR
4280 	   }
4281 	 while (TEST_ADDR != LAST_ADDR)
4282 
4283 	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
4284 	 until it is equal to ROUNDED_SIZE.  */
4285 
4286       emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
4287 
4288 
4289       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
4290 	 that SIZE is equal to ROUNDED_SIZE.  */
4291 
4292       if (size != rounded_size)
4293 	{
4294 	  HOST_WIDE_INT rem = size - rounded_size;
4295 
4296 	  if (rem > 256)
4297 	    {
4298 	      const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
4299 
4300 	      emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
4301 	      emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
4302 	    }
4303 	  else
4304 	    emit_stack_probe (plus_constant (Pmode, reg2, -rem));
4305 	}
4306     }
4307 
4308   /* Make sure nothing is scheduled before we are done.  */
4309   emit_insn (gen_blockage ());
4310 }
4311 
4312 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
4313    absolute addresses.  */
4314 
4315 const char *
4316 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
4317 {
4318   static int labelno = 0;
4319   char loop_lab[32];
4320   rtx xops[2];
4321 
4322   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
4323 
4324   /* Loop.  */
4325   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
4326 
4327   HOST_WIDE_INT stack_clash_probe_interval
4328     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
4329 
4330   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
4331   xops[0] = reg1;
4332   HOST_WIDE_INT interval;
4333   if (flag_stack_clash_protection)
4334     interval = stack_clash_probe_interval;
4335   else
4336     interval = PROBE_INTERVAL;
4337 
4338   gcc_assert (aarch64_uimm12_shift (interval));
4339   xops[1] = GEN_INT (interval);
4340 
4341   output_asm_insn ("sub\t%0, %0, %1", xops);
4342 
4343   /* If doing stack clash protection then we probe up by the ABI specified
4344      amount.  We do this because we're dropping full pages at a time in the
4345      loop.  But if we're doing non-stack clash probing, probe at SP 0.  */
4346   if (flag_stack_clash_protection)
4347     xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
4348   else
4349     xops[1] = CONST0_RTX (GET_MODE (xops[1]));
4350 
4351   /* Probe at TEST_ADDR.  If we're inside the loop it is always safe to probe
4352      by this amount for each iteration.  */
4353   output_asm_insn ("str\txzr, [%0, %1]", xops);
4354 
4355   /* Test if TEST_ADDR == LAST_ADDR.  */
4356   xops[1] = reg2;
4357   output_asm_insn ("cmp\t%0, %1", xops);
4358 
4359   /* Branch.  */
4360   fputs ("\tb.ne\t", asm_out_file);
4361   assemble_name_raw (asm_out_file, loop_lab);
4362   fputc ('\n', asm_out_file);
4363 
4364   return "";
4365 }
4366 
4367 /* Emit the probe loop for doing stack clash probes and stack adjustments for
4368    SVE.  This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
4369    of GUARD_SIZE.  When a probe is emitted it is done at most
4370    MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
4371    at most MIN_PROBE_THRESHOLD.  By the end of this function
4372    BASE = BASE - ADJUSTMENT.  */
4373 
4374 const char *
4375 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
4376 				      rtx min_probe_threshold, rtx guard_size)
4377 {
4378   /* This function is not allowed to use any instruction generation function
4379      like gen_ and friends.  If you do you'll likely ICE during CFG validation,
4380      so instead emit the code you want using output_asm_insn.  */
4381   gcc_assert (flag_stack_clash_protection);
4382   gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
4383   gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
4384 
4385   /* The minimum required allocation before the residual requires probing.  */
4386   HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
4387 
4388   /* Clamp the value down to the nearest value that can be used with a cmp.  */
4389   residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
4390   rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
4391 
4392   gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
4393   gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
4394 
4395   static int labelno = 0;
4396   char loop_start_lab[32];
4397   char loop_end_lab[32];
4398   rtx xops[2];
4399 
4400   ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
4401   ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
4402 
4403   /* Emit loop start label.  */
4404   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
4405 
4406   /* ADJUSTMENT < RESIDUAL_PROBE_GUARD.  */
4407   xops[0] = adjustment;
4408   xops[1] = probe_offset_value_rtx;
4409   output_asm_insn ("cmp\t%0, %1", xops);
4410 
4411   /* Branch to end if not enough adjustment to probe.  */
4412   fputs ("\tb.lt\t", asm_out_file);
4413   assemble_name_raw (asm_out_file, loop_end_lab);
4414   fputc ('\n', asm_out_file);
4415 
4416   /* BASE = BASE - RESIDUAL_PROBE_GUARD.  */
4417   xops[0] = base;
4418   xops[1] = probe_offset_value_rtx;
4419   output_asm_insn ("sub\t%0, %0, %1", xops);
4420 
4421   /* Probe at BASE.  */
4422   xops[1] = const0_rtx;
4423   output_asm_insn ("str\txzr, [%0, %1]", xops);
4424 
4425   /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD.  */
4426   xops[0] = adjustment;
4427   xops[1] = probe_offset_value_rtx;
4428   output_asm_insn ("sub\t%0, %0, %1", xops);
4429 
4430   /* Branch to start if still more bytes to allocate.  */
4431   fputs ("\tb\t", asm_out_file);
4432   assemble_name_raw (asm_out_file, loop_start_lab);
4433   fputc ('\n', asm_out_file);
4434 
4435   /* No probe leave.  */
4436   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
4437 
4438   /* BASE = BASE - ADJUSTMENT.  */
4439   xops[0] = base;
4440   xops[1] = adjustment;
4441   output_asm_insn ("sub\t%0, %0, %1", xops);
4442   return "";
4443 }
4444 
4445 /* Determine whether a frame chain needs to be generated.  */
4446 static bool
4447 aarch64_needs_frame_chain (void)
4448 {
4449   /* Force a frame chain for EH returns so the return address is at FP+8.  */
4450   if (frame_pointer_needed || crtl->calls_eh_return)
4451     return true;
4452 
4453   /* A leaf function cannot have calls or write LR.  */
4454   bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4455 
4456   /* Don't use a frame chain in leaf functions if leaf frame pointers
4457      are disabled.  */
4458   if (flag_omit_leaf_frame_pointer && is_leaf)
4459     return false;
4460 
4461   return aarch64_use_frame_pointer;
4462 }
4463 
4464 /* Mark the registers that need to be saved by the callee and calculate
4465    the size of the callee-saved registers area and frame record (both FP
4466    and LR may be omitted).  */
4467 static void
4468 aarch64_layout_frame (void)
4469 {
4470   HOST_WIDE_INT offset = 0;
4471   int regno, last_fp_reg = INVALID_REGNUM;
4472   bool simd_function = aarch64_simd_decl_p (cfun->decl);
4473 
4474   cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4475 
4476   /* Adjust the outgoing arguments size if required.  Keep it in sync with what
4477      the mid-end is doing.  */
4478   crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
4479 
4480 #define SLOT_NOT_REQUIRED (-2)
4481 #define SLOT_REQUIRED     (-1)
4482 
4483   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4484   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4485 
4486   /* If this is a non-leaf simd function with calls we assume that
4487      at least one of those calls is to a non-simd function and thus
4488      we must save V8 to V23 in the prologue.  */
4489 
4490   if (simd_function && !crtl->is_leaf)
4491     {
4492       for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4493 	if (FP_SIMD_SAVED_REGNUM_P (regno))
4494  	  df_set_regs_ever_live (regno, true);
4495     }
4496 
4497   /* First mark all the registers that really need to be saved...  */
4498   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4499     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4500 
4501   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4502     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4503 
4504   /* ... that includes the eh data registers (if needed)...  */
4505   if (crtl->calls_eh_return)
4506     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4507       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4508 	= SLOT_REQUIRED;
4509 
4510   /* ... and any callee saved register that dataflow says is live.  */
4511   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4512     if (df_regs_ever_live_p (regno)
4513 	&& (regno == R30_REGNUM
4514 	    || !call_used_regs[regno]))
4515       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4516 
4517   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4518     if (df_regs_ever_live_p (regno)
4519 	&& (!call_used_regs[regno]
4520 	    || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))))
4521       {
4522 	cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4523 	last_fp_reg = regno;
4524       }
4525 
4526   if (cfun->machine->frame.emit_frame_chain)
4527     {
4528       /* FP and LR are placed in the linkage record.  */
4529       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4530       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4531       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4532       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4533       offset = 2 * UNITS_PER_WORD;
4534     }
4535 
4536   /* With stack-clash, LR must be saved in non-leaf functions.  */
4537   gcc_assert (crtl->is_leaf
4538 	      || (cfun->machine->frame.reg_offset[R30_REGNUM]
4539 		  != SLOT_NOT_REQUIRED));
4540 
4541   /* Now assign stack slots for them.  */
4542   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4543     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4544       {
4545 	cfun->machine->frame.reg_offset[regno] = offset;
4546 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4547 	  cfun->machine->frame.wb_candidate1 = regno;
4548 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4549 	  cfun->machine->frame.wb_candidate2 = regno;
4550 	offset += UNITS_PER_WORD;
4551       }
4552 
4553   HOST_WIDE_INT max_int_offset = offset;
4554   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4555   bool has_align_gap = offset != max_int_offset;
4556 
4557   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4558     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4559       {
4560 	/* If there is an alignment gap between integer and fp callee-saves,
4561 	   allocate the last fp register to it if possible.  */
4562 	if (regno == last_fp_reg
4563 	    && has_align_gap
4564 	    && !simd_function
4565 	    && (offset & 8) == 0)
4566 	  {
4567 	    cfun->machine->frame.reg_offset[regno] = max_int_offset;
4568 	    break;
4569 	  }
4570 
4571 	cfun->machine->frame.reg_offset[regno] = offset;
4572 	if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4573 	  cfun->machine->frame.wb_candidate1 = regno;
4574 	else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4575 		 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4576 	  cfun->machine->frame.wb_candidate2 = regno;
4577 	offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD;
4578       }
4579 
4580   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4581 
4582   cfun->machine->frame.saved_regs_size = offset;
4583 
4584   HOST_WIDE_INT varargs_and_saved_regs_size
4585     = offset + cfun->machine->frame.saved_varargs_size;
4586 
4587   cfun->machine->frame.hard_fp_offset
4588     = aligned_upper_bound (varargs_and_saved_regs_size
4589 			   + get_frame_size (),
4590 			   STACK_BOUNDARY / BITS_PER_UNIT);
4591 
4592   /* Both these values are already aligned.  */
4593   gcc_assert (multiple_p (crtl->outgoing_args_size,
4594 			  STACK_BOUNDARY / BITS_PER_UNIT));
4595   cfun->machine->frame.frame_size
4596     = (cfun->machine->frame.hard_fp_offset
4597        + crtl->outgoing_args_size);
4598 
4599   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4600 
4601   cfun->machine->frame.initial_adjust = 0;
4602   cfun->machine->frame.final_adjust = 0;
4603   cfun->machine->frame.callee_adjust = 0;
4604   cfun->machine->frame.callee_offset = 0;
4605 
4606   HOST_WIDE_INT max_push_offset = 0;
4607   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4608     max_push_offset = 512;
4609   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4610     max_push_offset = 256;
4611 
4612   HOST_WIDE_INT const_size, const_fp_offset;
4613   if (cfun->machine->frame.frame_size.is_constant (&const_size)
4614       && const_size < max_push_offset
4615       && known_eq (crtl->outgoing_args_size, 0))
4616     {
4617       /* Simple, small frame with no outgoing arguments:
4618 	 stp reg1, reg2, [sp, -frame_size]!
4619 	 stp reg3, reg4, [sp, 16]  */
4620       cfun->machine->frame.callee_adjust = const_size;
4621     }
4622   else if (known_lt (crtl->outgoing_args_size
4623 		     + cfun->machine->frame.saved_regs_size, 512)
4624 	   && !(cfun->calls_alloca
4625 		&& known_lt (cfun->machine->frame.hard_fp_offset,
4626 			     max_push_offset)))
4627     {
4628       /* Frame with small outgoing arguments:
4629 	 sub sp, sp, frame_size
4630 	 stp reg1, reg2, [sp, outgoing_args_size]
4631 	 stp reg3, reg4, [sp, outgoing_args_size + 16]  */
4632       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4633       cfun->machine->frame.callee_offset
4634 	= cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4635     }
4636   else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4637 	   && const_fp_offset < max_push_offset)
4638     {
4639       /* Frame with large outgoing arguments but a small local area:
4640 	 stp reg1, reg2, [sp, -hard_fp_offset]!
4641 	 stp reg3, reg4, [sp, 16]
4642 	 sub sp, sp, outgoing_args_size  */
4643       cfun->machine->frame.callee_adjust = const_fp_offset;
4644       cfun->machine->frame.final_adjust
4645 	= cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4646     }
4647   else
4648     {
4649       /* Frame with large local area and outgoing arguments using frame pointer:
4650 	 sub sp, sp, hard_fp_offset
4651 	 stp x29, x30, [sp, 0]
4652 	 add x29, sp, 0
4653 	 stp reg3, reg4, [sp, 16]
4654 	 sub sp, sp, outgoing_args_size  */
4655       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4656       cfun->machine->frame.final_adjust
4657 	= cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4658     }
4659 
4660   cfun->machine->frame.laid_out = true;
4661 }
4662 
4663 /* Return true if the register REGNO is saved on entry to
4664    the current function.  */
4665 
4666 static bool
4667 aarch64_register_saved_on_entry (int regno)
4668 {
4669   return cfun->machine->frame.reg_offset[regno] >= 0;
4670 }
4671 
4672 /* Return the next register up from REGNO up to LIMIT for the callee
4673    to save.  */
4674 
4675 static unsigned
4676 aarch64_next_callee_save (unsigned regno, unsigned limit)
4677 {
4678   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4679     regno ++;
4680   return regno;
4681 }
4682 
4683 /* Push the register number REGNO of mode MODE to the stack with write-back
4684    adjusting the stack by ADJUSTMENT.  */
4685 
4686 static void
4687 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4688 			   HOST_WIDE_INT adjustment)
4689  {
4690   rtx base_rtx = stack_pointer_rtx;
4691   rtx insn, reg, mem;
4692 
4693   reg = gen_rtx_REG (mode, regno);
4694   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4695 			    plus_constant (Pmode, base_rtx, -adjustment));
4696   mem = gen_frame_mem (mode, mem);
4697 
4698   insn = emit_move_insn (mem, reg);
4699   RTX_FRAME_RELATED_P (insn) = 1;
4700 }
4701 
4702 /* Generate and return an instruction to store the pair of registers
4703    REG and REG2 of mode MODE to location BASE with write-back adjusting
4704    the stack location BASE by ADJUSTMENT.  */
4705 
4706 static rtx
4707 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4708 			  HOST_WIDE_INT adjustment)
4709 {
4710   switch (mode)
4711     {
4712     case E_DImode:
4713       return gen_storewb_pairdi_di (base, base, reg, reg2,
4714 				    GEN_INT (-adjustment),
4715 				    GEN_INT (UNITS_PER_WORD - adjustment));
4716     case E_DFmode:
4717       return gen_storewb_pairdf_di (base, base, reg, reg2,
4718 				    GEN_INT (-adjustment),
4719 				    GEN_INT (UNITS_PER_WORD - adjustment));
4720     case E_TFmode:
4721       return gen_storewb_pairtf_di (base, base, reg, reg2,
4722 				    GEN_INT (-adjustment),
4723 				    GEN_INT (UNITS_PER_VREG - adjustment));
4724     default:
4725       gcc_unreachable ();
4726     }
4727 }
4728 
4729 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4730    stack pointer by ADJUSTMENT.  */
4731 
4732 static void
4733 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4734 {
4735   rtx_insn *insn;
4736   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4737 
4738   if (regno2 == INVALID_REGNUM)
4739     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4740 
4741   rtx reg1 = gen_rtx_REG (mode, regno1);
4742   rtx reg2 = gen_rtx_REG (mode, regno2);
4743 
4744   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4745 					      reg2, adjustment));
4746   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4747   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4748   RTX_FRAME_RELATED_P (insn) = 1;
4749 }
4750 
4751 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
4752    adjusting it by ADJUSTMENT afterwards.  */
4753 
4754 static rtx
4755 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4756 			 HOST_WIDE_INT adjustment)
4757 {
4758   switch (mode)
4759     {
4760     case E_DImode:
4761       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4762 				   GEN_INT (UNITS_PER_WORD));
4763     case E_DFmode:
4764       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4765 				   GEN_INT (UNITS_PER_WORD));
4766     case E_TFmode:
4767       return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
4768 				   GEN_INT (UNITS_PER_VREG));
4769     default:
4770       gcc_unreachable ();
4771     }
4772 }
4773 
4774 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4775    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4776    into CFI_OPS.  */
4777 
4778 static void
4779 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4780 		  rtx *cfi_ops)
4781 {
4782   machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1);
4783   rtx reg1 = gen_rtx_REG (mode, regno1);
4784 
4785   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4786 
4787   if (regno2 == INVALID_REGNUM)
4788     {
4789       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4790       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4791       emit_move_insn (reg1, gen_frame_mem (mode, mem));
4792     }
4793   else
4794     {
4795       rtx reg2 = gen_rtx_REG (mode, regno2);
4796       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4797       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4798 					  reg2, adjustment));
4799     }
4800 }
4801 
4802 /* Generate and return a store pair instruction of mode MODE to store
4803    register REG1 to MEM1 and register REG2 to MEM2.  */
4804 
4805 static rtx
4806 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4807 			rtx reg2)
4808 {
4809   switch (mode)
4810     {
4811     case E_DImode:
4812       return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4813 
4814     case E_DFmode:
4815       return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4816 
4817     case E_TFmode:
4818       return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4819 
4820     default:
4821       gcc_unreachable ();
4822     }
4823 }
4824 
4825 /* Generate and regurn a load pair isntruction of mode MODE to load register
4826    REG1 from MEM1 and register REG2 from MEM2.  */
4827 
4828 static rtx
4829 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4830 		       rtx mem2)
4831 {
4832   switch (mode)
4833     {
4834     case E_DImode:
4835       return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4836 
4837     case E_DFmode:
4838       return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4839 
4840     case E_TFmode:
4841       return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4842 
4843     default:
4844       gcc_unreachable ();
4845     }
4846 }
4847 
4848 /* Return TRUE if return address signing should be enabled for the current
4849    function, otherwise return FALSE.  */
4850 
4851 bool
4852 aarch64_return_address_signing_enabled (void)
4853 {
4854   /* This function should only be called after frame laid out.   */
4855   gcc_assert (cfun->machine->frame.laid_out);
4856 
4857   /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4858      if it's LR is pushed onto stack.  */
4859   return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4860 	  || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4861 	      && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4862 }
4863 
4864 /* Return TRUE if Branch Target Identification Mechanism is enabled.  */
4865 bool
4866 aarch64_bti_enabled (void)
4867 {
4868   return (aarch64_enable_bti == 1);
4869 }
4870 
4871 /* Emit code to save the callee-saved registers from register number START
4872    to LIMIT to the stack at the location starting at offset START_OFFSET,
4873    skipping any write-back candidates if SKIP_WB is true.  */
4874 
4875 static void
4876 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4877 			   unsigned start, unsigned limit, bool skip_wb)
4878 {
4879   rtx_insn *insn;
4880   unsigned regno;
4881   unsigned regno2;
4882 
4883   for (regno = aarch64_next_callee_save (start, limit);
4884        regno <= limit;
4885        regno = aarch64_next_callee_save (regno + 1, limit))
4886     {
4887       rtx reg, mem;
4888       poly_int64 offset;
4889       int offset_diff;
4890 
4891       if (skip_wb
4892 	  && (regno == cfun->machine->frame.wb_candidate1
4893 	      || regno == cfun->machine->frame.wb_candidate2))
4894 	continue;
4895 
4896       if (cfun->machine->reg_is_wrapped_separately[regno])
4897        continue;
4898 
4899       reg = gen_rtx_REG (mode, regno);
4900       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4901       mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4902 						offset));
4903 
4904       regno2 = aarch64_next_callee_save (regno + 1, limit);
4905       offset_diff = cfun->machine->frame.reg_offset[regno2]
4906 		    - cfun->machine->frame.reg_offset[regno];
4907 
4908       if (regno2 <= limit
4909 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4910 	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
4911 	{
4912 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4913 	  rtx mem2;
4914 
4915 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4916 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4917 						     offset));
4918 	  insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4919 						    reg2));
4920 
4921 	  /* The first part of a frame-related parallel insn is
4922 	     always assumed to be relevant to the frame
4923 	     calculations; subsequent parts, are only
4924 	     frame-related if explicitly marked.  */
4925 	  RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4926 	  regno = regno2;
4927 	}
4928       else
4929 	insn = emit_move_insn (mem, reg);
4930 
4931       RTX_FRAME_RELATED_P (insn) = 1;
4932     }
4933 }
4934 
4935 /* Emit code to restore the callee registers of mode MODE from register
4936    number START up to and including LIMIT.  Restore from the stack offset
4937    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4938    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
4939 
4940 static void
4941 aarch64_restore_callee_saves (machine_mode mode,
4942 			      poly_int64 start_offset, unsigned start,
4943 			      unsigned limit, bool skip_wb, rtx *cfi_ops)
4944 {
4945   rtx base_rtx = stack_pointer_rtx;
4946   unsigned regno;
4947   unsigned regno2;
4948   poly_int64 offset;
4949 
4950   for (regno = aarch64_next_callee_save (start, limit);
4951        regno <= limit;
4952        regno = aarch64_next_callee_save (regno + 1, limit))
4953     {
4954       if (cfun->machine->reg_is_wrapped_separately[regno])
4955        continue;
4956 
4957       rtx reg, mem;
4958       int offset_diff;
4959 
4960       if (skip_wb
4961 	  && (regno == cfun->machine->frame.wb_candidate1
4962 	      || regno == cfun->machine->frame.wb_candidate2))
4963 	continue;
4964 
4965       reg = gen_rtx_REG (mode, regno);
4966       offset = start_offset + cfun->machine->frame.reg_offset[regno];
4967       mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4968 
4969       regno2 = aarch64_next_callee_save (regno + 1, limit);
4970       offset_diff = cfun->machine->frame.reg_offset[regno2]
4971 		    - cfun->machine->frame.reg_offset[regno];
4972 
4973       if (regno2 <= limit
4974 	  && !cfun->machine->reg_is_wrapped_separately[regno2]
4975 	  && known_eq (GET_MODE_SIZE (mode), offset_diff))
4976 	{
4977 	  rtx reg2 = gen_rtx_REG (mode, regno2);
4978 	  rtx mem2;
4979 
4980 	  offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4981 	  mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4982 	  emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4983 
4984 	  *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4985 	  regno = regno2;
4986 	}
4987       else
4988 	emit_move_insn (reg, mem);
4989       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4990     }
4991 }
4992 
4993 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4994    of MODE.  */
4995 
4996 static inline bool
4997 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4998 {
4999   HOST_WIDE_INT multiple;
5000   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5001 	  && IN_RANGE (multiple, -8, 7));
5002 }
5003 
5004 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size
5005    of MODE.  */
5006 
5007 static inline bool
5008 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5009 {
5010   HOST_WIDE_INT multiple;
5011   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5012 	  && IN_RANGE (multiple, 0, 63));
5013 }
5014 
5015 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
5016    of MODE.  */
5017 
5018 bool
5019 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5020 {
5021   HOST_WIDE_INT multiple;
5022   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5023 	  && IN_RANGE (multiple, -64, 63));
5024 }
5025 
5026 /* Return true if OFFSET is a signed 9-bit value.  */
5027 
5028 bool
5029 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
5030 				       poly_int64 offset)
5031 {
5032   HOST_WIDE_INT const_offset;
5033   return (offset.is_constant (&const_offset)
5034 	  && IN_RANGE (const_offset, -256, 255));
5035 }
5036 
5037 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
5038    of MODE.  */
5039 
5040 static inline bool
5041 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
5042 {
5043   HOST_WIDE_INT multiple;
5044   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5045 	  && IN_RANGE (multiple, -256, 255));
5046 }
5047 
5048 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
5049    of MODE.  */
5050 
5051 static inline bool
5052 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
5053 {
5054   HOST_WIDE_INT multiple;
5055   return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
5056 	  && IN_RANGE (multiple, 0, 4095));
5057 }
5058 
5059 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS.  */
5060 
5061 static sbitmap
5062 aarch64_get_separate_components (void)
5063 {
5064   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5065   bitmap_clear (components);
5066 
5067   /* The registers we need saved to the frame.  */
5068   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5069     if (aarch64_register_saved_on_entry (regno))
5070       {
5071 	poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5072 	if (!frame_pointer_needed)
5073 	  offset += cfun->machine->frame.frame_size
5074 		    - cfun->machine->frame.hard_fp_offset;
5075 	/* Check that we can access the stack slot of the register with one
5076 	   direct load with no adjustments needed.  */
5077 	if (offset_12bit_unsigned_scaled_p (DImode, offset))
5078 	  bitmap_set_bit (components, regno);
5079       }
5080 
5081   /* Don't mess with the hard frame pointer.  */
5082   if (frame_pointer_needed)
5083     bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
5084 
5085   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5086   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5087   /* If registers have been chosen to be stored/restored with
5088      writeback don't interfere with them to avoid having to output explicit
5089      stack adjustment instructions.  */
5090   if (reg2 != INVALID_REGNUM)
5091     bitmap_clear_bit (components, reg2);
5092   if (reg1 != INVALID_REGNUM)
5093     bitmap_clear_bit (components, reg1);
5094 
5095   bitmap_clear_bit (components, LR_REGNUM);
5096   bitmap_clear_bit (components, SP_REGNUM);
5097 
5098   return components;
5099 }
5100 
5101 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB.  */
5102 
5103 static sbitmap
5104 aarch64_components_for_bb (basic_block bb)
5105 {
5106   bitmap in = DF_LIVE_IN (bb);
5107   bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
5108   bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
5109   bool simd_function = aarch64_simd_decl_p (cfun->decl);
5110 
5111   sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
5112   bitmap_clear (components);
5113 
5114   /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets.  */
5115   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5116     if ((!call_used_regs[regno]
5117 	|| (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))
5118        && (bitmap_bit_p (in, regno)
5119 	   || bitmap_bit_p (gen, regno)
5120 	   || bitmap_bit_p (kill, regno)))
5121       {
5122 	unsigned regno2, offset, offset2;
5123 	bitmap_set_bit (components, regno);
5124 
5125 	/* If there is a callee-save at an adjacent offset, add it too
5126 	   to increase the use of LDP/STP.  */
5127 	offset = cfun->machine->frame.reg_offset[regno];
5128 	regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
5129 
5130 	if (regno2 <= LAST_SAVED_REGNUM)
5131 	  {
5132 	    offset2 = cfun->machine->frame.reg_offset[regno2];
5133 	    if ((offset & ~8) == (offset2 & ~8))
5134 	      bitmap_set_bit (components, regno2);
5135 	  }
5136       }
5137 
5138   return components;
5139 }
5140 
5141 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
5142    Nothing to do for aarch64.  */
5143 
5144 static void
5145 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
5146 {
5147 }
5148 
5149 /* Return the next set bit in BMP from START onwards.  Return the total number
5150    of bits in BMP if no set bit is found at or after START.  */
5151 
5152 static unsigned int
5153 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
5154 {
5155   unsigned int nbits = SBITMAP_SIZE (bmp);
5156   if (start == nbits)
5157     return start;
5158 
5159   gcc_assert (start < nbits);
5160   for (unsigned int i = start; i < nbits; i++)
5161     if (bitmap_bit_p (bmp, i))
5162       return i;
5163 
5164   return nbits;
5165 }
5166 
5167 /* Do the work for aarch64_emit_prologue_components and
5168    aarch64_emit_epilogue_components.  COMPONENTS is the bitmap of registers
5169    to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
5170    for these components or the epilogue sequence.  That is, it determines
5171    whether we should emit stores or loads and what kind of CFA notes to attach
5172    to the insns.  Otherwise the logic for the two sequences is very
5173    similar.  */
5174 
5175 static void
5176 aarch64_process_components (sbitmap components, bool prologue_p)
5177 {
5178   rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
5179 			     ? HARD_FRAME_POINTER_REGNUM
5180 			     : STACK_POINTER_REGNUM);
5181 
5182   unsigned last_regno = SBITMAP_SIZE (components);
5183   unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
5184   rtx_insn *insn = NULL;
5185 
5186   while (regno != last_regno)
5187     {
5188       /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved
5189 	 so DFmode for the vector registers is enough.  For simd functions
5190 	 we want to save the low 128 bits.  */
5191       machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno);
5192 
5193       rtx reg = gen_rtx_REG (mode, regno);
5194       poly_int64 offset = cfun->machine->frame.reg_offset[regno];
5195       if (!frame_pointer_needed)
5196 	offset += cfun->machine->frame.frame_size
5197 		  - cfun->machine->frame.hard_fp_offset;
5198       rtx addr = plus_constant (Pmode, ptr_reg, offset);
5199       rtx mem = gen_frame_mem (mode, addr);
5200 
5201       rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
5202       unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
5203       /* No more registers to handle after REGNO.
5204 	 Emit a single save/restore and exit.  */
5205       if (regno2 == last_regno)
5206 	{
5207 	  insn = emit_insn (set);
5208 	  RTX_FRAME_RELATED_P (insn) = 1;
5209 	  if (prologue_p)
5210 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5211 	  else
5212 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
5213 	  break;
5214 	}
5215 
5216       poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
5217       /* The next register is not of the same class or its offset is not
5218 	 mergeable with the current one into a pair.  */
5219       if (!satisfies_constraint_Ump (mem)
5220 	  || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
5221 	  || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno))
5222 	  || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
5223 		       GET_MODE_SIZE (mode)))
5224 	{
5225 	  insn = emit_insn (set);
5226 	  RTX_FRAME_RELATED_P (insn) = 1;
5227 	  if (prologue_p)
5228 	    add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
5229 	  else
5230 	    add_reg_note (insn, REG_CFA_RESTORE, reg);
5231 
5232 	  regno = regno2;
5233 	  continue;
5234 	}
5235 
5236       /* REGNO2 can be saved/restored in a pair with REGNO.  */
5237       rtx reg2 = gen_rtx_REG (mode, regno2);
5238       if (!frame_pointer_needed)
5239 	offset2 += cfun->machine->frame.frame_size
5240 		  - cfun->machine->frame.hard_fp_offset;
5241       rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
5242       rtx mem2 = gen_frame_mem (mode, addr2);
5243       rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
5244 			     : gen_rtx_SET (reg2, mem2);
5245 
5246       if (prologue_p)
5247 	insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
5248       else
5249 	insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
5250 
5251       RTX_FRAME_RELATED_P (insn) = 1;
5252       if (prologue_p)
5253 	{
5254 	  add_reg_note (insn, REG_CFA_OFFSET, set);
5255 	  add_reg_note (insn, REG_CFA_OFFSET, set2);
5256 	}
5257       else
5258 	{
5259 	  add_reg_note (insn, REG_CFA_RESTORE, reg);
5260 	  add_reg_note (insn, REG_CFA_RESTORE, reg2);
5261 	}
5262 
5263       regno = aarch64_get_next_set_bit (components, regno2 + 1);
5264     }
5265 }
5266 
5267 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS.  */
5268 
5269 static void
5270 aarch64_emit_prologue_components (sbitmap components)
5271 {
5272   aarch64_process_components (components, true);
5273 }
5274 
5275 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS.  */
5276 
5277 static void
5278 aarch64_emit_epilogue_components (sbitmap components)
5279 {
5280   aarch64_process_components (components, false);
5281 }
5282 
5283 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS.  */
5284 
5285 static void
5286 aarch64_set_handled_components (sbitmap components)
5287 {
5288   for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
5289     if (bitmap_bit_p (components, regno))
5290       cfun->machine->reg_is_wrapped_separately[regno] = true;
5291 }
5292 
5293 /* On AArch64 we have an ABI defined safe buffer.  This constant is used to
5294    determining the probe offset for alloca.  */
5295 
5296 static HOST_WIDE_INT
5297 aarch64_stack_clash_protection_alloca_probe_range (void)
5298 {
5299   return STACK_CLASH_CALLER_GUARD;
5300 }
5301 
5302 
5303 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
5304    registers.  If POLY_SIZE is not large enough to require a probe this function
5305    will only adjust the stack.  When allocating the stack space
5306    FRAME_RELATED_P is then used to indicate if the allocation is frame related.
5307    FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
5308    arguments.  If we are then we ensure that any allocation larger than the ABI
5309    defined buffer needs a probe so that the invariant of having a 1KB buffer is
5310    maintained.
5311 
5312    We emit barriers after each stack adjustment to prevent optimizations from
5313    breaking the invariant that we never drop the stack more than a page.  This
5314    invariant is needed to make it easier to correctly handle asynchronous
5315    events, e.g. if we were to allow the stack to be dropped by more than a page
5316    and then have multiple probes up and we take a signal somewhere in between
5317    then the signal handler doesn't know the state of the stack and can make no
5318    assumptions about which pages have been probed.  */
5319 
5320 static void
5321 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
5322 					poly_int64 poly_size,
5323 					bool frame_related_p,
5324 					bool final_adjustment_p)
5325 {
5326   HOST_WIDE_INT guard_size
5327     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5328   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5329   /* When doing the final adjustment for the outgoing argument size we can't
5330      assume that LR was saved at position 0.  So subtract it's offset from the
5331      ABI safe buffer so that we don't accidentally allow an adjustment that
5332      would result in an allocation larger than the ABI buffer without
5333      probing.  */
5334   HOST_WIDE_INT min_probe_threshold
5335     = final_adjustment_p
5336       ? guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM]
5337       : guard_size - guard_used_by_caller;
5338 
5339   poly_int64 frame_size = cfun->machine->frame.frame_size;
5340 
5341   /* We should always have a positive probe threshold.  */
5342   gcc_assert (min_probe_threshold > 0);
5343 
5344   if (flag_stack_clash_protection && !final_adjustment_p)
5345     {
5346       poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5347       poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5348 
5349       if (known_eq (frame_size, 0))
5350 	{
5351 	  dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
5352 	}
5353       else if (known_lt (initial_adjust, guard_size - guard_used_by_caller)
5354 	       && known_lt (final_adjust, guard_used_by_caller))
5355 	{
5356 	  dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
5357 	}
5358     }
5359 
5360   /* If SIZE is not large enough to require probing, just adjust the stack and
5361      exit.  */
5362   if (known_lt (poly_size, min_probe_threshold)
5363       || !flag_stack_clash_protection)
5364     {
5365       aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
5366       return;
5367     }
5368 
5369   HOST_WIDE_INT size;
5370   /* Handle the SVE non-constant case first.  */
5371   if (!poly_size.is_constant (&size))
5372     {
5373      if (dump_file)
5374       {
5375 	fprintf (dump_file, "Stack clash SVE prologue: ");
5376 	print_dec (poly_size, dump_file);
5377 	fprintf (dump_file, " bytes, dynamic probing will be required.\n");
5378       }
5379 
5380       /* First calculate the amount of bytes we're actually spilling.  */
5381       aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
5382 			  poly_size, temp1, temp2, false, true);
5383 
5384       rtx_insn *insn = get_last_insn ();
5385 
5386       if (frame_related_p)
5387 	{
5388 	  /* This is done to provide unwinding information for the stack
5389 	     adjustments we're about to do, however to prevent the optimizers
5390 	     from removing the R11 move and leaving the CFA note (which would be
5391 	     very wrong) we tie the old and new stack pointer together.
5392 	     The tie will expand to nothing but the optimizers will not touch
5393 	     the instruction.  */
5394 	  rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
5395 	  emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
5396 	  emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
5397 
5398 	  /* We want the CFA independent of the stack pointer for the
5399 	     duration of the loop.  */
5400 	  add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
5401 	  RTX_FRAME_RELATED_P (insn) = 1;
5402 	}
5403 
5404       rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
5405       rtx guard_const = gen_int_mode (guard_size, Pmode);
5406 
5407       insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
5408 						   stack_pointer_rtx, temp1,
5409 						   probe_const, guard_const));
5410 
5411       /* Now reset the CFA register if needed.  */
5412       if (frame_related_p)
5413 	{
5414 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5415 			gen_rtx_PLUS (Pmode, stack_pointer_rtx,
5416 				      gen_int_mode (poly_size, Pmode)));
5417 	  RTX_FRAME_RELATED_P (insn) = 1;
5418 	}
5419 
5420       return;
5421     }
5422 
5423   if (dump_file)
5424     fprintf (dump_file,
5425 	     "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
5426 	     " bytes, probing will be required.\n", size);
5427 
5428   /* Round size to the nearest multiple of guard_size, and calculate the
5429      residual as the difference between the original size and the rounded
5430      size.  */
5431   HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
5432   HOST_WIDE_INT residual = size - rounded_size;
5433 
5434   /* We can handle a small number of allocations/probes inline.  Otherwise
5435      punt to a loop.  */
5436   if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5437     {
5438       for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5439 	{
5440 	  aarch64_sub_sp (NULL, temp2, guard_size, true);
5441 	  emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5442 					   guard_used_by_caller));
5443 	  emit_insn (gen_blockage ());
5444 	}
5445       dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5446     }
5447   else
5448     {
5449       /* Compute the ending address.  */
5450       aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5451 			  temp1, NULL, false, true);
5452       rtx_insn *insn = get_last_insn ();
5453 
5454       /* For the initial allocation, we don't have a frame pointer
5455 	 set up, so we always need CFI notes.  If we're doing the
5456 	 final allocation, then we may have a frame pointer, in which
5457 	 case it is the CFA, otherwise we need CFI notes.
5458 
5459 	 We can determine which allocation we are doing by looking at
5460 	 the value of FRAME_RELATED_P since the final allocations are not
5461 	 frame related.  */
5462       if (frame_related_p)
5463 	{
5464 	  /* We want the CFA independent of the stack pointer for the
5465 	     duration of the loop.  */
5466 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5467 			plus_constant (Pmode, temp1, rounded_size));
5468 	  RTX_FRAME_RELATED_P (insn) = 1;
5469 	}
5470 
5471       /* This allocates and probes the stack.  Note that this re-uses some of
5472 	 the existing Ada stack protection code.  However we are guaranteed not
5473 	 to enter the non loop or residual branches of that code.
5474 
5475 	 The non-loop part won't be entered because if our allocation amount
5476 	 doesn't require a loop, the case above would handle it.
5477 
5478 	 The residual amount won't be entered because TEMP1 is a mutliple of
5479 	 the allocation size.  The residual will always be 0.  As such, the only
5480 	 part we are actually using from that code is the loop setup.  The
5481 	 actual probing is done in aarch64_output_probe_stack_range.  */
5482       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5483 					       stack_pointer_rtx, temp1));
5484 
5485       /* Now reset the CFA register if needed.  */
5486       if (frame_related_p)
5487 	{
5488 	  add_reg_note (insn, REG_CFA_DEF_CFA,
5489 			plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5490 	  RTX_FRAME_RELATED_P (insn) = 1;
5491 	}
5492 
5493       emit_insn (gen_blockage ());
5494       dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5495     }
5496 
5497   /* Handle any residuals.  Residuals of at least MIN_PROBE_THRESHOLD have to
5498      be probed.  This maintains the requirement that each page is probed at
5499      least once.  For initial probing we probe only if the allocation is
5500      more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5501      if the amount is larger than buffer.  GUARD_SIZE - buffer + buffer ==
5502      GUARD_SIZE.  This works that for any allocation that is large enough to
5503      trigger a probe here, we'll have at least one, and if they're not large
5504      enough for this code to emit anything for them, The page would have been
5505      probed by the saving of FP/LR either by this function or any callees.  If
5506      we don't have any callees then we won't have more stack adjustments and so
5507      are still safe.  */
5508   if (residual)
5509     {
5510       HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5511       /* If we're doing final adjustments, and we've done any full page
5512 	 allocations then any residual needs to be probed.  */
5513       if (final_adjustment_p && rounded_size != 0)
5514 	min_probe_threshold = 0;
5515       /* If doing a small final adjustment, we always probe at offset 0.
5516 	 This is done to avoid issues when LR is not at position 0 or when
5517 	 the final adjustment is smaller than the probing offset.  */
5518       else if (final_adjustment_p && rounded_size == 0)
5519 	residual_probe_offset = 0;
5520 
5521       aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
5522       if (residual >= min_probe_threshold)
5523 	{
5524 	  if (dump_file)
5525 	    fprintf (dump_file,
5526 		     "Stack clash AArch64 prologue residuals: "
5527 		     HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
5528 		     "\n", residual);
5529 
5530 	    emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5531 					     residual_probe_offset));
5532 	  emit_insn (gen_blockage ());
5533 	}
5534     }
5535 }
5536 
5537 /* Return 1 if the register is used by the epilogue.  We need to say the
5538    return register is used, but only after epilogue generation is complete.
5539    Note that in the case of sibcalls, the values "used by the epilogue" are
5540    considered live at the start of the called function.
5541 
5542    For SIMD functions we need to return 1 for FP registers that are saved and
5543    restored by a function but are not zero in call_used_regs.  If we do not do
5544    this optimizations may remove the restore of the register.  */
5545 
5546 int
5547 aarch64_epilogue_uses (int regno)
5548 {
5549   if (epilogue_completed)
5550     {
5551       if (regno == LR_REGNUM)
5552 	return 1;
5553       if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno))
5554 	return 1;
5555     }
5556   return 0;
5557 }
5558 
5559 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
5560    is saved at BASE + OFFSET.  */
5561 
5562 static void
5563 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
5564 			    rtx base, poly_int64 offset)
5565 {
5566   rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
5567   add_reg_note (insn, REG_CFA_EXPRESSION,
5568 		gen_rtx_SET (mem, regno_reg_rtx[reg]));
5569 }
5570 
5571 /* AArch64 stack frames generated by this compiler look like:
5572 
5573 	+-------------------------------+
5574 	|                               |
5575 	|  incoming stack arguments     |
5576 	|                               |
5577 	+-------------------------------+
5578 	|                               | <-- incoming stack pointer (aligned)
5579 	|  callee-allocated save area   |
5580 	|  for register varargs         |
5581 	|                               |
5582 	+-------------------------------+
5583 	|  local variables              | <-- frame_pointer_rtx
5584 	|                               |
5585 	+-------------------------------+
5586 	|  padding                      | \
5587 	+-------------------------------+  |
5588 	|  callee-saved registers       |  | frame.saved_regs_size
5589 	+-------------------------------+  |
5590 	|  LR'                          |  |
5591 	+-------------------------------+  |
5592 	|  FP'                          | / <- hard_frame_pointer_rtx (aligned)
5593         +-------------------------------+
5594 	|  dynamic allocation           |
5595 	+-------------------------------+
5596 	|  padding                      |
5597 	+-------------------------------+
5598 	|  outgoing stack arguments     | <-- arg_pointer
5599         |                               |
5600 	+-------------------------------+
5601 	|                               | <-- stack_pointer_rtx (aligned)
5602 
5603    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
5604    but leave frame_pointer_rtx and hard_frame_pointer_rtx
5605    unchanged.
5606 
5607    By default for stack-clash we assume the guard is at least 64KB, but this
5608    value is configurable to either 4KB or 64KB.  We also force the guard size to
5609    be the same as the probing interval and both values are kept in sync.
5610 
5611    With those assumptions the callee can allocate up to 63KB (or 3KB depending
5612    on the guard size) of stack space without probing.
5613 
5614    When probing is needed, we emit a probe at the start of the prologue
5615    and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
5616 
5617    We have to track how much space has been allocated and the only stores
5618    to the stack we track as implicit probes are the FP/LR stores.
5619 
5620    For outgoing arguments we probe if the size is larger than 1KB, such that
5621    the ABI specified buffer is maintained for the next callee.
5622 
5623    The following registers are reserved during frame layout and should not be
5624    used for any other purpose:
5625 
5626    - r11: Used by stack clash protection when SVE is enabled.
5627    - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
5628    - r14 and r15: Used for speculation tracking.
5629    - r16(IP0), r17(IP1): Used by indirect tailcalls.
5630    - r30(LR), r29(FP): Used by standard frame layout.
5631 
5632    These registers must be avoided in frame layout related code unless the
5633    explicit intention is to interact with one of the features listed above.  */
5634 
5635 /* Generate the prologue instructions for entry into a function.
5636    Establish the stack frame by decreasing the stack pointer with a
5637    properly calculated size and, if necessary, create a frame record
5638    filled with the values of LR and previous frame pointer.  The
5639    current FP is also set up if it is in use.  */
5640 
5641 void
5642 aarch64_expand_prologue (void)
5643 {
5644   poly_int64 frame_size = cfun->machine->frame.frame_size;
5645   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5646   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5647   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5648   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5649   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5650   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5651   bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
5652   rtx_insn *insn;
5653 
5654   /* Sign return address for functions.  */
5655   if (aarch64_return_address_signing_enabled ())
5656     {
5657       insn = emit_insn (gen_pacisp ());
5658       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5659       RTX_FRAME_RELATED_P (insn) = 1;
5660     }
5661 
5662   if (flag_stack_usage_info)
5663     current_function_static_stack_size = constant_lower_bound (frame_size);
5664 
5665   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
5666     {
5667       if (crtl->is_leaf && !cfun->calls_alloca)
5668 	{
5669 	  if (maybe_gt (frame_size, PROBE_INTERVAL)
5670 	      && maybe_gt (frame_size, get_stack_check_protect ()))
5671 	    aarch64_emit_probe_stack_range (get_stack_check_protect (),
5672 					    (frame_size
5673 					     - get_stack_check_protect ()));
5674 	}
5675       else if (maybe_gt (frame_size, 0))
5676 	aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
5677     }
5678 
5679   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5680   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5681 
5682   /* In theory we should never have both an initial adjustment
5683      and a callee save adjustment.  Verify that is the case since the
5684      code below does not handle it for -fstack-clash-protection.  */
5685   gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
5686 
5687   /* Will only probe if the initial adjustment is larger than the guard
5688      less the amount of the guard reserved for use by the caller's
5689      outgoing args.  */
5690   aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5691 					  true, false);
5692 
5693   if (callee_adjust != 0)
5694     aarch64_push_regs (reg1, reg2, callee_adjust);
5695 
5696   if (emit_frame_chain)
5697     {
5698       poly_int64 reg_offset = callee_adjust;
5699       if (callee_adjust == 0)
5700 	{
5701 	  reg1 = R29_REGNUM;
5702 	  reg2 = R30_REGNUM;
5703 	  reg_offset = callee_offset;
5704 	  aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5705 	}
5706       aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5707 			  stack_pointer_rtx, callee_offset,
5708 			  tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5709       if (frame_pointer_needed && !frame_size.is_constant ())
5710 	{
5711 	  /* Variable-sized frames need to describe the save slot
5712 	     address using DW_CFA_expression rather than DW_CFA_offset.
5713 	     This means that, without taking further action, the
5714 	     locations of the registers that we've already saved would
5715 	     remain based on the stack pointer even after we redefine
5716 	     the CFA based on the frame pointer.  We therefore need new
5717 	     DW_CFA_expressions to re-express the save slots with addresses
5718 	     based on the frame pointer.  */
5719 	  rtx_insn *insn = get_last_insn ();
5720 	  gcc_assert (RTX_FRAME_RELATED_P (insn));
5721 
5722 	  /* Add an explicit CFA definition if this was previously
5723 	     implicit.  */
5724 	  if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5725 	    {
5726 	      rtx src = plus_constant (Pmode, stack_pointer_rtx,
5727 				       callee_offset);
5728 	      add_reg_note (insn, REG_CFA_ADJUST_CFA,
5729 			    gen_rtx_SET (hard_frame_pointer_rtx, src));
5730 	    }
5731 
5732 	  /* Change the save slot expressions for the registers that
5733 	     we've already saved.  */
5734 	  reg_offset -= callee_offset;
5735 	  aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5736 				      reg_offset + UNITS_PER_WORD);
5737 	  aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5738 				      reg_offset);
5739 	}
5740       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5741     }
5742 
5743   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5744 			     callee_adjust != 0 || emit_frame_chain);
5745   if (aarch64_simd_decl_p (cfun->decl))
5746     aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 			       callee_adjust != 0 || emit_frame_chain);
5748   else
5749     aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5750 			       callee_adjust != 0 || emit_frame_chain);
5751 
5752   /* We may need to probe the final adjustment if it is larger than the guard
5753      that is assumed by the called.  */
5754   aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5755 					  !frame_pointer_needed, true);
5756 }
5757 
5758 /* Return TRUE if we can use a simple_return insn.
5759 
5760    This function checks whether the callee saved stack is empty, which
5761    means no restore actions are need. The pro_and_epilogue will use
5762    this to check whether shrink-wrapping opt is feasible.  */
5763 
5764 bool
5765 aarch64_use_return_insn_p (void)
5766 {
5767   if (!reload_completed)
5768     return false;
5769 
5770   if (crtl->profile)
5771     return false;
5772 
5773   return known_eq (cfun->machine->frame.frame_size, 0);
5774 }
5775 
5776 /* Return false for non-leaf SIMD functions in order to avoid
5777    shrink-wrapping them.  Doing this will lose the necessary
5778    save/restore of FP registers.  */
5779 
5780 bool
5781 aarch64_use_simple_return_insn_p (void)
5782 {
5783   if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf)
5784     return false;
5785 
5786   return true;
5787 }
5788 
5789 /* Generate the epilogue instructions for returning from a function.
5790    This is almost exactly the reverse of the prolog sequence, except
5791    that we need to insert barriers to avoid scheduling loads that read
5792    from a deallocated stack, and we optimize the unwind records by
5793    emitting them all together if possible.  */
5794 void
5795 aarch64_expand_epilogue (bool for_sibcall)
5796 {
5797   poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
5798   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
5799   poly_int64 final_adjust = cfun->machine->frame.final_adjust;
5800   poly_int64 callee_offset = cfun->machine->frame.callee_offset;
5801   unsigned reg1 = cfun->machine->frame.wb_candidate1;
5802   unsigned reg2 = cfun->machine->frame.wb_candidate2;
5803   rtx cfi_ops = NULL;
5804   rtx_insn *insn;
5805   /* A stack clash protection prologue may not have left EP0_REGNUM or
5806      EP1_REGNUM in a usable state.  The same is true for allocations
5807      with an SVE component, since we then need both temporary registers
5808      for each allocation.  For stack clash we are in a usable state if
5809      the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER.  */
5810   HOST_WIDE_INT guard_size
5811     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
5812   HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
5813 
5814   /* We can re-use the registers when the allocation amount is smaller than
5815      guard_size - guard_used_by_caller because we won't be doing any probes
5816      then.  In such situations the register should remain live with the correct
5817      value.  */
5818   bool can_inherit_p = (initial_adjust.is_constant ()
5819 			&& final_adjust.is_constant ())
5820 			&& (!flag_stack_clash_protection
5821 			    || known_lt (initial_adjust,
5822 					 guard_size - guard_used_by_caller));
5823 
5824   /* We need to add memory barrier to prevent read from deallocated stack.  */
5825   bool need_barrier_p
5826     = maybe_ne (get_frame_size ()
5827 		+ cfun->machine->frame.saved_varargs_size, 0);
5828 
5829   /* Emit a barrier to prevent loads from a deallocated stack.  */
5830   if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5831       || cfun->calls_alloca
5832       || crtl->calls_eh_return)
5833     {
5834       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5835       need_barrier_p = false;
5836     }
5837 
5838   /* Restore the stack pointer from the frame pointer if it may not
5839      be the same as the stack pointer.  */
5840   rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
5841   rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
5842   if (frame_pointer_needed
5843       && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5844     /* If writeback is used when restoring callee-saves, the CFA
5845        is restored on the instruction doing the writeback.  */
5846     aarch64_add_offset (Pmode, stack_pointer_rtx,
5847 			hard_frame_pointer_rtx, -callee_offset,
5848 			tmp1_rtx, tmp0_rtx, callee_adjust == 0);
5849   else
5850      /* The case where we need to re-use the register here is very rare, so
5851 	avoid the complicated condition and just always emit a move if the
5852 	immediate doesn't fit.  */
5853      aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
5854 
5855   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5856 				callee_adjust != 0, &cfi_ops);
5857   if (aarch64_simd_decl_p (cfun->decl))
5858     aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5859 				  callee_adjust != 0, &cfi_ops);
5860   else
5861     aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5862 				  callee_adjust != 0, &cfi_ops);
5863 
5864   if (need_barrier_p)
5865     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5866 
5867   if (callee_adjust != 0)
5868     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5869 
5870   if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5871     {
5872       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
5873       insn = get_last_insn ();
5874       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5875       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5876       RTX_FRAME_RELATED_P (insn) = 1;
5877       cfi_ops = NULL;
5878     }
5879 
5880   /* Liveness of EP0_REGNUM can not be trusted across function calls either, so
5881      add restriction on emit_move optimization to leaf functions.  */
5882   aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
5883 		  (!can_inherit_p || !crtl->is_leaf
5884 		   || df_regs_ever_live_p (EP0_REGNUM)));
5885 
5886   if (cfi_ops)
5887     {
5888       /* Emit delayed restores and reset the CFA to be SP.  */
5889       insn = get_last_insn ();
5890       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5891       REG_NOTES (insn) = cfi_ops;
5892       RTX_FRAME_RELATED_P (insn) = 1;
5893     }
5894 
5895   /* We prefer to emit the combined return/authenticate instruction RETAA,
5896      however there are three cases in which we must instead emit an explicit
5897      authentication instruction.
5898 
5899 	1) Sibcalls don't return in a normal way, so if we're about to call one
5900 	   we must authenticate.
5901 
5902 	2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5903 	   generating code for !TARGET_ARMV8_3 we can't use it and must
5904 	   explicitly authenticate.
5905 
5906 	3) On an eh_return path we make extra stack adjustments to update the
5907 	   canonical frame address to be the exception handler's CFA.  We want
5908 	   to authenticate using the CFA of the function which calls eh_return.
5909     */
5910   if (aarch64_return_address_signing_enabled ()
5911       && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5912     {
5913       insn = emit_insn (gen_autisp ());
5914       add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5915       RTX_FRAME_RELATED_P (insn) = 1;
5916     }
5917 
5918   /* Stack adjustment for exception handler.  */
5919   if (crtl->calls_eh_return)
5920     {
5921       /* We need to unwind the stack by the offset computed by
5922 	 EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
5923 	 to be SP; letting the CFA move during this adjustment
5924 	 is just as correct as retaining the CFA from the body
5925 	 of the function.  Therefore, do nothing special.  */
5926       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5927     }
5928 
5929   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5930   if (!for_sibcall)
5931     emit_jump_insn (ret_rtx);
5932 }
5933 
5934 /* Implement EH_RETURN_HANDLER_RTX.  EH returns need to either return
5935    normally or return to a previous frame after unwinding.
5936 
5937    An EH return uses a single shared return sequence.  The epilogue is
5938    exactly like a normal epilogue except that it has an extra input
5939    register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5940    that must be applied after the frame has been destroyed.  An extra label
5941    is inserted before the epilogue which initializes this register to zero,
5942    and this is the entry point for a normal return.
5943 
5944    An actual EH return updates the return address, initializes the stack
5945    adjustment and jumps directly into the epilogue (bypassing the zeroing
5946    of the adjustment).  Since the return address is typically saved on the
5947    stack when a function makes a call, the saved LR must be updated outside
5948    the epilogue.
5949 
5950    This poses problems as the store is generated well before the epilogue,
5951    so the offset of LR is not known yet.  Also optimizations will remove the
5952    store as it appears dead, even after the epilogue is generated (as the
5953    base or offset for loading LR is different in many cases).
5954 
5955    To avoid these problems this implementation forces the frame pointer
5956    in eh_return functions so that the location of LR is fixed and known early.
5957    It also marks the store volatile, so no optimization is permitted to
5958    remove the store.  */
5959 rtx
5960 aarch64_eh_return_handler_rtx (void)
5961 {
5962   rtx tmp = gen_frame_mem (Pmode,
5963     plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5964 
5965   /* Mark the store volatile, so no optimization is permitted to remove it.  */
5966   MEM_VOLATILE_P (tmp) = true;
5967   return tmp;
5968 }
5969 
5970 /* Output code to add DELTA to the first argument, and then jump
5971    to FUNCTION.  Used for C++ multiple inheritance.  */
5972 static void
5973 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5974 			 HOST_WIDE_INT delta,
5975 			 HOST_WIDE_INT vcall_offset,
5976 			 tree function)
5977 {
5978   /* The this pointer is always in x0.  Note that this differs from
5979      Arm where the this pointer maybe bumped to r1 if r0 is required
5980      to return a pointer to an aggregate.  On AArch64 a result value
5981      pointer will be in x8.  */
5982   int this_regno = R0_REGNUM;
5983   rtx this_rtx, temp0, temp1, addr, funexp;
5984   rtx_insn *insn;
5985 
5986   if (aarch64_bti_enabled ())
5987     emit_insn (gen_bti_c());
5988 
5989   reload_completed = 1;
5990   emit_note (NOTE_INSN_PROLOGUE_END);
5991 
5992   this_rtx = gen_rtx_REG (Pmode, this_regno);
5993   temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
5994   temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
5995 
5996   if (vcall_offset == 0)
5997     aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5998   else
5999     {
6000       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
6001 
6002       addr = this_rtx;
6003       if (delta != 0)
6004 	{
6005 	  if (delta >= -256 && delta < 256)
6006 	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
6007 				       plus_constant (Pmode, this_rtx, delta));
6008 	  else
6009 	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
6010 				temp1, temp0, false);
6011 	}
6012 
6013       if (Pmode == ptr_mode)
6014 	aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
6015       else
6016 	aarch64_emit_move (temp0,
6017 			   gen_rtx_ZERO_EXTEND (Pmode,
6018 						gen_rtx_MEM (ptr_mode, addr)));
6019 
6020       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
6021 	  addr = plus_constant (Pmode, temp0, vcall_offset);
6022       else
6023 	{
6024 	  aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
6025 					  Pmode);
6026 	  addr = gen_rtx_PLUS (Pmode, temp0, temp1);
6027 	}
6028 
6029       if (Pmode == ptr_mode)
6030 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
6031       else
6032 	aarch64_emit_move (temp1,
6033 			   gen_rtx_SIGN_EXTEND (Pmode,
6034 						gen_rtx_MEM (ptr_mode, addr)));
6035 
6036       emit_insn (gen_add2_insn (this_rtx, temp1));
6037     }
6038 
6039   /* Generate a tail call to the target function.  */
6040   if (!TREE_USED (function))
6041     {
6042       assemble_external (function);
6043       TREE_USED (function) = 1;
6044     }
6045   funexp = XEXP (DECL_RTL (function), 0);
6046   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
6047   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
6048   SIBLING_CALL_P (insn) = 1;
6049 
6050   insn = get_insns ();
6051   shorten_branches (insn);
6052   final_start_function (insn, file, 1);
6053   final (insn, file, 1);
6054   final_end_function ();
6055 
6056   /* Stop pretending to be a post-reload pass.  */
6057   reload_completed = 0;
6058 }
6059 
6060 static bool
6061 aarch64_tls_referenced_p (rtx x)
6062 {
6063   if (!TARGET_HAVE_TLS)
6064     return false;
6065   subrtx_iterator::array_type array;
6066   FOR_EACH_SUBRTX (iter, array, x, ALL)
6067     {
6068       const_rtx x = *iter;
6069       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
6070 	return true;
6071       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
6072 	 TLS offsets, not real symbol references.  */
6073       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6074 	iter.skip_subrtxes ();
6075     }
6076   return false;
6077 }
6078 
6079 
6080 /* Return true if val can be encoded as a 12-bit unsigned immediate with
6081    a left shift of 0 or 12 bits.  */
6082 bool
6083 aarch64_uimm12_shift (HOST_WIDE_INT val)
6084 {
6085   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
6086 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
6087 	  );
6088 }
6089 
6090 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
6091    that can be created with a left shift of 0 or 12.  */
6092 static HOST_WIDE_INT
6093 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
6094 {
6095   /* Check to see if the value fits in 24 bits, as that is the maximum we can
6096      handle correctly.  */
6097   gcc_assert ((val & 0xffffff) == val);
6098 
6099   if (((val & 0xfff) << 0) == val)
6100     return val;
6101 
6102   return val & (0xfff << 12);
6103 }
6104 
6105 /* Return true if val is an immediate that can be loaded into a
6106    register by a MOVZ instruction.  */
6107 static bool
6108 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
6109 {
6110   if (GET_MODE_SIZE (mode) > 4)
6111     {
6112       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
6113 	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
6114 	return 1;
6115     }
6116   else
6117     {
6118       /* Ignore sign extension.  */
6119       val &= (HOST_WIDE_INT) 0xffffffff;
6120     }
6121   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
6122 	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
6123 }
6124 
6125 /* VAL is a value with the inner mode of MODE.  Replicate it to fill a
6126    64-bit (DImode) integer.  */
6127 
6128 static unsigned HOST_WIDE_INT
6129 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
6130 {
6131   unsigned int size = GET_MODE_UNIT_PRECISION (mode);
6132   while (size < 64)
6133     {
6134       val &= (HOST_WIDE_INT_1U << size) - 1;
6135       val |= val << size;
6136       size *= 2;
6137     }
6138   return val;
6139 }
6140 
6141 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
6142 
6143 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
6144   {
6145     0x0000000100000001ull,
6146     0x0001000100010001ull,
6147     0x0101010101010101ull,
6148     0x1111111111111111ull,
6149     0x5555555555555555ull,
6150   };
6151 
6152 
6153 /* Return true if val is a valid bitmask immediate.  */
6154 
6155 bool
6156 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
6157 {
6158   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
6159   int bits;
6160 
6161   /* Check for a single sequence of one bits and return quickly if so.
6162      The special cases of all ones and all zeroes returns false.  */
6163   val = aarch64_replicate_bitmask_imm (val_in, mode);
6164   tmp = val + (val & -val);
6165 
6166   if (tmp == (tmp & -tmp))
6167     return (val + 1) > 1;
6168 
6169   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
6170   if (mode == SImode)
6171     val = (val << 32) | (val & 0xffffffff);
6172 
6173   /* Invert if the immediate doesn't start with a zero bit - this means we
6174      only need to search for sequences of one bits.  */
6175   if (val & 1)
6176     val = ~val;
6177 
6178   /* Find the first set bit and set tmp to val with the first sequence of one
6179      bits removed.  Return success if there is a single sequence of ones.  */
6180   first_one = val & -val;
6181   tmp = val & (val + first_one);
6182 
6183   if (tmp == 0)
6184     return true;
6185 
6186   /* Find the next set bit and compute the difference in bit position.  */
6187   next_one = tmp & -tmp;
6188   bits = clz_hwi (first_one) - clz_hwi (next_one);
6189   mask = val ^ tmp;
6190 
6191   /* Check the bit position difference is a power of 2, and that the first
6192      sequence of one bits fits within 'bits' bits.  */
6193   if ((mask >> bits) != 0 || bits != (bits & -bits))
6194     return false;
6195 
6196   /* Check the sequence of one bits is repeated 64/bits times.  */
6197   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
6198 }
6199 
6200 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
6201    Assumed precondition: VAL_IN Is not zero.  */
6202 
6203 unsigned HOST_WIDE_INT
6204 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
6205 {
6206   int lowest_bit_set = ctz_hwi (val_in);
6207   int highest_bit_set = floor_log2 (val_in);
6208   gcc_assert (val_in != 0);
6209 
6210   return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
6211 	  (HOST_WIDE_INT_1U << lowest_bit_set));
6212 }
6213 
6214 /* Create constant where bits outside of lowest bit set to highest bit set
6215    are set to 1.  */
6216 
6217 unsigned HOST_WIDE_INT
6218 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
6219 {
6220   return val_in | ~aarch64_and_split_imm1 (val_in);
6221 }
6222 
6223 /* Return true if VAL_IN is a valid 'and' bitmask immediate.  */
6224 
6225 bool
6226 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
6227 {
6228   scalar_int_mode int_mode;
6229   if (!is_a <scalar_int_mode> (mode, &int_mode))
6230     return false;
6231 
6232   if (aarch64_bitmask_imm (val_in, int_mode))
6233     return false;
6234 
6235   if (aarch64_move_imm (val_in, int_mode))
6236     return false;
6237 
6238   unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
6239 
6240   return aarch64_bitmask_imm (imm2, int_mode);
6241 }
6242 
6243 /* Return true if val is an immediate that can be loaded into a
6244    register in a single instruction.  */
6245 bool
6246 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
6247 {
6248   scalar_int_mode int_mode;
6249   if (!is_a <scalar_int_mode> (mode, &int_mode))
6250     return false;
6251 
6252   if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
6253     return 1;
6254   return aarch64_bitmask_imm (val, int_mode);
6255 }
6256 
6257 static bool
6258 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
6259 {
6260   rtx base, offset;
6261 
6262   if (GET_CODE (x) == HIGH)
6263     return true;
6264 
6265   /* There's no way to calculate VL-based values using relocations.  */
6266   subrtx_iterator::array_type array;
6267   FOR_EACH_SUBRTX (iter, array, x, ALL)
6268     if (GET_CODE (*iter) == CONST_POLY_INT)
6269       return true;
6270 
6271   split_const (x, &base, &offset);
6272   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
6273     {
6274       if (aarch64_classify_symbol (base, INTVAL (offset))
6275 	  != SYMBOL_FORCE_TO_MEM)
6276 	return true;
6277       else
6278 	/* Avoid generating a 64-bit relocation in ILP32; leave
6279 	   to aarch64_expand_mov_immediate to handle it properly.  */
6280 	return mode != ptr_mode;
6281     }
6282 
6283   return aarch64_tls_referenced_p (x);
6284 }
6285 
6286 /* Implement TARGET_CASE_VALUES_THRESHOLD.
6287    The expansion for a table switch is quite expensive due to the number
6288    of instructions, the table lookup and hard to predict indirect jump.
6289    When optimizing for speed, and -O3 enabled, use the per-core tuning if
6290    set, otherwise use tables for > 16 cases as a tradeoff between size and
6291    performance.  When optimizing for size, use the default setting.  */
6292 
6293 static unsigned int
6294 aarch64_case_values_threshold (void)
6295 {
6296   /* Use the specified limit for the number of cases before using jump
6297      tables at higher optimization levels.  */
6298   if (optimize > 2
6299       && selected_cpu->tune->max_case_values != 0)
6300     return selected_cpu->tune->max_case_values;
6301   else
6302     return optimize_size ? default_case_values_threshold () : 17;
6303 }
6304 
6305 /* Return true if register REGNO is a valid index register.
6306    STRICT_P is true if REG_OK_STRICT is in effect.  */
6307 
6308 bool
6309 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
6310 {
6311   if (!HARD_REGISTER_NUM_P (regno))
6312     {
6313       if (!strict_p)
6314 	return true;
6315 
6316       if (!reg_renumber)
6317 	return false;
6318 
6319       regno = reg_renumber[regno];
6320     }
6321   return GP_REGNUM_P (regno);
6322 }
6323 
6324 /* Return true if register REGNO is a valid base register for mode MODE.
6325    STRICT_P is true if REG_OK_STRICT is in effect.  */
6326 
6327 bool
6328 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
6329 {
6330   if (!HARD_REGISTER_NUM_P (regno))
6331     {
6332       if (!strict_p)
6333 	return true;
6334 
6335       if (!reg_renumber)
6336 	return false;
6337 
6338       regno = reg_renumber[regno];
6339     }
6340 
6341   /* The fake registers will be eliminated to either the stack or
6342      hard frame pointer, both of which are usually valid base registers.
6343      Reload deals with the cases where the eliminated form isn't valid.  */
6344   return (GP_REGNUM_P (regno)
6345 	  || regno == SP_REGNUM
6346 	  || regno == FRAME_POINTER_REGNUM
6347 	  || regno == ARG_POINTER_REGNUM);
6348 }
6349 
6350 /* Return true if X is a valid base register for mode MODE.
6351    STRICT_P is true if REG_OK_STRICT is in effect.  */
6352 
6353 static bool
6354 aarch64_base_register_rtx_p (rtx x, bool strict_p)
6355 {
6356   if (!strict_p
6357       && GET_CODE (x) == SUBREG
6358       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
6359     x = SUBREG_REG (x);
6360 
6361   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
6362 }
6363 
6364 /* Return true if address offset is a valid index.  If it is, fill in INFO
6365    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6366 
6367 static bool
6368 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
6369 			machine_mode mode, bool strict_p)
6370 {
6371   enum aarch64_address_type type;
6372   rtx index;
6373   int shift;
6374 
6375   /* (reg:P) */
6376   if ((REG_P (x) || GET_CODE (x) == SUBREG)
6377       && GET_MODE (x) == Pmode)
6378     {
6379       type = ADDRESS_REG_REG;
6380       index = x;
6381       shift = 0;
6382     }
6383   /* (sign_extend:DI (reg:SI)) */
6384   else if ((GET_CODE (x) == SIGN_EXTEND
6385 	    || GET_CODE (x) == ZERO_EXTEND)
6386 	   && GET_MODE (x) == DImode
6387 	   && GET_MODE (XEXP (x, 0)) == SImode)
6388     {
6389       type = (GET_CODE (x) == SIGN_EXTEND)
6390 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6391       index = XEXP (x, 0);
6392       shift = 0;
6393     }
6394   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
6395   else if (GET_CODE (x) == MULT
6396 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6397 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6398 	   && GET_MODE (XEXP (x, 0)) == DImode
6399 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6400 	   && CONST_INT_P (XEXP (x, 1)))
6401     {
6402       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6403 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6404       index = XEXP (XEXP (x, 0), 0);
6405       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6406     }
6407   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
6408   else if (GET_CODE (x) == ASHIFT
6409 	   && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
6410 	       || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
6411 	   && GET_MODE (XEXP (x, 0)) == DImode
6412 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
6413 	   && CONST_INT_P (XEXP (x, 1)))
6414     {
6415       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
6416 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6417       index = XEXP (XEXP (x, 0), 0);
6418       shift = INTVAL (XEXP (x, 1));
6419     }
6420   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
6421   else if ((GET_CODE (x) == SIGN_EXTRACT
6422 	    || GET_CODE (x) == ZERO_EXTRACT)
6423 	   && GET_MODE (x) == DImode
6424 	   && GET_CODE (XEXP (x, 0)) == MULT
6425 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6426 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6427     {
6428       type = (GET_CODE (x) == SIGN_EXTRACT)
6429 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6430       index = XEXP (XEXP (x, 0), 0);
6431       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6432       if (INTVAL (XEXP (x, 1)) != 32 + shift
6433 	  || INTVAL (XEXP (x, 2)) != 0)
6434 	shift = -1;
6435     }
6436   /* (and:DI (mult:DI (reg:DI) (const_int scale))
6437      (const_int 0xffffffff<<shift)) */
6438   else if (GET_CODE (x) == AND
6439 	   && GET_MODE (x) == DImode
6440 	   && GET_CODE (XEXP (x, 0)) == MULT
6441 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6442 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6443 	   && CONST_INT_P (XEXP (x, 1)))
6444     {
6445       type = ADDRESS_REG_UXTW;
6446       index = XEXP (XEXP (x, 0), 0);
6447       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
6448       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6449 	shift = -1;
6450     }
6451   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
6452   else if ((GET_CODE (x) == SIGN_EXTRACT
6453 	    || GET_CODE (x) == ZERO_EXTRACT)
6454 	   && GET_MODE (x) == DImode
6455 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
6456 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6457 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
6458     {
6459       type = (GET_CODE (x) == SIGN_EXTRACT)
6460 	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
6461       index = XEXP (XEXP (x, 0), 0);
6462       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6463       if (INTVAL (XEXP (x, 1)) != 32 + shift
6464 	  || INTVAL (XEXP (x, 2)) != 0)
6465 	shift = -1;
6466     }
6467   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
6468      (const_int 0xffffffff<<shift)) */
6469   else if (GET_CODE (x) == AND
6470 	   && GET_MODE (x) == DImode
6471 	   && GET_CODE (XEXP (x, 0)) == ASHIFT
6472 	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
6473 	   && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6474 	   && CONST_INT_P (XEXP (x, 1)))
6475     {
6476       type = ADDRESS_REG_UXTW;
6477       index = XEXP (XEXP (x, 0), 0);
6478       shift = INTVAL (XEXP (XEXP (x, 0), 1));
6479       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
6480 	shift = -1;
6481     }
6482   /* (mult:P (reg:P) (const_int scale)) */
6483   else if (GET_CODE (x) == MULT
6484 	   && GET_MODE (x) == Pmode
6485 	   && GET_MODE (XEXP (x, 0)) == Pmode
6486 	   && CONST_INT_P (XEXP (x, 1)))
6487     {
6488       type = ADDRESS_REG_REG;
6489       index = XEXP (x, 0);
6490       shift = exact_log2 (INTVAL (XEXP (x, 1)));
6491     }
6492   /* (ashift:P (reg:P) (const_int shift)) */
6493   else if (GET_CODE (x) == ASHIFT
6494 	   && GET_MODE (x) == Pmode
6495 	   && GET_MODE (XEXP (x, 0)) == Pmode
6496 	   && CONST_INT_P (XEXP (x, 1)))
6497     {
6498       type = ADDRESS_REG_REG;
6499       index = XEXP (x, 0);
6500       shift = INTVAL (XEXP (x, 1));
6501     }
6502   else
6503     return false;
6504 
6505   if (!strict_p
6506       && GET_CODE (index) == SUBREG
6507       && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
6508     index = SUBREG_REG (index);
6509 
6510   if (aarch64_sve_data_mode_p (mode))
6511     {
6512       if (type != ADDRESS_REG_REG
6513 	  || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
6514 	return false;
6515     }
6516   else
6517     {
6518       if (shift != 0
6519 	  && !(IN_RANGE (shift, 1, 3)
6520 	       && known_eq (1 << shift, GET_MODE_SIZE (mode))))
6521 	return false;
6522     }
6523 
6524   if (REG_P (index)
6525       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
6526     {
6527       info->type = type;
6528       info->offset = index;
6529       info->shift = shift;
6530       return true;
6531     }
6532 
6533   return false;
6534 }
6535 
6536 /* Return true if MODE is one of the modes for which we
6537    support LDP/STP operations.  */
6538 
6539 static bool
6540 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
6541 {
6542   return mode == SImode || mode == DImode
6543 	 || mode == SFmode || mode == DFmode
6544 	 || (aarch64_vector_mode_supported_p (mode)
6545 	     && (known_eq (GET_MODE_SIZE (mode), 8)
6546 		 || (known_eq (GET_MODE_SIZE (mode), 16)
6547 		    && (aarch64_tune_params.extra_tuning_flags
6548 			& AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
6549 }
6550 
6551 /* Return true if REGNO is a virtual pointer register, or an eliminable
6552    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
6553    include stack_pointer or hard_frame_pointer.  */
6554 static bool
6555 virt_or_elim_regno_p (unsigned regno)
6556 {
6557   return ((regno >= FIRST_VIRTUAL_REGISTER
6558 	   && regno <= LAST_VIRTUAL_POINTER_REGISTER)
6559 	  || regno == FRAME_POINTER_REGNUM
6560 	  || regno == ARG_POINTER_REGNUM);
6561 }
6562 
6563 /* Return true if X is a valid address of type TYPE for machine mode MODE.
6564    If it is, fill in INFO appropriately.  STRICT_P is true if
6565    REG_OK_STRICT is in effect.  */
6566 
6567 bool
6568 aarch64_classify_address (struct aarch64_address_info *info,
6569 			  rtx x, machine_mode mode, bool strict_p,
6570 			  aarch64_addr_query_type type)
6571 {
6572   enum rtx_code code = GET_CODE (x);
6573   rtx op0, op1;
6574   poly_int64 offset;
6575 
6576   HOST_WIDE_INT const_size;
6577 
6578   /* On BE, we use load/store pair for all large int mode load/stores.
6579      TI/TFmode may also use a load/store pair.  */
6580   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6581   bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
6582   bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
6583 			    || type == ADDR_QUERY_LDP_STP_N
6584 			    || mode == TImode
6585 			    || mode == TFmode
6586 			    || (BYTES_BIG_ENDIAN && advsimd_struct_p));
6587 
6588   /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
6589      corresponds to the actual size of the memory being loaded/stored and the
6590      mode of the corresponding addressing mode is half of that.  */
6591   if (type == ADDR_QUERY_LDP_STP_N
6592       && known_eq (GET_MODE_SIZE (mode), 16))
6593     mode = DFmode;
6594 
6595   bool allow_reg_index_p = (!load_store_pair_p
6596 			    && (known_lt (GET_MODE_SIZE (mode), 16)
6597 				|| vec_flags == VEC_ADVSIMD
6598 				|| vec_flags == VEC_SVE_DATA));
6599 
6600   /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
6601      [Rn, #offset, MUL VL].  */
6602   if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
6603       && (code != REG && code != PLUS))
6604     return false;
6605 
6606   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
6607      REG addressing.  */
6608   if (advsimd_struct_p
6609       && !BYTES_BIG_ENDIAN
6610       && (code != POST_INC && code != REG))
6611     return false;
6612 
6613   gcc_checking_assert (GET_MODE (x) == VOIDmode
6614 		       || SCALAR_INT_MODE_P (GET_MODE (x)));
6615 
6616   switch (code)
6617     {
6618     case REG:
6619     case SUBREG:
6620       info->type = ADDRESS_REG_IMM;
6621       info->base = x;
6622       info->offset = const0_rtx;
6623       info->const_offset = 0;
6624       return aarch64_base_register_rtx_p (x, strict_p);
6625 
6626     case PLUS:
6627       op0 = XEXP (x, 0);
6628       op1 = XEXP (x, 1);
6629 
6630       if (! strict_p
6631 	  && REG_P (op0)
6632 	  && virt_or_elim_regno_p (REGNO (op0))
6633 	  && poly_int_rtx_p (op1, &offset))
6634 	{
6635 	  info->type = ADDRESS_REG_IMM;
6636 	  info->base = op0;
6637 	  info->offset = op1;
6638 	  info->const_offset = offset;
6639 
6640 	  return true;
6641 	}
6642 
6643       if (maybe_ne (GET_MODE_SIZE (mode), 0)
6644 	  && aarch64_base_register_rtx_p (op0, strict_p)
6645 	  && poly_int_rtx_p (op1, &offset))
6646 	{
6647 	  info->type = ADDRESS_REG_IMM;
6648 	  info->base = op0;
6649 	  info->offset = op1;
6650 	  info->const_offset = offset;
6651 
6652 	  /* TImode and TFmode values are allowed in both pairs of X
6653 	     registers and individual Q registers.  The available
6654 	     address modes are:
6655 	     X,X: 7-bit signed scaled offset
6656 	     Q:   9-bit signed offset
6657 	     We conservatively require an offset representable in either mode.
6658 	     When performing the check for pairs of X registers i.e.  LDP/STP
6659 	     pass down DImode since that is the natural size of the LDP/STP
6660 	     instruction memory accesses.  */
6661 	  if (mode == TImode || mode == TFmode)
6662 	    return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
6663 		    && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6664 			|| offset_12bit_unsigned_scaled_p (mode, offset)));
6665 
6666 	  /* A 7bit offset check because OImode will emit a ldp/stp
6667 	     instruction (only big endian will get here).
6668 	     For ldp/stp instructions, the offset is scaled for the size of a
6669 	     single element of the pair.  */
6670 	  if (mode == OImode)
6671 	    return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
6672 
6673 	  /* Three 9/12 bit offsets checks because CImode will emit three
6674 	     ldr/str instructions (only big endian will get here).  */
6675 	  if (mode == CImode)
6676 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6677 		    && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
6678 							       offset + 32)
6679 			|| offset_12bit_unsigned_scaled_p (V16QImode,
6680 							   offset + 32)));
6681 
6682 	  /* Two 7bit offsets checks because XImode will emit two ldp/stp
6683 	     instructions (only big endian will get here).  */
6684 	  if (mode == XImode)
6685 	    return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
6686 		    && aarch64_offset_7bit_signed_scaled_p (TImode,
6687 							    offset + 32));
6688 
6689 	  /* Make "m" use the LD1 offset range for SVE data modes, so
6690 	     that pre-RTL optimizers like ivopts will work to that
6691 	     instead of the wider LDR/STR range.  */
6692 	  if (vec_flags == VEC_SVE_DATA)
6693 	    return (type == ADDR_QUERY_M
6694 		    ? offset_4bit_signed_scaled_p (mode, offset)
6695 		    : offset_9bit_signed_scaled_p (mode, offset));
6696 
6697 	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
6698 	    {
6699 	      poly_int64 end_offset = (offset
6700 				       + GET_MODE_SIZE (mode)
6701 				       - BYTES_PER_SVE_VECTOR);
6702 	      return (type == ADDR_QUERY_M
6703 		      ? offset_4bit_signed_scaled_p (mode, offset)
6704 		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
6705 			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
6706 							 end_offset)));
6707 	    }
6708 
6709 	  if (vec_flags == VEC_SVE_PRED)
6710 	    return offset_9bit_signed_scaled_p (mode, offset);
6711 
6712 	  if (load_store_pair_p)
6713 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
6714 		     || known_eq (GET_MODE_SIZE (mode), 8)
6715 		     || known_eq (GET_MODE_SIZE (mode), 16))
6716 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6717 	  else
6718 	    return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
6719 		    || offset_12bit_unsigned_scaled_p (mode, offset));
6720 	}
6721 
6722       if (allow_reg_index_p)
6723 	{
6724 	  /* Look for base + (scaled/extended) index register.  */
6725 	  if (aarch64_base_register_rtx_p (op0, strict_p)
6726 	      && aarch64_classify_index (info, op1, mode, strict_p))
6727 	    {
6728 	      info->base = op0;
6729 	      return true;
6730 	    }
6731 	  if (aarch64_base_register_rtx_p (op1, strict_p)
6732 	      && aarch64_classify_index (info, op0, mode, strict_p))
6733 	    {
6734 	      info->base = op1;
6735 	      return true;
6736 	    }
6737 	}
6738 
6739       return false;
6740 
6741     case POST_INC:
6742     case POST_DEC:
6743     case PRE_INC:
6744     case PRE_DEC:
6745       info->type = ADDRESS_REG_WB;
6746       info->base = XEXP (x, 0);
6747       info->offset = NULL_RTX;
6748       return aarch64_base_register_rtx_p (info->base, strict_p);
6749 
6750     case POST_MODIFY:
6751     case PRE_MODIFY:
6752       info->type = ADDRESS_REG_WB;
6753       info->base = XEXP (x, 0);
6754       if (GET_CODE (XEXP (x, 1)) == PLUS
6755 	  && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
6756 	  && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
6757 	  && aarch64_base_register_rtx_p (info->base, strict_p))
6758 	{
6759 	  info->offset = XEXP (XEXP (x, 1), 1);
6760 	  info->const_offset = offset;
6761 
6762 	  /* TImode and TFmode values are allowed in both pairs of X
6763 	     registers and individual Q registers.  The available
6764 	     address modes are:
6765 	     X,X: 7-bit signed scaled offset
6766 	     Q:   9-bit signed offset
6767 	     We conservatively require an offset representable in either mode.
6768 	   */
6769 	  if (mode == TImode || mode == TFmode)
6770 	    return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
6771 		    && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
6772 
6773 	  if (load_store_pair_p)
6774 	    return ((known_eq (GET_MODE_SIZE (mode), 4)
6775 		     || known_eq (GET_MODE_SIZE (mode), 8)
6776 		     || known_eq (GET_MODE_SIZE (mode), 16))
6777 		    && aarch64_offset_7bit_signed_scaled_p (mode, offset));
6778 	  else
6779 	    return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
6780 	}
6781       return false;
6782 
6783     case CONST:
6784     case SYMBOL_REF:
6785     case LABEL_REF:
6786       /* load literal: pc-relative constant pool entry.  Only supported
6787          for SI mode or larger.  */
6788       info->type = ADDRESS_SYMBOLIC;
6789 
6790       if (!load_store_pair_p
6791 	  && GET_MODE_SIZE (mode).is_constant (&const_size)
6792 	  && const_size >= 4)
6793 	{
6794 	  rtx sym, addend;
6795 
6796 	  split_const (x, &sym, &addend);
6797 	  return ((GET_CODE (sym) == LABEL_REF
6798 		   || (GET_CODE (sym) == SYMBOL_REF
6799 		       && CONSTANT_POOL_ADDRESS_P (sym)
6800 		       && aarch64_pcrelative_literal_loads)));
6801 	}
6802       return false;
6803 
6804     case LO_SUM:
6805       info->type = ADDRESS_LO_SUM;
6806       info->base = XEXP (x, 0);
6807       info->offset = XEXP (x, 1);
6808       if (allow_reg_index_p
6809 	  && aarch64_base_register_rtx_p (info->base, strict_p))
6810 	{
6811 	  rtx sym, offs;
6812 	  split_const (info->offset, &sym, &offs);
6813 	  if (GET_CODE (sym) == SYMBOL_REF
6814 	      && (aarch64_classify_symbol (sym, INTVAL (offs))
6815 		  == SYMBOL_SMALL_ABSOLUTE))
6816 	    {
6817 	      /* The symbol and offset must be aligned to the access size.  */
6818 	      unsigned int align;
6819 
6820 	      if (CONSTANT_POOL_ADDRESS_P (sym))
6821 		align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
6822 	      else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
6823 		{
6824 		  tree exp = SYMBOL_REF_DECL (sym);
6825 		  align = TYPE_ALIGN (TREE_TYPE (exp));
6826 		  align = aarch64_constant_alignment (exp, align);
6827 		}
6828 	      else if (SYMBOL_REF_DECL (sym))
6829 		align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
6830 	      else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
6831 		       && SYMBOL_REF_BLOCK (sym) != NULL)
6832 		align = SYMBOL_REF_BLOCK (sym)->alignment;
6833 	      else
6834 		align = BITS_PER_UNIT;
6835 
6836 	      poly_int64 ref_size = GET_MODE_SIZE (mode);
6837 	      if (known_eq (ref_size, 0))
6838 		ref_size = GET_MODE_SIZE (DImode);
6839 
6840 	      return (multiple_p (INTVAL (offs), ref_size)
6841 		      && multiple_p (align / BITS_PER_UNIT, ref_size));
6842 	    }
6843 	}
6844       return false;
6845 
6846     default:
6847       return false;
6848     }
6849 }
6850 
6851 /* Return true if the address X is valid for a PRFM instruction.
6852    STRICT_P is true if we should do strict checking with
6853    aarch64_classify_address.  */
6854 
6855 bool
6856 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6857 {
6858   struct aarch64_address_info addr;
6859 
6860   /* PRFM accepts the same addresses as DImode...  */
6861   bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6862   if (!res)
6863     return false;
6864 
6865   /* ... except writeback forms.  */
6866   return addr.type != ADDRESS_REG_WB;
6867 }
6868 
6869 bool
6870 aarch64_symbolic_address_p (rtx x)
6871 {
6872   rtx offset;
6873 
6874   split_const (x, &x, &offset);
6875   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6876 }
6877 
6878 /* Classify the base of symbolic expression X.  */
6879 
6880 enum aarch64_symbol_type
6881 aarch64_classify_symbolic_expression (rtx x)
6882 {
6883   rtx offset;
6884 
6885   split_const (x, &x, &offset);
6886   return aarch64_classify_symbol (x, INTVAL (offset));
6887 }
6888 
6889 
6890 /* Return TRUE if X is a legitimate address for accessing memory in
6891    mode MODE.  */
6892 static bool
6893 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6894 {
6895   struct aarch64_address_info addr;
6896 
6897   return aarch64_classify_address (&addr, x, mode, strict_p);
6898 }
6899 
6900 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6901    memory in mode MODE.  STRICT_P is true if REG_OK_STRICT is in effect.  */
6902 bool
6903 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6904 			      aarch64_addr_query_type type)
6905 {
6906   struct aarch64_address_info addr;
6907 
6908   return aarch64_classify_address (&addr, x, mode, strict_p, type);
6909 }
6910 
6911 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT.  */
6912 
6913 static bool
6914 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6915 					 poly_int64 orig_offset,
6916 					 machine_mode mode)
6917 {
6918   HOST_WIDE_INT size;
6919   if (GET_MODE_SIZE (mode).is_constant (&size))
6920     {
6921       HOST_WIDE_INT const_offset, second_offset;
6922 
6923       /* A general SVE offset is A * VQ + B.  Remove the A component from
6924 	 coefficient 0 in order to get the constant B.  */
6925       const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6926 
6927       /* Split an out-of-range address displacement into a base and
6928 	 offset.  Use 4KB range for 1- and 2-byte accesses and a 16KB
6929 	 range otherwise to increase opportunities for sharing the base
6930 	 address of different sizes.  Unaligned accesses use the signed
6931 	 9-bit range, TImode/TFmode use the intersection of signed
6932 	 scaled 7-bit and signed 9-bit offset.  */
6933       if (mode == TImode || mode == TFmode)
6934 	second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6935       else if ((const_offset & (size - 1)) != 0)
6936 	second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6937       else
6938 	second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6939 
6940       if (second_offset == 0 || known_eq (orig_offset, second_offset))
6941 	return false;
6942 
6943       /* Split the offset into second_offset and the rest.  */
6944       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6945       *offset2 = gen_int_mode (second_offset, Pmode);
6946       return true;
6947     }
6948   else
6949     {
6950       /* Get the mode we should use as the basis of the range.  For structure
6951 	 modes this is the mode of one vector.  */
6952       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6953       machine_mode step_mode
6954 	= (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6955 
6956       /* Get the "mul vl" multiplier we'd like to use.  */
6957       HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6958       HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6959       if (vec_flags & VEC_SVE_DATA)
6960 	/* LDR supports a 9-bit range, but the move patterns for
6961 	   structure modes require all vectors to be in range of the
6962 	   same base.  The simplest way of accomodating that while still
6963 	   promoting reuse of anchor points between different modes is
6964 	   to use an 8-bit range unconditionally.  */
6965 	vnum = ((vnum + 128) & 255) - 128;
6966       else
6967 	/* Predicates are only handled singly, so we might as well use
6968 	   the full range.  */
6969 	vnum = ((vnum + 256) & 511) - 256;
6970       if (vnum == 0)
6971 	return false;
6972 
6973       /* Convert the "mul vl" multiplier into a byte offset.  */
6974       poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6975       if (known_eq (second_offset, orig_offset))
6976 	return false;
6977 
6978       /* Split the offset into second_offset and the rest.  */
6979       *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6980       *offset2 = gen_int_mode (second_offset, Pmode);
6981       return true;
6982     }
6983 }
6984 
6985 /* Return the binary representation of floating point constant VALUE in INTVAL.
6986    If the value cannot be converted, return false without setting INTVAL.
6987    The conversion is done in the given MODE.  */
6988 bool
6989 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6990 {
6991 
6992   /* We make a general exception for 0.  */
6993   if (aarch64_float_const_zero_rtx_p (value))
6994     {
6995       *intval = 0;
6996       return true;
6997     }
6998 
6999   scalar_float_mode mode;
7000   if (GET_CODE (value) != CONST_DOUBLE
7001       || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
7002       || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
7003       /* Only support up to DF mode.  */
7004       || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
7005     return false;
7006 
7007   unsigned HOST_WIDE_INT ival = 0;
7008 
7009   long res[2];
7010   real_to_target (res,
7011 		  CONST_DOUBLE_REAL_VALUE (value),
7012 		  REAL_MODE_FORMAT (mode));
7013 
7014   if (mode == DFmode)
7015     {
7016       int order = BYTES_BIG_ENDIAN ? 1 : 0;
7017       ival = zext_hwi (res[order], 32);
7018       ival |= (zext_hwi (res[1 - order], 32) << 32);
7019     }
7020   else
7021       ival = zext_hwi (res[0], 32);
7022 
7023   *intval = ival;
7024   return true;
7025 }
7026 
7027 /* Return TRUE if rtx X is an immediate constant that can be moved using a
7028    single MOV(+MOVK) followed by an FMOV.  */
7029 bool
7030 aarch64_float_const_rtx_p (rtx x)
7031 {
7032   machine_mode mode = GET_MODE (x);
7033   if (mode == VOIDmode)
7034     return false;
7035 
7036   /* Determine whether it's cheaper to write float constants as
7037      mov/movk pairs over ldr/adrp pairs.  */
7038   unsigned HOST_WIDE_INT ival;
7039 
7040   if (GET_CODE (x) == CONST_DOUBLE
7041       && SCALAR_FLOAT_MODE_P (mode)
7042       && aarch64_reinterpret_float_as_int (x, &ival))
7043     {
7044       scalar_int_mode imode = (mode == HFmode
7045 			       ? SImode
7046 			       : int_mode_for_mode (mode).require ());
7047       int num_instr = aarch64_internal_mov_immediate
7048 			(NULL_RTX, gen_int_mode (ival, imode), false, imode);
7049       return num_instr < 3;
7050     }
7051 
7052   return false;
7053 }
7054 
7055 /* Return TRUE if rtx X is immediate constant 0.0 */
7056 bool
7057 aarch64_float_const_zero_rtx_p (rtx x)
7058 {
7059   if (GET_MODE (x) == VOIDmode)
7060     return false;
7061 
7062   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
7063     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
7064   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
7065 }
7066 
7067 /* Return TRUE if rtx X is immediate constant that fits in a single
7068    MOVI immediate operation.  */
7069 bool
7070 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
7071 {
7072   if (!TARGET_SIMD)
7073      return false;
7074 
7075   machine_mode vmode;
7076   scalar_int_mode imode;
7077   unsigned HOST_WIDE_INT ival;
7078 
7079   if (GET_CODE (x) == CONST_DOUBLE
7080       && SCALAR_FLOAT_MODE_P (mode))
7081     {
7082       if (!aarch64_reinterpret_float_as_int (x, &ival))
7083 	return false;
7084 
7085       /* We make a general exception for 0.  */
7086       if (aarch64_float_const_zero_rtx_p (x))
7087 	return true;
7088 
7089       imode = int_mode_for_mode (mode).require ();
7090     }
7091   else if (GET_CODE (x) == CONST_INT
7092 	   && is_a <scalar_int_mode> (mode, &imode))
7093     ival = INTVAL (x);
7094   else
7095     return false;
7096 
7097    /* use a 64 bit mode for everything except for DI/DF mode, where we use
7098      a 128 bit vector mode.  */
7099   int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
7100 
7101   vmode = aarch64_simd_container_mode (imode, width);
7102   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
7103 
7104   return aarch64_simd_valid_immediate (v_op, NULL);
7105 }
7106 
7107 
7108 /* Return the fixed registers used for condition codes.  */
7109 
7110 static bool
7111 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
7112 {
7113   *p1 = CC_REGNUM;
7114   *p2 = INVALID_REGNUM;
7115   return true;
7116 }
7117 
7118 /* This function is used by the call expanders of the machine description.
7119    RESULT is the register in which the result is returned.  It's NULL for
7120    "call" and "sibcall".
7121    MEM is the location of the function call.
7122    SIBCALL indicates whether this function call is normal call or sibling call.
7123    It will generate different pattern accordingly.  */
7124 
7125 void
7126 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
7127 {
7128   rtx call, callee, tmp;
7129   rtvec vec;
7130   machine_mode mode;
7131 
7132   gcc_assert (MEM_P (mem));
7133   callee = XEXP (mem, 0);
7134   mode = GET_MODE (callee);
7135   gcc_assert (mode == Pmode);
7136 
7137   /* Decide if we should generate indirect calls by loading the
7138      address of the callee into a register before performing
7139      the branch-and-link.  */
7140   if (SYMBOL_REF_P (callee)
7141       ? (aarch64_is_long_call_p (callee)
7142 	 || aarch64_is_noplt_call_p (callee))
7143       : !REG_P (callee))
7144     XEXP (mem, 0) = force_reg (mode, callee);
7145 
7146   call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
7147 
7148   if (result != NULL_RTX)
7149     call = gen_rtx_SET (result, call);
7150 
7151   if (sibcall)
7152     tmp = ret_rtx;
7153   else
7154     tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
7155 
7156   vec = gen_rtvec (2, call, tmp);
7157   call = gen_rtx_PARALLEL (VOIDmode, vec);
7158 
7159   aarch64_emit_call_insn (call);
7160 }
7161 
7162 /* Emit call insn with PAT and do aarch64-specific handling.  */
7163 
7164 void
7165 aarch64_emit_call_insn (rtx pat)
7166 {
7167   rtx insn = emit_call_insn (pat);
7168 
7169   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
7170   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
7171   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
7172 }
7173 
7174 machine_mode
7175 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
7176 {
7177   machine_mode mode_x = GET_MODE (x);
7178   rtx_code code_x = GET_CODE (x);
7179 
7180   /* All floating point compares return CCFP if it is an equality
7181      comparison, and CCFPE otherwise.  */
7182   if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
7183     {
7184       switch (code)
7185 	{
7186 	case EQ:
7187 	case NE:
7188 	case UNORDERED:
7189 	case ORDERED:
7190 	case UNLT:
7191 	case UNLE:
7192 	case UNGT:
7193 	case UNGE:
7194 	case UNEQ:
7195 	  return CCFPmode;
7196 
7197 	case LT:
7198 	case LE:
7199 	case GT:
7200 	case GE:
7201 	case LTGT:
7202 	  return CCFPEmode;
7203 
7204 	default:
7205 	  gcc_unreachable ();
7206 	}
7207     }
7208 
7209   /* Equality comparisons of short modes against zero can be performed
7210      using the TST instruction with the appropriate bitmask.  */
7211   if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
7212       && (code == EQ || code == NE)
7213       && (mode_x == HImode || mode_x == QImode))
7214     return CC_NZmode;
7215 
7216   /* Similarly, comparisons of zero_extends from shorter modes can
7217      be performed using an ANDS with an immediate mask.  */
7218   if (y == const0_rtx && code_x == ZERO_EXTEND
7219       && (mode_x == SImode || mode_x == DImode)
7220       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
7221       && (code == EQ || code == NE))
7222     return CC_NZmode;
7223 
7224   if ((mode_x == SImode || mode_x == DImode)
7225       && y == const0_rtx
7226       && (code == EQ || code == NE || code == LT || code == GE)
7227       && (code_x == PLUS || code_x == MINUS || code_x == AND
7228 	  || code_x == NEG
7229 	  || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
7230 	      && CONST_INT_P (XEXP (x, 2)))))
7231     return CC_NZmode;
7232 
7233   /* A compare with a shifted operand.  Because of canonicalization,
7234      the comparison will have to be swapped when we emit the assembly
7235      code.  */
7236   if ((mode_x == SImode || mode_x == DImode)
7237       && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
7238       && (code_x == ASHIFT || code_x == ASHIFTRT
7239 	  || code_x == LSHIFTRT
7240 	  || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
7241     return CC_SWPmode;
7242 
7243   /* Similarly for a negated operand, but we can only do this for
7244      equalities.  */
7245   if ((mode_x == SImode || mode_x == DImode)
7246       && (REG_P (y) || GET_CODE (y) == SUBREG)
7247       && (code == EQ || code == NE)
7248       && code_x == NEG)
7249     return CC_Zmode;
7250 
7251   /* A test for unsigned overflow from an addition.  */
7252   if ((mode_x == DImode || mode_x == TImode)
7253       && (code == LTU || code == GEU)
7254       && code_x == PLUS
7255       && rtx_equal_p (XEXP (x, 0), y))
7256     return CC_Cmode;
7257 
7258   /* A test for unsigned overflow from an add with carry.  */
7259   if ((mode_x == DImode || mode_x == TImode)
7260       && (code == LTU || code == GEU)
7261       && code_x == PLUS
7262       && CONST_SCALAR_INT_P (y)
7263       && (rtx_mode_t (y, mode_x)
7264 	  == (wi::shwi (1, mode_x)
7265 	      << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
7266     return CC_ADCmode;
7267 
7268   /* A test for signed overflow.  */
7269   if ((mode_x == DImode || mode_x == TImode)
7270       && code == NE
7271       && code_x == PLUS
7272       && GET_CODE (y) == SIGN_EXTEND)
7273     return CC_Vmode;
7274 
7275   /* For everything else, return CCmode.  */
7276   return CCmode;
7277 }
7278 
7279 static int
7280 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
7281 
7282 int
7283 aarch64_get_condition_code (rtx x)
7284 {
7285   machine_mode mode = GET_MODE (XEXP (x, 0));
7286   enum rtx_code comp_code = GET_CODE (x);
7287 
7288   if (GET_MODE_CLASS (mode) != MODE_CC)
7289     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
7290   return aarch64_get_condition_code_1 (mode, comp_code);
7291 }
7292 
7293 static int
7294 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
7295 {
7296   switch (mode)
7297     {
7298     case E_CCFPmode:
7299     case E_CCFPEmode:
7300       switch (comp_code)
7301 	{
7302 	case GE: return AARCH64_GE;
7303 	case GT: return AARCH64_GT;
7304 	case LE: return AARCH64_LS;
7305 	case LT: return AARCH64_MI;
7306 	case NE: return AARCH64_NE;
7307 	case EQ: return AARCH64_EQ;
7308 	case ORDERED: return AARCH64_VC;
7309 	case UNORDERED: return AARCH64_VS;
7310 	case UNLT: return AARCH64_LT;
7311 	case UNLE: return AARCH64_LE;
7312 	case UNGT: return AARCH64_HI;
7313 	case UNGE: return AARCH64_PL;
7314 	default: return -1;
7315 	}
7316       break;
7317 
7318     case E_CCmode:
7319       switch (comp_code)
7320 	{
7321 	case NE: return AARCH64_NE;
7322 	case EQ: return AARCH64_EQ;
7323 	case GE: return AARCH64_GE;
7324 	case GT: return AARCH64_GT;
7325 	case LE: return AARCH64_LE;
7326 	case LT: return AARCH64_LT;
7327 	case GEU: return AARCH64_CS;
7328 	case GTU: return AARCH64_HI;
7329 	case LEU: return AARCH64_LS;
7330 	case LTU: return AARCH64_CC;
7331 	default: return -1;
7332 	}
7333       break;
7334 
7335     case E_CC_SWPmode:
7336       switch (comp_code)
7337 	{
7338 	case NE: return AARCH64_NE;
7339 	case EQ: return AARCH64_EQ;
7340 	case GE: return AARCH64_LE;
7341 	case GT: return AARCH64_LT;
7342 	case LE: return AARCH64_GE;
7343 	case LT: return AARCH64_GT;
7344 	case GEU: return AARCH64_LS;
7345 	case GTU: return AARCH64_CC;
7346 	case LEU: return AARCH64_CS;
7347 	case LTU: return AARCH64_HI;
7348 	default: return -1;
7349 	}
7350       break;
7351 
7352     case E_CC_NZmode:
7353       switch (comp_code)
7354 	{
7355 	case NE: return AARCH64_NE;
7356 	case EQ: return AARCH64_EQ;
7357 	case GE: return AARCH64_PL;
7358 	case LT: return AARCH64_MI;
7359 	default: return -1;
7360 	}
7361       break;
7362 
7363     case E_CC_Zmode:
7364       switch (comp_code)
7365 	{
7366 	case NE: return AARCH64_NE;
7367 	case EQ: return AARCH64_EQ;
7368 	default: return -1;
7369 	}
7370       break;
7371 
7372     case E_CC_Cmode:
7373       switch (comp_code)
7374 	{
7375 	case LTU: return AARCH64_CS;
7376 	case GEU: return AARCH64_CC;
7377 	default: return -1;
7378 	}
7379       break;
7380 
7381     case E_CC_ADCmode:
7382       switch (comp_code)
7383 	{
7384 	case GEU: return AARCH64_CS;
7385 	case LTU: return AARCH64_CC;
7386 	default: return -1;
7387 	}
7388       break;
7389 
7390     case E_CC_Vmode:
7391       switch (comp_code)
7392 	{
7393 	case NE: return AARCH64_VS;
7394 	case EQ: return AARCH64_VC;
7395 	default: return -1;
7396 	}
7397       break;
7398 
7399     default:
7400       return -1;
7401     }
7402 
7403   return -1;
7404 }
7405 
7406 bool
7407 aarch64_const_vec_all_same_in_range_p (rtx x,
7408 				       HOST_WIDE_INT minval,
7409 				       HOST_WIDE_INT maxval)
7410 {
7411   rtx elt;
7412   return (const_vec_duplicate_p (x, &elt)
7413 	  && CONST_INT_P (elt)
7414 	  && IN_RANGE (INTVAL (elt), minval, maxval));
7415 }
7416 
7417 bool
7418 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
7419 {
7420   return aarch64_const_vec_all_same_in_range_p (x, val, val);
7421 }
7422 
7423 /* Return true if VEC is a constant in which every element is in the range
7424    [MINVAL, MAXVAL].  The elements do not need to have the same value.  */
7425 
7426 static bool
7427 aarch64_const_vec_all_in_range_p (rtx vec,
7428 				  HOST_WIDE_INT minval,
7429 				  HOST_WIDE_INT maxval)
7430 {
7431   if (GET_CODE (vec) != CONST_VECTOR
7432       || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
7433     return false;
7434 
7435   int nunits;
7436   if (!CONST_VECTOR_STEPPED_P (vec))
7437     nunits = const_vector_encoded_nelts (vec);
7438   else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
7439     return false;
7440 
7441   for (int i = 0; i < nunits; i++)
7442     {
7443       rtx vec_elem = CONST_VECTOR_ELT (vec, i);
7444       if (!CONST_INT_P (vec_elem)
7445 	  || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
7446 	return false;
7447     }
7448   return true;
7449 }
7450 
7451 /* N Z C V.  */
7452 #define AARCH64_CC_V 1
7453 #define AARCH64_CC_C (1 << 1)
7454 #define AARCH64_CC_Z (1 << 2)
7455 #define AARCH64_CC_N (1 << 3)
7456 
7457 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
7458 static const int aarch64_nzcv_codes[] =
7459 {
7460   0,		/* EQ, Z == 1.  */
7461   AARCH64_CC_Z,	/* NE, Z == 0.  */
7462   0,		/* CS, C == 1.  */
7463   AARCH64_CC_C,	/* CC, C == 0.  */
7464   0,		/* MI, N == 1.  */
7465   AARCH64_CC_N, /* PL, N == 0.  */
7466   0,		/* VS, V == 1.  */
7467   AARCH64_CC_V, /* VC, V == 0.  */
7468   0,		/* HI, C ==1 && Z == 0.  */
7469   AARCH64_CC_C,	/* LS, !(C == 1 && Z == 0).  */
7470   AARCH64_CC_V,	/* GE, N == V.  */
7471   0,		/* LT, N != V.  */
7472   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
7473   0,		/* LE, !(Z == 0 && N == V).  */
7474   0,		/* AL, Any.  */
7475   0		/* NV, Any.  */
7476 };
7477 
7478 /* Print floating-point vector immediate operand X to F, negating it
7479    first if NEGATE is true.  Return true on success, false if it isn't
7480    a constant we can handle.  */
7481 
7482 static bool
7483 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
7484 {
7485   rtx elt;
7486 
7487   if (!const_vec_duplicate_p (x, &elt))
7488     return false;
7489 
7490   REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
7491   if (negate)
7492     r = real_value_negate (&r);
7493 
7494   /* We only handle the SVE single-bit immediates here.  */
7495   if (real_equal (&r, &dconst0))
7496     asm_fprintf (f, "0.0");
7497   else if (real_equal (&r, &dconst1))
7498     asm_fprintf (f, "1.0");
7499   else if (real_equal (&r, &dconsthalf))
7500     asm_fprintf (f, "0.5");
7501   else
7502     return false;
7503 
7504   return true;
7505 }
7506 
7507 /* Return the equivalent letter for size.  */
7508 static char
7509 sizetochar (int size)
7510 {
7511   switch (size)
7512     {
7513     case 64: return 'd';
7514     case 32: return 's';
7515     case 16: return 'h';
7516     case 8 : return 'b';
7517     default: gcc_unreachable ();
7518     }
7519 }
7520 
7521 /* Print operand X to file F in a target specific manner according to CODE.
7522    The acceptable formatting commands given by CODE are:
7523      'c':		An integer or symbol address without a preceding #
7524 			sign.
7525      'C':		Take the duplicated element in a vector constant
7526 			and print it in hex.
7527      'D':		Take the duplicated element in a vector constant
7528 			and print it as an unsigned integer, in decimal.
7529      'e':		Print the sign/zero-extend size as a character 8->b,
7530 			16->h, 32->w.
7531      'p':		Prints N such that 2^N == X (X must be power of 2 and
7532 			const int).
7533      'P':		Print the number of non-zero bits in X (a const_int).
7534      'H':		Print the higher numbered register of a pair (TImode)
7535 			of regs.
7536      'm':		Print a condition (eq, ne, etc).
7537      'M':		Same as 'm', but invert condition.
7538      'N':		Take the duplicated element in a vector constant
7539 			and print the negative of it in decimal.
7540      'b/h/s/d/q':	Print a scalar FP/SIMD register name.
7541      'S/T/U/V':		Print a FP/SIMD register name for a register list.
7542 			The register printed is the FP/SIMD register name
7543 			of X + 0/1/2/3 for S/T/U/V.
7544      'R':		Print a scalar FP/SIMD register name + 1.
7545      'X':		Print bottom 16 bits of integer constant in hex.
7546      'w/x':		Print a general register name or the zero register
7547 			(32-bit or 64-bit).
7548      '0':		Print a normal operand, if it's a general register,
7549 			then we assume DImode.
7550      'k':		Print NZCV for conditional compare instructions.
7551      'A':		Output address constant representing the first
7552 			argument of X, specifying a relocation offset
7553 			if appropriate.
7554      'L':		Output constant address specified by X
7555 			with a relocation offset if appropriate.
7556      'G':		Prints address of X, specifying a PC relative
7557 			relocation mode if appropriate.
7558      'y':		Output address of LDP or STP - this is used for
7559 			some LDP/STPs which don't use a PARALLEL in their
7560 			pattern (so the mode needs to be adjusted).
7561      'z':		Output address of a typical LDP or STP.  */
7562 
7563 static void
7564 aarch64_print_operand (FILE *f, rtx x, int code)
7565 {
7566   rtx elt;
7567   switch (code)
7568     {
7569     case 'c':
7570       switch (GET_CODE (x))
7571 	{
7572 	case CONST_INT:
7573 	  fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
7574 	  break;
7575 
7576 	case SYMBOL_REF:
7577 	  output_addr_const (f, x);
7578 	  break;
7579 
7580 	case CONST:
7581 	  if (GET_CODE (XEXP (x, 0)) == PLUS
7582 	      && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
7583 	    {
7584 	      output_addr_const (f, x);
7585 	      break;
7586 	    }
7587 	  /* Fall through.  */
7588 
7589 	default:
7590 	  output_operand_lossage ("unsupported operand for code '%c'", code);
7591 	}
7592       break;
7593 
7594     case 'e':
7595       {
7596 	int n;
7597 
7598 	if (!CONST_INT_P (x)
7599 	    || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
7600 	  {
7601 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7602 	    return;
7603 	  }
7604 
7605 	switch (n)
7606 	  {
7607 	  case 3:
7608 	    fputc ('b', f);
7609 	    break;
7610 	  case 4:
7611 	    fputc ('h', f);
7612 	    break;
7613 	  case 5:
7614 	    fputc ('w', f);
7615 	    break;
7616 	  default:
7617 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7618 	    return;
7619 	  }
7620       }
7621       break;
7622 
7623     case 'p':
7624       {
7625 	int n;
7626 
7627 	if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
7628 	  {
7629 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7630 	    return;
7631 	  }
7632 
7633 	asm_fprintf (f, "%d", n);
7634       }
7635       break;
7636 
7637     case 'P':
7638       if (!CONST_INT_P (x))
7639 	{
7640 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7641 	  return;
7642 	}
7643 
7644       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
7645       break;
7646 
7647     case 'H':
7648       if (x == const0_rtx)
7649 	{
7650 	  asm_fprintf (f, "xzr");
7651 	  break;
7652 	}
7653 
7654       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
7655 	{
7656 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7657 	  return;
7658 	}
7659 
7660       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
7661       break;
7662 
7663     case 'M':
7664     case 'm':
7665       {
7666         int cond_code;
7667 	/* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
7668 	if (x == const_true_rtx)
7669 	  {
7670 	    if (code == 'M')
7671 	      fputs ("nv", f);
7672 	    return;
7673 	  }
7674 
7675         if (!COMPARISON_P (x))
7676 	  {
7677 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7678 	    return;
7679 	  }
7680 
7681         cond_code = aarch64_get_condition_code (x);
7682         gcc_assert (cond_code >= 0);
7683 	if (code == 'M')
7684 	  cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
7685 	fputs (aarch64_condition_codes[cond_code], f);
7686       }
7687       break;
7688 
7689     case 'N':
7690       if (!const_vec_duplicate_p (x, &elt))
7691 	{
7692 	  output_operand_lossage ("invalid vector constant");
7693 	  return;
7694 	}
7695 
7696       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7697 	asm_fprintf (f, "%wd", -INTVAL (elt));
7698       else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7699 	       && aarch64_print_vector_float_operand (f, x, true))
7700 	;
7701       else
7702 	{
7703 	  output_operand_lossage ("invalid vector constant");
7704 	  return;
7705 	}
7706       break;
7707 
7708     case 'b':
7709     case 'h':
7710     case 's':
7711     case 'd':
7712     case 'q':
7713       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7714 	{
7715 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7716 	  return;
7717 	}
7718       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
7719       break;
7720 
7721     case 'S':
7722     case 'T':
7723     case 'U':
7724     case 'V':
7725       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7726 	{
7727 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7728 	  return;
7729 	}
7730       asm_fprintf (f, "%c%d",
7731 		   aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
7732 		   REGNO (x) - V0_REGNUM + (code - 'S'));
7733       break;
7734 
7735     case 'R':
7736       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
7737 	{
7738 	  output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
7739 	  return;
7740 	}
7741       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
7742       break;
7743 
7744     case 'X':
7745       if (!CONST_INT_P (x))
7746 	{
7747 	  output_operand_lossage ("invalid operand for '%%%c'", code);
7748 	  return;
7749 	}
7750       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
7751       break;
7752 
7753     case 'C':
7754       {
7755 	/* Print a replicated constant in hex.  */
7756 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7757 	  {
7758 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7759 	    return;
7760 	  }
7761 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7762 	asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7763       }
7764       break;
7765 
7766     case 'D':
7767       {
7768 	/* Print a replicated constant in decimal, treating it as
7769 	   unsigned.  */
7770 	if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
7771 	  {
7772 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7773 	    return;
7774 	  }
7775 	scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
7776 	asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
7777       }
7778       break;
7779 
7780     case 'w':
7781     case 'x':
7782       if (x == const0_rtx
7783 	  || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
7784 	{
7785 	  asm_fprintf (f, "%czr", code);
7786 	  break;
7787 	}
7788 
7789       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
7790 	{
7791 	  asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
7792 	  break;
7793 	}
7794 
7795       if (REG_P (x) && REGNO (x) == SP_REGNUM)
7796 	{
7797 	  asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
7798 	  break;
7799 	}
7800 
7801       /* Fall through */
7802 
7803     case 0:
7804       if (x == NULL)
7805 	{
7806 	  output_operand_lossage ("missing operand");
7807 	  return;
7808 	}
7809 
7810       switch (GET_CODE (x))
7811 	{
7812 	case REG:
7813 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
7814 	    {
7815 	      if (REG_NREGS (x) == 1)
7816 		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
7817 	      else
7818 		{
7819 		  char suffix
7820 		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
7821 		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
7822 			       REGNO (x) - V0_REGNUM, suffix,
7823 			       END_REGNO (x) - V0_REGNUM - 1, suffix);
7824 		}
7825 	    }
7826 	  else
7827 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
7828 	  break;
7829 
7830 	case MEM:
7831 	  output_address (GET_MODE (x), XEXP (x, 0));
7832 	  break;
7833 
7834 	case LABEL_REF:
7835 	case SYMBOL_REF:
7836 	  output_addr_const (asm_out_file, x);
7837 	  break;
7838 
7839 	case CONST_INT:
7840 	  asm_fprintf (f, "%wd", INTVAL (x));
7841 	  break;
7842 
7843 	case CONST:
7844 	  if (!VECTOR_MODE_P (GET_MODE (x)))
7845 	    {
7846 	      output_addr_const (asm_out_file, x);
7847 	      break;
7848 	    }
7849 	  /* fall through */
7850 
7851 	case CONST_VECTOR:
7852 	  if (!const_vec_duplicate_p (x, &elt))
7853 	    {
7854 	      output_operand_lossage ("invalid vector constant");
7855 	      return;
7856 	    }
7857 
7858 	  if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
7859 	    asm_fprintf (f, "%wd", INTVAL (elt));
7860 	  else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
7861 		   && aarch64_print_vector_float_operand (f, x, false))
7862 	    ;
7863 	  else
7864 	    {
7865 	      output_operand_lossage ("invalid vector constant");
7866 	      return;
7867 	    }
7868 	  break;
7869 
7870 	case CONST_DOUBLE:
7871 	  /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
7872 	     be getting CONST_DOUBLEs holding integers.  */
7873 	  gcc_assert (GET_MODE (x) != VOIDmode);
7874 	  if (aarch64_float_const_zero_rtx_p (x))
7875 	    {
7876 	      fputc ('0', f);
7877 	      break;
7878 	    }
7879 	  else if (aarch64_float_const_representable_p (x))
7880 	    {
7881 #define buf_size 20
7882 	      char float_buf[buf_size] = {'\0'};
7883 	      real_to_decimal_for_mode (float_buf,
7884 					CONST_DOUBLE_REAL_VALUE (x),
7885 					buf_size, buf_size,
7886 					1, GET_MODE (x));
7887 	      asm_fprintf (asm_out_file, "%s", float_buf);
7888 	      break;
7889 #undef buf_size
7890 	    }
7891 	  output_operand_lossage ("invalid constant");
7892 	  return;
7893 	default:
7894 	  output_operand_lossage ("invalid operand");
7895 	  return;
7896 	}
7897       break;
7898 
7899     case 'A':
7900       if (GET_CODE (x) == HIGH)
7901 	x = XEXP (x, 0);
7902 
7903       switch (aarch64_classify_symbolic_expression (x))
7904 	{
7905 	case SYMBOL_SMALL_GOT_4G:
7906 	  asm_fprintf (asm_out_file, ":got:");
7907 	  break;
7908 
7909 	case SYMBOL_SMALL_TLSGD:
7910 	  asm_fprintf (asm_out_file, ":tlsgd:");
7911 	  break;
7912 
7913 	case SYMBOL_SMALL_TLSDESC:
7914 	  asm_fprintf (asm_out_file, ":tlsdesc:");
7915 	  break;
7916 
7917 	case SYMBOL_SMALL_TLSIE:
7918 	  asm_fprintf (asm_out_file, ":gottprel:");
7919 	  break;
7920 
7921 	case SYMBOL_TLSLE24:
7922 	  asm_fprintf (asm_out_file, ":tprel:");
7923 	  break;
7924 
7925 	case SYMBOL_TINY_GOT:
7926 	  gcc_unreachable ();
7927 	  break;
7928 
7929 	default:
7930 	  break;
7931 	}
7932       output_addr_const (asm_out_file, x);
7933       break;
7934 
7935     case 'L':
7936       switch (aarch64_classify_symbolic_expression (x))
7937 	{
7938 	case SYMBOL_SMALL_GOT_4G:
7939 	  asm_fprintf (asm_out_file, ":lo12:");
7940 	  break;
7941 
7942 	case SYMBOL_SMALL_TLSGD:
7943 	  asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7944 	  break;
7945 
7946 	case SYMBOL_SMALL_TLSDESC:
7947 	  asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7948 	  break;
7949 
7950 	case SYMBOL_SMALL_TLSIE:
7951 	  asm_fprintf (asm_out_file, ":gottprel_lo12:");
7952 	  break;
7953 
7954 	case SYMBOL_TLSLE12:
7955 	  asm_fprintf (asm_out_file, ":tprel_lo12:");
7956 	  break;
7957 
7958 	case SYMBOL_TLSLE24:
7959 	  asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7960 	  break;
7961 
7962 	case SYMBOL_TINY_GOT:
7963 	  asm_fprintf (asm_out_file, ":got:");
7964 	  break;
7965 
7966 	case SYMBOL_TINY_TLSIE:
7967 	  asm_fprintf (asm_out_file, ":gottprel:");
7968 	  break;
7969 
7970 	default:
7971 	  break;
7972 	}
7973       output_addr_const (asm_out_file, x);
7974       break;
7975 
7976     case 'G':
7977       switch (aarch64_classify_symbolic_expression (x))
7978 	{
7979 	case SYMBOL_TLSLE24:
7980 	  asm_fprintf (asm_out_file, ":tprel_hi12:");
7981 	  break;
7982 	default:
7983 	  break;
7984 	}
7985       output_addr_const (asm_out_file, x);
7986       break;
7987 
7988     case 'k':
7989       {
7990 	HOST_WIDE_INT cond_code;
7991 
7992 	if (!CONST_INT_P (x))
7993 	  {
7994 	    output_operand_lossage ("invalid operand for '%%%c'", code);
7995 	    return;
7996 	  }
7997 
7998 	cond_code = INTVAL (x);
7999 	gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
8000 	asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
8001       }
8002       break;
8003 
8004     case 'y':
8005     case 'z':
8006       {
8007 	machine_mode mode = GET_MODE (x);
8008 
8009 	if (GET_CODE (x) != MEM
8010 	    || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
8011 	  {
8012 	    output_operand_lossage ("invalid operand for '%%%c'", code);
8013 	    return;
8014 	  }
8015 
8016 	if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
8017 					    code == 'y'
8018 					    ? ADDR_QUERY_LDP_STP_N
8019 					    : ADDR_QUERY_LDP_STP))
8020 	  output_operand_lossage ("invalid operand prefix '%%%c'", code);
8021       }
8022       break;
8023 
8024     default:
8025       output_operand_lossage ("invalid operand prefix '%%%c'", code);
8026       return;
8027     }
8028 }
8029 
8030 /* Print address 'x' of a memory access with mode 'mode'.
8031    'op' is the context required by aarch64_classify_address.  It can either be
8032    MEM for a normal memory access or PARALLEL for LDP/STP.  */
8033 static bool
8034 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
8035 				aarch64_addr_query_type type)
8036 {
8037   struct aarch64_address_info addr;
8038   unsigned int size;
8039 
8040   /* Check all addresses are Pmode - including ILP32.  */
8041   if (GET_MODE (x) != Pmode
8042       && (!CONST_INT_P (x)
8043 	  || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
8044     {
8045       output_operand_lossage ("invalid address mode");
8046       return false;
8047     }
8048 
8049   if (aarch64_classify_address (&addr, x, mode, true, type))
8050     switch (addr.type)
8051       {
8052       case ADDRESS_REG_IMM:
8053 	if (known_eq (addr.const_offset, 0))
8054 	  asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
8055 	else if (aarch64_sve_data_mode_p (mode))
8056 	  {
8057 	    HOST_WIDE_INT vnum
8058 	      = exact_div (addr.const_offset,
8059 			   BYTES_PER_SVE_VECTOR).to_constant ();
8060 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
8061 			 reg_names[REGNO (addr.base)], vnum);
8062 	  }
8063 	else if (aarch64_sve_pred_mode_p (mode))
8064 	  {
8065 	    HOST_WIDE_INT vnum
8066 	      = exact_div (addr.const_offset,
8067 			   BYTES_PER_SVE_PRED).to_constant ();
8068 	    asm_fprintf (f, "[%s, #%wd, mul vl]",
8069 			 reg_names[REGNO (addr.base)], vnum);
8070 	  }
8071 	else
8072 	  asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
8073 		       INTVAL (addr.offset));
8074 	return true;
8075 
8076       case ADDRESS_REG_REG:
8077 	if (addr.shift == 0)
8078 	  asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
8079 		       reg_names [REGNO (addr.offset)]);
8080 	else
8081 	  asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
8082 		       reg_names [REGNO (addr.offset)], addr.shift);
8083 	return true;
8084 
8085       case ADDRESS_REG_UXTW:
8086 	if (addr.shift == 0)
8087 	  asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
8088 		       REGNO (addr.offset) - R0_REGNUM);
8089 	else
8090 	  asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
8091 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
8092 	return true;
8093 
8094       case ADDRESS_REG_SXTW:
8095 	if (addr.shift == 0)
8096 	  asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
8097 		       REGNO (addr.offset) - R0_REGNUM);
8098 	else
8099 	  asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
8100 		       REGNO (addr.offset) - R0_REGNUM, addr.shift);
8101 	return true;
8102 
8103       case ADDRESS_REG_WB:
8104 	/* Writeback is only supported for fixed-width modes.  */
8105 	size = GET_MODE_SIZE (mode).to_constant ();
8106 	switch (GET_CODE (x))
8107 	  {
8108 	  case PRE_INC:
8109 	    asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
8110 	    return true;
8111 	  case POST_INC:
8112 	    asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
8113 	    return true;
8114 	  case PRE_DEC:
8115 	    asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
8116 	    return true;
8117 	  case POST_DEC:
8118 	    asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
8119 	    return true;
8120 	  case PRE_MODIFY:
8121 	    asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
8122 			 INTVAL (addr.offset));
8123 	    return true;
8124 	  case POST_MODIFY:
8125 	    asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
8126 			 INTVAL (addr.offset));
8127 	    return true;
8128 	  default:
8129 	    break;
8130 	  }
8131 	break;
8132 
8133       case ADDRESS_LO_SUM:
8134 	asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
8135 	output_addr_const (f, addr.offset);
8136 	asm_fprintf (f, "]");
8137 	return true;
8138 
8139       case ADDRESS_SYMBOLIC:
8140 	output_addr_const (f, x);
8141 	return true;
8142       }
8143 
8144   return false;
8145 }
8146 
8147 /* Print address 'x' of a memory access with mode 'mode'.  */
8148 static void
8149 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
8150 {
8151   if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
8152     output_addr_const (f, x);
8153 }
8154 
8155 bool
8156 aarch64_label_mentioned_p (rtx x)
8157 {
8158   const char *fmt;
8159   int i;
8160 
8161   if (GET_CODE (x) == LABEL_REF)
8162     return true;
8163 
8164   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
8165      referencing instruction, but they are constant offsets, not
8166      symbols.  */
8167   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8168     return false;
8169 
8170   fmt = GET_RTX_FORMAT (GET_CODE (x));
8171   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
8172     {
8173       if (fmt[i] == 'E')
8174 	{
8175 	  int j;
8176 
8177 	  for (j = XVECLEN (x, i) - 1; j >= 0; j--)
8178 	    if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
8179 	      return 1;
8180 	}
8181       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
8182 	return 1;
8183     }
8184 
8185   return 0;
8186 }
8187 
8188 /* Implement REGNO_REG_CLASS.  */
8189 
8190 enum reg_class
8191 aarch64_regno_regclass (unsigned regno)
8192 {
8193   if (GP_REGNUM_P (regno))
8194     return GENERAL_REGS;
8195 
8196   if (regno == SP_REGNUM)
8197     return STACK_REG;
8198 
8199   if (regno == FRAME_POINTER_REGNUM
8200       || regno == ARG_POINTER_REGNUM)
8201     return POINTER_REGS;
8202 
8203   if (FP_REGNUM_P (regno))
8204     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
8205 
8206   if (PR_REGNUM_P (regno))
8207     return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
8208 
8209   return NO_REGS;
8210 }
8211 
8212 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
8213    If OFFSET is out of range, return an offset of an anchor point
8214    that is in range.  Return 0 otherwise.  */
8215 
8216 static HOST_WIDE_INT
8217 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
8218 		       machine_mode mode)
8219 {
8220   /* Does it look like we'll need a 16-byte load/store-pair operation?  */
8221   if (size > 16)
8222     return (offset + 0x400) & ~0x7f0;
8223 
8224   /* For offsets that aren't a multiple of the access size, the limit is
8225      -256...255.  */
8226   if (offset & (size - 1))
8227     {
8228       /* BLKmode typically uses LDP of X-registers.  */
8229       if (mode == BLKmode)
8230 	return (offset + 512) & ~0x3ff;
8231       return (offset + 0x100) & ~0x1ff;
8232     }
8233 
8234   /* Small negative offsets are supported.  */
8235   if (IN_RANGE (offset, -256, 0))
8236     return 0;
8237 
8238   if (mode == TImode || mode == TFmode)
8239     return (offset + 0x100) & ~0x1ff;
8240 
8241   /* Use 12-bit offset by access size.  */
8242   return offset & (~0xfff * size);
8243 }
8244 
8245 static rtx
8246 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
8247 {
8248   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
8249      where mask is selected by alignment and size of the offset.
8250      We try to pick as large a range for the offset as possible to
8251      maximize the chance of a CSE.  However, for aligned addresses
8252      we limit the range to 4k so that structures with different sized
8253      elements are likely to use the same base.  We need to be careful
8254      not to split a CONST for some forms of address expression, otherwise
8255      it will generate sub-optimal code.  */
8256 
8257   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
8258     {
8259       rtx base = XEXP (x, 0);
8260       rtx offset_rtx = XEXP (x, 1);
8261       HOST_WIDE_INT offset = INTVAL (offset_rtx);
8262 
8263       if (GET_CODE (base) == PLUS)
8264 	{
8265 	  rtx op0 = XEXP (base, 0);
8266 	  rtx op1 = XEXP (base, 1);
8267 
8268 	  /* Force any scaling into a temp for CSE.  */
8269 	  op0 = force_reg (Pmode, op0);
8270 	  op1 = force_reg (Pmode, op1);
8271 
8272 	  /* Let the pointer register be in op0.  */
8273 	  if (REG_POINTER (op1))
8274 	    std::swap (op0, op1);
8275 
8276 	  /* If the pointer is virtual or frame related, then we know that
8277 	     virtual register instantiation or register elimination is going
8278 	     to apply a second constant.  We want the two constants folded
8279 	     together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
8280 	  if (virt_or_elim_regno_p (REGNO (op0)))
8281 	    {
8282 	      base = expand_binop (Pmode, add_optab, op0, offset_rtx,
8283 				   NULL_RTX, true, OPTAB_DIRECT);
8284 	      return gen_rtx_PLUS (Pmode, base, op1);
8285 	    }
8286 
8287 	  /* Otherwise, in order to encourage CSE (and thence loop strength
8288 	     reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
8289 	  base = expand_binop (Pmode, add_optab, op0, op1,
8290 			       NULL_RTX, true, OPTAB_DIRECT);
8291 	  x = gen_rtx_PLUS (Pmode, base, offset_rtx);
8292 	}
8293 
8294       HOST_WIDE_INT size;
8295       if (GET_MODE_SIZE (mode).is_constant (&size))
8296 	{
8297 	  HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
8298 							     mode);
8299 	  if (base_offset != 0)
8300 	    {
8301 	      base = plus_constant (Pmode, base, base_offset);
8302 	      base = force_operand (base, NULL_RTX);
8303 	      return plus_constant (Pmode, base, offset - base_offset);
8304 	    }
8305 	}
8306     }
8307 
8308   return x;
8309 }
8310 
8311 static reg_class_t
8312 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
8313 			  reg_class_t rclass,
8314 			  machine_mode mode,
8315 			  secondary_reload_info *sri)
8316 {
8317   /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
8318      directly by the *aarch64_sve_mov<mode>_be move pattern.  See the
8319      comment at the head of aarch64-sve.md for more details about the
8320      big-endian handling.  */
8321   if (BYTES_BIG_ENDIAN
8322       && reg_class_subset_p (rclass, FP_REGS)
8323       && !((REG_P (x) && HARD_REGISTER_P (x))
8324 	   || aarch64_simd_valid_immediate (x, NULL))
8325       && aarch64_sve_data_mode_p (mode))
8326     {
8327       sri->icode = CODE_FOR_aarch64_sve_reload_be;
8328       return NO_REGS;
8329     }
8330 
8331   /* If we have to disable direct literal pool loads and stores because the
8332      function is too big, then we need a scratch register.  */
8333   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
8334       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
8335 	  || targetm.vector_mode_supported_p (GET_MODE (x)))
8336       && !aarch64_pcrelative_literal_loads)
8337     {
8338       sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
8339       return NO_REGS;
8340     }
8341 
8342   /* Without the TARGET_SIMD instructions we cannot move a Q register
8343      to a Q register directly.  We need a scratch.  */
8344   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
8345       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
8346       && reg_class_subset_p (rclass, FP_REGS))
8347     {
8348       sri->icode = code_for_aarch64_reload_mov (mode);
8349       return NO_REGS;
8350     }
8351 
8352   /* A TFmode or TImode memory access should be handled via an FP_REGS
8353      because AArch64 has richer addressing modes for LDR/STR instructions
8354      than LDP/STP instructions.  */
8355   if (TARGET_FLOAT && rclass == GENERAL_REGS
8356       && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
8357     return FP_REGS;
8358 
8359   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
8360       return GENERAL_REGS;
8361 
8362   return NO_REGS;
8363 }
8364 
8365 static bool
8366 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
8367 {
8368   gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
8369 
8370   /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
8371      can only eliminate to HARD_FRAME_POINTER_REGNUM.  */
8372   if (frame_pointer_needed)
8373     return to == HARD_FRAME_POINTER_REGNUM;
8374   return true;
8375 }
8376 
8377 poly_int64
8378 aarch64_initial_elimination_offset (unsigned from, unsigned to)
8379 {
8380   if (to == HARD_FRAME_POINTER_REGNUM)
8381     {
8382       if (from == ARG_POINTER_REGNUM)
8383 	return cfun->machine->frame.hard_fp_offset;
8384 
8385       if (from == FRAME_POINTER_REGNUM)
8386 	return cfun->machine->frame.hard_fp_offset
8387 	       - cfun->machine->frame.locals_offset;
8388     }
8389 
8390   if (to == STACK_POINTER_REGNUM)
8391     {
8392       if (from == FRAME_POINTER_REGNUM)
8393 	  return cfun->machine->frame.frame_size
8394 		 - cfun->machine->frame.locals_offset;
8395     }
8396 
8397   return cfun->machine->frame.frame_size;
8398 }
8399 
8400 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
8401    previous frame.  */
8402 
8403 rtx
8404 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
8405 {
8406   if (count != 0)
8407     return const0_rtx;
8408   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
8409 }
8410 
8411 
8412 static void
8413 aarch64_asm_trampoline_template (FILE *f)
8414 {
8415   int offset1 = 16;
8416   int offset2 = 20;
8417 
8418   if (aarch64_bti_enabled ())
8419     {
8420       asm_fprintf (f, "\thint\t34 // bti c\n");
8421       offset1 -= 4;
8422       offset2 -= 4;
8423     }
8424 
8425   if (TARGET_ILP32)
8426     {
8427       asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1);
8428       asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM,
8429 		   offset1);
8430     }
8431   else
8432     {
8433       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1);
8434       asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM],
8435 		   offset2);
8436     }
8437   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
8438 
8439   /* The trampoline needs an extra padding instruction.  In case if BTI is
8440      enabled the padding instruction is replaced by the BTI instruction at
8441      the beginning.  */
8442   if (!aarch64_bti_enabled ())
8443     assemble_aligned_integer (4, const0_rtx);
8444 
8445   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8446   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
8447 }
8448 
8449 static void
8450 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
8451 {
8452   rtx fnaddr, mem, a_tramp;
8453   const int tramp_code_sz = 16;
8454 
8455   /* Don't need to copy the trailing D-words, we fill those in below.  */
8456   emit_block_move (m_tramp, assemble_trampoline_template (),
8457 		   GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
8458   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
8459   fnaddr = XEXP (DECL_RTL (fndecl), 0);
8460   if (GET_MODE (fnaddr) != ptr_mode)
8461     fnaddr = convert_memory_address (ptr_mode, fnaddr);
8462   emit_move_insn (mem, fnaddr);
8463 
8464   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
8465   emit_move_insn (mem, chain_value);
8466 
8467   /* XXX We should really define a "clear_cache" pattern and use
8468      gen_clear_cache().  */
8469   a_tramp = XEXP (m_tramp, 0);
8470   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
8471 		     LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
8472 		     plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
8473 		     ptr_mode);
8474 }
8475 
8476 static unsigned char
8477 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
8478 {
8479   /* ??? Logically we should only need to provide a value when
8480      HARD_REGNO_MODE_OK says that at least one register in REGCLASS
8481      can hold MODE, but at the moment we need to handle all modes.
8482      Just ignore any runtime parts for registers that can't store them.  */
8483   HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
8484   unsigned int nregs;
8485   switch (regclass)
8486     {
8487     case TAILCALL_ADDR_REGS:
8488     case POINTER_REGS:
8489     case GENERAL_REGS:
8490     case ALL_REGS:
8491     case POINTER_AND_FP_REGS:
8492     case FP_REGS:
8493     case FP_LO_REGS:
8494       if (aarch64_sve_data_mode_p (mode)
8495 	  && constant_multiple_p (GET_MODE_SIZE (mode),
8496 				  BYTES_PER_SVE_VECTOR, &nregs))
8497 	return nregs;
8498       return (aarch64_vector_data_mode_p (mode)
8499 	      ? CEIL (lowest_size, UNITS_PER_VREG)
8500 	      : CEIL (lowest_size, UNITS_PER_WORD));
8501     case STACK_REG:
8502     case PR_REGS:
8503     case PR_LO_REGS:
8504     case PR_HI_REGS:
8505       return 1;
8506 
8507     case NO_REGS:
8508       return 0;
8509 
8510     default:
8511       break;
8512     }
8513   gcc_unreachable ();
8514 }
8515 
8516 static reg_class_t
8517 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
8518 {
8519   if (regclass == POINTER_REGS)
8520     return GENERAL_REGS;
8521 
8522   if (regclass == STACK_REG)
8523     {
8524       if (REG_P(x)
8525 	  && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
8526 	  return regclass;
8527 
8528       return NO_REGS;
8529     }
8530 
8531   /* Register eliminiation can result in a request for
8532      SP+constant->FP_REGS.  We cannot support such operations which
8533      use SP as source and an FP_REG as destination, so reject out
8534      right now.  */
8535   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
8536     {
8537       rtx lhs = XEXP (x, 0);
8538 
8539       /* Look through a possible SUBREG introduced by ILP32.  */
8540       if (GET_CODE (lhs) == SUBREG)
8541 	lhs = SUBREG_REG (lhs);
8542 
8543       gcc_assert (REG_P (lhs));
8544       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
8545 				      POINTER_REGS));
8546       return NO_REGS;
8547     }
8548 
8549   return regclass;
8550 }
8551 
8552 void
8553 aarch64_asm_output_labelref (FILE* f, const char *name)
8554 {
8555   asm_fprintf (f, "%U%s", name);
8556 }
8557 
8558 static void
8559 aarch64_elf_asm_constructor (rtx symbol, int priority)
8560 {
8561   if (priority == DEFAULT_INIT_PRIORITY)
8562     default_ctor_section_asm_out_constructor (symbol, priority);
8563   else
8564     {
8565       section *s;
8566       /* While priority is known to be in range [0, 65535], so 18 bytes
8567          would be enough, the compiler might not know that.  To avoid
8568          -Wformat-truncation false positive, use a larger size.  */
8569       char buf[23];
8570       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
8571       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8572       switch_to_section (s);
8573       assemble_align (POINTER_SIZE);
8574       assemble_aligned_integer (POINTER_BYTES, symbol);
8575     }
8576 }
8577 
8578 static void
8579 aarch64_elf_asm_destructor (rtx symbol, int priority)
8580 {
8581   if (priority == DEFAULT_INIT_PRIORITY)
8582     default_dtor_section_asm_out_destructor (symbol, priority);
8583   else
8584     {
8585       section *s;
8586       /* While priority is known to be in range [0, 65535], so 18 bytes
8587          would be enough, the compiler might not know that.  To avoid
8588          -Wformat-truncation false positive, use a larger size.  */
8589       char buf[23];
8590       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
8591       s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
8592       switch_to_section (s);
8593       assemble_align (POINTER_SIZE);
8594       assemble_aligned_integer (POINTER_BYTES, symbol);
8595     }
8596 }
8597 
8598 const char*
8599 aarch64_output_casesi (rtx *operands)
8600 {
8601   char buf[100];
8602   char label[100];
8603   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
8604   int index;
8605   static const char *const patterns[4][2] =
8606   {
8607     {
8608       "ldrb\t%w3, [%0,%w1,uxtw]",
8609       "add\t%3, %4, %w3, sxtb #2"
8610     },
8611     {
8612       "ldrh\t%w3, [%0,%w1,uxtw #1]",
8613       "add\t%3, %4, %w3, sxth #2"
8614     },
8615     {
8616       "ldr\t%w3, [%0,%w1,uxtw #2]",
8617       "add\t%3, %4, %w3, sxtw #2"
8618     },
8619     /* We assume that DImode is only generated when not optimizing and
8620        that we don't really need 64-bit address offsets.  That would
8621        imply an object file with 8GB of code in a single function!  */
8622     {
8623       "ldr\t%w3, [%0,%w1,uxtw #2]",
8624       "add\t%3, %4, %w3, sxtw #2"
8625     }
8626   };
8627 
8628   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
8629 
8630   scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
8631   index = exact_log2 (GET_MODE_SIZE (mode));
8632 
8633   gcc_assert (index >= 0 && index <= 3);
8634 
8635   /* Need to implement table size reduction, by chaning the code below.  */
8636   output_asm_insn (patterns[index][0], operands);
8637   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
8638   snprintf (buf, sizeof (buf),
8639 	    "adr\t%%4, %s", targetm.strip_name_encoding (label));
8640   output_asm_insn (buf, operands);
8641   output_asm_insn (patterns[index][1], operands);
8642   output_asm_insn ("br\t%3", operands);
8643   assemble_label (asm_out_file, label);
8644   return "";
8645 }
8646 
8647 
8648 /* Return size in bits of an arithmetic operand which is shifted/scaled and
8649    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
8650    operator.  */
8651 
8652 int
8653 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
8654 {
8655   if (shift >= 0 && shift <= 3)
8656     {
8657       int size;
8658       for (size = 8; size <= 32; size *= 2)
8659 	{
8660 	  HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
8661 	  if (mask == bits << shift)
8662 	    return size;
8663 	}
8664     }
8665   return 0;
8666 }
8667 
8668 /* Constant pools are per function only when PC relative
8669    literal loads are true or we are in the large memory
8670    model.  */
8671 
8672 static inline bool
8673 aarch64_can_use_per_function_literal_pools_p (void)
8674 {
8675   return (aarch64_pcrelative_literal_loads
8676 	  || aarch64_cmodel == AARCH64_CMODEL_LARGE);
8677 }
8678 
8679 static bool
8680 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
8681 {
8682   /* We can't use blocks for constants when we're using a per-function
8683      constant pool.  */
8684   return !aarch64_can_use_per_function_literal_pools_p ();
8685 }
8686 
8687 /* Select appropriate section for constants depending
8688    on where we place literal pools.  */
8689 
8690 static section *
8691 aarch64_select_rtx_section (machine_mode mode,
8692 			    rtx x,
8693 			    unsigned HOST_WIDE_INT align)
8694 {
8695   if (aarch64_can_use_per_function_literal_pools_p ())
8696     return function_section (current_function_decl);
8697 
8698   return default_elf_select_rtx_section (mode, x, align);
8699 }
8700 
8701 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
8702 void
8703 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
8704 				  HOST_WIDE_INT offset)
8705 {
8706   /* When using per-function literal pools, we must ensure that any code
8707      section is aligned to the minimal instruction length, lest we get
8708      errors from the assembler re "unaligned instructions".  */
8709   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
8710     ASM_OUTPUT_ALIGN (f, 2);
8711 }
8712 
8713 /* Costs.  */
8714 
8715 /* Helper function for rtx cost calculation.  Strip a shift expression
8716    from X.  Returns the inner operand if successful, or the original
8717    expression on failure.  */
8718 static rtx
8719 aarch64_strip_shift (rtx x)
8720 {
8721   rtx op = x;
8722 
8723   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
8724      we can convert both to ROR during final output.  */
8725   if ((GET_CODE (op) == ASHIFT
8726        || GET_CODE (op) == ASHIFTRT
8727        || GET_CODE (op) == LSHIFTRT
8728        || GET_CODE (op) == ROTATERT
8729        || GET_CODE (op) == ROTATE)
8730       && CONST_INT_P (XEXP (op, 1)))
8731     return XEXP (op, 0);
8732 
8733   if (GET_CODE (op) == MULT
8734       && CONST_INT_P (XEXP (op, 1))
8735       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
8736     return XEXP (op, 0);
8737 
8738   return x;
8739 }
8740 
8741 /* Helper function for rtx cost calculation.  Strip an extend
8742    expression from X.  Returns the inner operand if successful, or the
8743    original expression on failure.  We deal with a number of possible
8744    canonicalization variations here. If STRIP_SHIFT is true, then
8745    we can strip off a shift also.  */
8746 static rtx
8747 aarch64_strip_extend (rtx x, bool strip_shift)
8748 {
8749   scalar_int_mode mode;
8750   rtx op = x;
8751 
8752   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
8753     return op;
8754 
8755   /* Zero and sign extraction of a widened value.  */
8756   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
8757       && XEXP (op, 2) == const0_rtx
8758       && GET_CODE (XEXP (op, 0)) == MULT
8759       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
8760 					 XEXP (op, 1)))
8761     return XEXP (XEXP (op, 0), 0);
8762 
8763   /* It can also be represented (for zero-extend) as an AND with an
8764      immediate.  */
8765   if (GET_CODE (op) == AND
8766       && GET_CODE (XEXP (op, 0)) == MULT
8767       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
8768       && CONST_INT_P (XEXP (op, 1))
8769       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
8770 			   INTVAL (XEXP (op, 1))) != 0)
8771     return XEXP (XEXP (op, 0), 0);
8772 
8773   /* Now handle extended register, as this may also have an optional
8774      left shift by 1..4.  */
8775   if (strip_shift
8776       && GET_CODE (op) == ASHIFT
8777       && CONST_INT_P (XEXP (op, 1))
8778       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
8779     op = XEXP (op, 0);
8780 
8781   if (GET_CODE (op) == ZERO_EXTEND
8782       || GET_CODE (op) == SIGN_EXTEND)
8783     op = XEXP (op, 0);
8784 
8785   if (op != x)
8786     return op;
8787 
8788   return x;
8789 }
8790 
8791 /* Return true iff CODE is a shift supported in combination
8792    with arithmetic instructions.  */
8793 
8794 static bool
8795 aarch64_shift_p (enum rtx_code code)
8796 {
8797   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
8798 }
8799 
8800 
8801 /* Return true iff X is a cheap shift without a sign extend. */
8802 
8803 static bool
8804 aarch64_cheap_mult_shift_p (rtx x)
8805 {
8806   rtx op0, op1;
8807 
8808   op0 = XEXP (x, 0);
8809   op1 = XEXP (x, 1);
8810 
8811   if (!(aarch64_tune_params.extra_tuning_flags
8812                       & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
8813     return false;
8814 
8815   if (GET_CODE (op0) == SIGN_EXTEND)
8816     return false;
8817 
8818   if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
8819       && UINTVAL (op1) <= 4)
8820     return true;
8821 
8822   if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
8823     return false;
8824 
8825   HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
8826 
8827   if (l2 > 0 && l2 <= 4)
8828     return true;
8829 
8830   return false;
8831 }
8832 
8833 /* Helper function for rtx cost calculation.  Calculate the cost of
8834    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
8835    Return the calculated cost of the expression, recursing manually in to
8836    operands where needed.  */
8837 
8838 static int
8839 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
8840 {
8841   rtx op0, op1;
8842   const struct cpu_cost_table *extra_cost
8843     = aarch64_tune_params.insn_extra_cost;
8844   int cost = 0;
8845   bool compound_p = (outer == PLUS || outer == MINUS);
8846   machine_mode mode = GET_MODE (x);
8847 
8848   gcc_checking_assert (code == MULT);
8849 
8850   op0 = XEXP (x, 0);
8851   op1 = XEXP (x, 1);
8852 
8853   if (VECTOR_MODE_P (mode))
8854     mode = GET_MODE_INNER (mode);
8855 
8856   /* Integer multiply/fma.  */
8857   if (GET_MODE_CLASS (mode) == MODE_INT)
8858     {
8859       /* The multiply will be canonicalized as a shift, cost it as such.  */
8860       if (aarch64_shift_p (GET_CODE (x))
8861 	  || (CONST_INT_P (op1)
8862 	      && exact_log2 (INTVAL (op1)) > 0))
8863 	{
8864 	  bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8865 	                   || GET_CODE (op0) == SIGN_EXTEND;
8866 	  if (speed)
8867 	    {
8868 	      if (compound_p)
8869 	        {
8870 		  /* If the shift is considered cheap,
8871 		     then don't add any cost. */
8872 		  if (aarch64_cheap_mult_shift_p (x))
8873 		    ;
8874 	          else if (REG_P (op1))
8875 		    /* ARITH + shift-by-register.  */
8876 		    cost += extra_cost->alu.arith_shift_reg;
8877 		  else if (is_extend)
8878 		    /* ARITH + extended register.  We don't have a cost field
8879 		       for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
8880 		    cost += extra_cost->alu.extend_arith;
8881 		  else
8882 		    /* ARITH + shift-by-immediate.  */
8883 		    cost += extra_cost->alu.arith_shift;
8884 		}
8885 	      else
8886 		/* LSL (immediate).  */
8887 	        cost += extra_cost->alu.shift;
8888 
8889 	    }
8890 	  /* Strip extends as we will have costed them in the case above.  */
8891 	  if (is_extend)
8892 	    op0 = aarch64_strip_extend (op0, true);
8893 
8894 	  cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8895 
8896 	  return cost;
8897 	}
8898 
8899       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
8900 	 compound and let the below cases handle it.  After all, MNEG is a
8901 	 special-case alias of MSUB.  */
8902       if (GET_CODE (op0) == NEG)
8903 	{
8904 	  op0 = XEXP (op0, 0);
8905 	  compound_p = true;
8906 	}
8907 
8908       /* Integer multiplies or FMAs have zero/sign extending variants.  */
8909       if ((GET_CODE (op0) == ZERO_EXTEND
8910 	   && GET_CODE (op1) == ZERO_EXTEND)
8911 	  || (GET_CODE (op0) == SIGN_EXTEND
8912 	      && GET_CODE (op1) == SIGN_EXTEND))
8913 	{
8914 	  cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8915 	  cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8916 
8917 	  if (speed)
8918 	    {
8919 	      if (compound_p)
8920 		/* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
8921 		cost += extra_cost->mult[0].extend_add;
8922 	      else
8923 		/* MUL/SMULL/UMULL.  */
8924 		cost += extra_cost->mult[0].extend;
8925 	    }
8926 
8927 	  return cost;
8928 	}
8929 
8930       /* This is either an integer multiply or a MADD.  In both cases
8931 	 we want to recurse and cost the operands.  */
8932       cost += rtx_cost (op0, mode, MULT, 0, speed);
8933       cost += rtx_cost (op1, mode, MULT, 1, speed);
8934 
8935       if (speed)
8936 	{
8937 	  if (compound_p)
8938 	    /* MADD/MSUB.  */
8939 	    cost += extra_cost->mult[mode == DImode].add;
8940 	  else
8941 	    /* MUL.  */
8942 	    cost += extra_cost->mult[mode == DImode].simple;
8943 	}
8944 
8945       return cost;
8946     }
8947   else
8948     {
8949       if (speed)
8950 	{
8951 	  /* Floating-point FMA/FMUL can also support negations of the
8952 	     operands, unless the rounding mode is upward or downward in
8953 	     which case FNMUL is different than FMUL with operand negation.  */
8954 	  bool neg0 = GET_CODE (op0) == NEG;
8955 	  bool neg1 = GET_CODE (op1) == NEG;
8956 	  if (compound_p || !flag_rounding_math || (neg0 && neg1))
8957 	    {
8958 	      if (neg0)
8959 		op0 = XEXP (op0, 0);
8960 	      if (neg1)
8961 		op1 = XEXP (op1, 0);
8962 	    }
8963 
8964 	  if (compound_p)
8965 	    /* FMADD/FNMADD/FNMSUB/FMSUB.  */
8966 	    cost += extra_cost->fp[mode == DFmode].fma;
8967 	  else
8968 	    /* FMUL/FNMUL.  */
8969 	    cost += extra_cost->fp[mode == DFmode].mult;
8970 	}
8971 
8972       cost += rtx_cost (op0, mode, MULT, 0, speed);
8973       cost += rtx_cost (op1, mode, MULT, 1, speed);
8974       return cost;
8975     }
8976 }
8977 
8978 static int
8979 aarch64_address_cost (rtx x,
8980 		      machine_mode mode,
8981 		      addr_space_t as ATTRIBUTE_UNUSED,
8982 		      bool speed)
8983 {
8984   enum rtx_code c = GET_CODE (x);
8985   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8986   struct aarch64_address_info info;
8987   int cost = 0;
8988   info.shift = 0;
8989 
8990   if (!aarch64_classify_address (&info, x, mode, false))
8991     {
8992       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8993 	{
8994 	  /* This is a CONST or SYMBOL ref which will be split
8995 	     in a different way depending on the code model in use.
8996 	     Cost it through the generic infrastructure.  */
8997 	  int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8998 	  /* Divide through by the cost of one instruction to
8999 	     bring it to the same units as the address costs.  */
9000 	  cost_symbol_ref /= COSTS_N_INSNS (1);
9001 	  /* The cost is then the cost of preparing the address,
9002 	     followed by an immediate (possibly 0) offset.  */
9003 	  return cost_symbol_ref + addr_cost->imm_offset;
9004 	}
9005       else
9006 	{
9007 	  /* This is most likely a jump table from a case
9008 	     statement.  */
9009 	  return addr_cost->register_offset;
9010 	}
9011     }
9012 
9013   switch (info.type)
9014     {
9015       case ADDRESS_LO_SUM:
9016       case ADDRESS_SYMBOLIC:
9017       case ADDRESS_REG_IMM:
9018 	cost += addr_cost->imm_offset;
9019 	break;
9020 
9021       case ADDRESS_REG_WB:
9022 	if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
9023 	  cost += addr_cost->pre_modify;
9024 	else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
9025 	  cost += addr_cost->post_modify;
9026 	else
9027 	  gcc_unreachable ();
9028 
9029 	break;
9030 
9031       case ADDRESS_REG_REG:
9032 	cost += addr_cost->register_offset;
9033 	break;
9034 
9035       case ADDRESS_REG_SXTW:
9036 	cost += addr_cost->register_sextend;
9037 	break;
9038 
9039       case ADDRESS_REG_UXTW:
9040 	cost += addr_cost->register_zextend;
9041 	break;
9042 
9043       default:
9044 	gcc_unreachable ();
9045     }
9046 
9047 
9048   if (info.shift > 0)
9049     {
9050       /* For the sake of calculating the cost of the shifted register
9051 	 component, we can treat same sized modes in the same way.  */
9052       if (known_eq (GET_MODE_BITSIZE (mode), 16))
9053 	cost += addr_cost->addr_scale_costs.hi;
9054       else if (known_eq (GET_MODE_BITSIZE (mode), 32))
9055 	cost += addr_cost->addr_scale_costs.si;
9056       else if (known_eq (GET_MODE_BITSIZE (mode), 64))
9057 	cost += addr_cost->addr_scale_costs.di;
9058       else
9059 	/* We can't tell, or this is a 128-bit vector.  */
9060 	cost += addr_cost->addr_scale_costs.ti;
9061     }
9062 
9063   return cost;
9064 }
9065 
9066 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
9067    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
9068    to be taken.  */
9069 
9070 int
9071 aarch64_branch_cost (bool speed_p, bool predictable_p)
9072 {
9073   /* When optimizing for speed, use the cost of unpredictable branches.  */
9074   const struct cpu_branch_cost *branch_costs =
9075     aarch64_tune_params.branch_costs;
9076 
9077   if (!speed_p || predictable_p)
9078     return branch_costs->predictable;
9079   else
9080     return branch_costs->unpredictable;
9081 }
9082 
9083 /* Return true if the RTX X in mode MODE is a zero or sign extract
9084    usable in an ADD or SUB (extended register) instruction.  */
9085 static bool
9086 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
9087 {
9088   /* Catch add with a sign extract.
9089      This is add_<optab><mode>_multp2.  */
9090   if (GET_CODE (x) == SIGN_EXTRACT
9091       || GET_CODE (x) == ZERO_EXTRACT)
9092     {
9093       rtx op0 = XEXP (x, 0);
9094       rtx op1 = XEXP (x, 1);
9095       rtx op2 = XEXP (x, 2);
9096 
9097       if (GET_CODE (op0) == MULT
9098 	  && CONST_INT_P (op1)
9099 	  && op2 == const0_rtx
9100 	  && CONST_INT_P (XEXP (op0, 1))
9101 	  && aarch64_is_extend_from_extract (mode,
9102 					     XEXP (op0, 1),
9103 					     op1))
9104 	{
9105 	  return true;
9106 	}
9107     }
9108   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
9109      No shift.  */
9110   else if (GET_CODE (x) == SIGN_EXTEND
9111 	   || GET_CODE (x) == ZERO_EXTEND)
9112     return REG_P (XEXP (x, 0));
9113 
9114   return false;
9115 }
9116 
9117 static bool
9118 aarch64_frint_unspec_p (unsigned int u)
9119 {
9120   switch (u)
9121     {
9122       case UNSPEC_FRINTZ:
9123       case UNSPEC_FRINTP:
9124       case UNSPEC_FRINTM:
9125       case UNSPEC_FRINTA:
9126       case UNSPEC_FRINTN:
9127       case UNSPEC_FRINTX:
9128       case UNSPEC_FRINTI:
9129         return true;
9130 
9131       default:
9132         return false;
9133     }
9134 }
9135 
9136 /* Return true iff X is an rtx that will match an extr instruction
9137    i.e. as described in the *extr<mode>5_insn family of patterns.
9138    OP0 and OP1 will be set to the operands of the shifts involved
9139    on success and will be NULL_RTX otherwise.  */
9140 
9141 static bool
9142 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
9143 {
9144   rtx op0, op1;
9145   scalar_int_mode mode;
9146   if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
9147     return false;
9148 
9149   *res_op0 = NULL_RTX;
9150   *res_op1 = NULL_RTX;
9151 
9152   if (GET_CODE (x) != IOR)
9153     return false;
9154 
9155   op0 = XEXP (x, 0);
9156   op1 = XEXP (x, 1);
9157 
9158   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
9159       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
9160     {
9161      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
9162       if (GET_CODE (op1) == ASHIFT)
9163         std::swap (op0, op1);
9164 
9165       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
9166         return false;
9167 
9168       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
9169       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
9170 
9171       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
9172           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
9173         {
9174           *res_op0 = XEXP (op0, 0);
9175           *res_op1 = XEXP (op1, 0);
9176           return true;
9177         }
9178     }
9179 
9180   return false;
9181 }
9182 
9183 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
9184    storing it in *COST.  Result is true if the total cost of the operation
9185    has now been calculated.  */
9186 static bool
9187 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
9188 {
9189   rtx inner;
9190   rtx comparator;
9191   enum rtx_code cmpcode;
9192 
9193   if (COMPARISON_P (op0))
9194     {
9195       inner = XEXP (op0, 0);
9196       comparator = XEXP (op0, 1);
9197       cmpcode = GET_CODE (op0);
9198     }
9199   else
9200     {
9201       inner = op0;
9202       comparator = const0_rtx;
9203       cmpcode = NE;
9204     }
9205 
9206   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
9207     {
9208       /* Conditional branch.  */
9209       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9210 	return true;
9211       else
9212 	{
9213 	  if (cmpcode == NE || cmpcode == EQ)
9214 	    {
9215 	      if (comparator == const0_rtx)
9216 		{
9217 		  /* TBZ/TBNZ/CBZ/CBNZ.  */
9218 		  if (GET_CODE (inner) == ZERO_EXTRACT)
9219 		    /* TBZ/TBNZ.  */
9220 		    *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
9221 				       ZERO_EXTRACT, 0, speed);
9222 		  else
9223 		    /* CBZ/CBNZ.  */
9224 		    *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
9225 
9226 	        return true;
9227 	      }
9228 	    }
9229 	  else if (cmpcode == LT || cmpcode == GE)
9230 	    {
9231 	      /* TBZ/TBNZ.  */
9232 	      if (comparator == const0_rtx)
9233 		return true;
9234 	    }
9235 	}
9236     }
9237   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
9238     {
9239       /* CCMP.  */
9240       if (GET_CODE (op1) == COMPARE)
9241 	{
9242 	  /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
9243 	  if (XEXP (op1, 1) == const0_rtx)
9244 	    *cost += 1;
9245 	  if (speed)
9246 	    {
9247 	      machine_mode mode = GET_MODE (XEXP (op1, 0));
9248 	      const struct cpu_cost_table *extra_cost
9249 		= aarch64_tune_params.insn_extra_cost;
9250 
9251 	      if (GET_MODE_CLASS (mode) == MODE_INT)
9252 		*cost += extra_cost->alu.arith;
9253 	      else
9254 		*cost += extra_cost->fp[mode == DFmode].compare;
9255 	    }
9256 	  return true;
9257 	}
9258 
9259       /* It's a conditional operation based on the status flags,
9260 	 so it must be some flavor of CSEL.  */
9261 
9262       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
9263       if (GET_CODE (op1) == NEG
9264           || GET_CODE (op1) == NOT
9265           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
9266 	op1 = XEXP (op1, 0);
9267       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
9268 	{
9269 	  /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
9270 	  op1 = XEXP (op1, 0);
9271 	  op2 = XEXP (op2, 0);
9272 	}
9273 
9274       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
9275       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
9276       return true;
9277     }
9278 
9279   /* We don't know what this is, cost all operands.  */
9280   return false;
9281 }
9282 
9283 /* Check whether X is a bitfield operation of the form shift + extend that
9284    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
9285    operand to which the bitfield operation is applied.  Otherwise return
9286    NULL_RTX.  */
9287 
9288 static rtx
9289 aarch64_extend_bitfield_pattern_p (rtx x)
9290 {
9291   rtx_code outer_code = GET_CODE (x);
9292   machine_mode outer_mode = GET_MODE (x);
9293 
9294   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
9295       && outer_mode != SImode && outer_mode != DImode)
9296     return NULL_RTX;
9297 
9298   rtx inner = XEXP (x, 0);
9299   rtx_code inner_code = GET_CODE (inner);
9300   machine_mode inner_mode = GET_MODE (inner);
9301   rtx op = NULL_RTX;
9302 
9303   switch (inner_code)
9304     {
9305       case ASHIFT:
9306 	if (CONST_INT_P (XEXP (inner, 1))
9307 	    && (inner_mode == QImode || inner_mode == HImode))
9308 	  op = XEXP (inner, 0);
9309 	break;
9310       case LSHIFTRT:
9311 	if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
9312 	    && (inner_mode == QImode || inner_mode == HImode))
9313 	  op = XEXP (inner, 0);
9314 	break;
9315       case ASHIFTRT:
9316 	if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
9317 	    && (inner_mode == QImode || inner_mode == HImode))
9318 	  op = XEXP (inner, 0);
9319 	break;
9320       default:
9321 	break;
9322     }
9323 
9324   return op;
9325 }
9326 
9327 /* Return true if the mask and a shift amount from an RTX of the form
9328    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
9329    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
9330 
9331 bool
9332 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
9333 				    rtx shft_amnt)
9334 {
9335   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
9336 	 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
9337 	 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
9338 	 && (INTVAL (mask)
9339 	     & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
9340 }
9341 
9342 /* Return true if the masks and a shift amount from an RTX of the form
9343    ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
9344    a BFI instruction of mode MODE.  See *arch64_bfi patterns.  */
9345 
9346 bool
9347 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
9348 				   unsigned HOST_WIDE_INT mask1,
9349 				   unsigned HOST_WIDE_INT shft_amnt,
9350 				   unsigned HOST_WIDE_INT mask2)
9351 {
9352   unsigned HOST_WIDE_INT t;
9353 
9354   /* Verify that there is no overlap in what bits are set in the two masks.  */
9355   if (mask1 != ~mask2)
9356     return false;
9357 
9358   /* Verify that mask2 is not all zeros or ones.  */
9359   if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
9360     return false;
9361 
9362   /* The shift amount should always be less than the mode size.  */
9363   gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
9364 
9365   /* Verify that the mask being shifted is contiguous and would be in the
9366      least significant bits after shifting by shft_amnt.  */
9367   t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
9368   return (t == (t & -t));
9369 }
9370 
9371 /* Calculate the cost of calculating X, storing it in *COST.  Result
9372    is true if the total cost of the operation has now been calculated.  */
9373 static bool
9374 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
9375 		   int param ATTRIBUTE_UNUSED, int *cost, bool speed)
9376 {
9377   rtx op0, op1, op2;
9378   const struct cpu_cost_table *extra_cost
9379     = aarch64_tune_params.insn_extra_cost;
9380   int code = GET_CODE (x);
9381   scalar_int_mode int_mode;
9382 
9383   /* By default, assume that everything has equivalent cost to the
9384      cheapest instruction.  Any additional costs are applied as a delta
9385      above this default.  */
9386   *cost = COSTS_N_INSNS (1);
9387 
9388   switch (code)
9389     {
9390     case SET:
9391       /* The cost depends entirely on the operands to SET.  */
9392       *cost = 0;
9393       op0 = SET_DEST (x);
9394       op1 = SET_SRC (x);
9395 
9396       switch (GET_CODE (op0))
9397 	{
9398 	case MEM:
9399 	  if (speed)
9400 	    {
9401 	      rtx address = XEXP (op0, 0);
9402 	      if (VECTOR_MODE_P (mode))
9403 		*cost += extra_cost->ldst.storev;
9404 	      else if (GET_MODE_CLASS (mode) == MODE_INT)
9405 		*cost += extra_cost->ldst.store;
9406 	      else if (mode == SFmode)
9407 		*cost += extra_cost->ldst.storef;
9408 	      else if (mode == DFmode)
9409 		*cost += extra_cost->ldst.stored;
9410 
9411 	      *cost +=
9412 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9413 						     0, speed));
9414 	    }
9415 
9416 	  *cost += rtx_cost (op1, mode, SET, 1, speed);
9417 	  return true;
9418 
9419 	case SUBREG:
9420 	  if (! REG_P (SUBREG_REG (op0)))
9421 	    *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
9422 
9423 	  /* Fall through.  */
9424 	case REG:
9425 	  /* The cost is one per vector-register copied.  */
9426 	  if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
9427 	    {
9428 	      int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
9429 	      *cost = COSTS_N_INSNS (nregs);
9430 	    }
9431 	  /* const0_rtx is in general free, but we will use an
9432 	     instruction to set a register to 0.  */
9433 	  else if (REG_P (op1) || op1 == const0_rtx)
9434 	    {
9435 	      /* The cost is 1 per register copied.  */
9436 	      int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
9437 	      *cost = COSTS_N_INSNS (nregs);
9438 	    }
9439           else
9440 	    /* Cost is just the cost of the RHS of the set.  */
9441 	    *cost += rtx_cost (op1, mode, SET, 1, speed);
9442 	  return true;
9443 
9444 	case ZERO_EXTRACT:
9445 	case SIGN_EXTRACT:
9446 	  /* Bit-field insertion.  Strip any redundant widening of
9447 	     the RHS to meet the width of the target.  */
9448 	  if (GET_CODE (op1) == SUBREG)
9449 	    op1 = SUBREG_REG (op1);
9450 	  if ((GET_CODE (op1) == ZERO_EXTEND
9451 	       || GET_CODE (op1) == SIGN_EXTEND)
9452 	      && CONST_INT_P (XEXP (op0, 1))
9453 	      && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
9454 	      && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
9455 	    op1 = XEXP (op1, 0);
9456 
9457           if (CONST_INT_P (op1))
9458             {
9459               /* MOV immediate is assumed to always be cheap.  */
9460               *cost = COSTS_N_INSNS (1);
9461             }
9462           else
9463             {
9464               /* BFM.  */
9465 	      if (speed)
9466 		*cost += extra_cost->alu.bfi;
9467 	      *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
9468             }
9469 
9470 	  return true;
9471 
9472 	default:
9473 	  /* We can't make sense of this, assume default cost.  */
9474           *cost = COSTS_N_INSNS (1);
9475 	  return false;
9476 	}
9477       return false;
9478 
9479     case CONST_INT:
9480       /* If an instruction can incorporate a constant within the
9481 	 instruction, the instruction's expression avoids calling
9482 	 rtx_cost() on the constant.  If rtx_cost() is called on a
9483 	 constant, then it is usually because the constant must be
9484 	 moved into a register by one or more instructions.
9485 
9486 	 The exception is constant 0, which can be expressed
9487 	 as XZR/WZR and is therefore free.  The exception to this is
9488 	 if we have (set (reg) (const0_rtx)) in which case we must cost
9489 	 the move.  However, we can catch that when we cost the SET, so
9490 	 we don't need to consider that here.  */
9491       if (x == const0_rtx)
9492 	*cost = 0;
9493       else
9494 	{
9495 	  /* To an approximation, building any other constant is
9496 	     proportionally expensive to the number of instructions
9497 	     required to build that constant.  This is true whether we
9498 	     are compiling for SPEED or otherwise.  */
9499 	  if (!is_a <scalar_int_mode> (mode, &int_mode))
9500 	    int_mode = word_mode;
9501 	  *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
9502 				 (NULL_RTX, x, false, int_mode));
9503 	}
9504       return true;
9505 
9506     case CONST_DOUBLE:
9507 
9508       /* First determine number of instructions to do the move
9509 	  as an integer constant.  */
9510       if (!aarch64_float_const_representable_p (x)
9511 	   && !aarch64_can_const_movi_rtx_p (x, mode)
9512 	   && aarch64_float_const_rtx_p (x))
9513 	{
9514 	  unsigned HOST_WIDE_INT ival;
9515 	  bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
9516 	  gcc_assert (succeed);
9517 
9518 	  scalar_int_mode imode = (mode == HFmode
9519 				   ? SImode
9520 				   : int_mode_for_mode (mode).require ());
9521 	  int ncost = aarch64_internal_mov_immediate
9522 		(NULL_RTX, gen_int_mode (ival, imode), false, imode);
9523 	  *cost += COSTS_N_INSNS (ncost);
9524 	  return true;
9525 	}
9526 
9527       if (speed)
9528 	{
9529 	  /* mov[df,sf]_aarch64.  */
9530 	  if (aarch64_float_const_representable_p (x))
9531 	    /* FMOV (scalar immediate).  */
9532 	    *cost += extra_cost->fp[mode == DFmode].fpconst;
9533 	  else if (!aarch64_float_const_zero_rtx_p (x))
9534 	    {
9535 	      /* This will be a load from memory.  */
9536 	      if (mode == DFmode)
9537 		*cost += extra_cost->ldst.loadd;
9538 	      else
9539 		*cost += extra_cost->ldst.loadf;
9540 	    }
9541 	  else
9542 	    /* Otherwise this is +0.0.  We get this using MOVI d0, #0
9543 	       or MOV v0.s[0], wzr - neither of which are modeled by the
9544 	       cost tables.  Just use the default cost.  */
9545 	    {
9546 	    }
9547 	}
9548 
9549       return true;
9550 
9551     case MEM:
9552       if (speed)
9553 	{
9554 	  /* For loads we want the base cost of a load, plus an
9555 	     approximation for the additional cost of the addressing
9556 	     mode.  */
9557 	  rtx address = XEXP (x, 0);
9558 	  if (VECTOR_MODE_P (mode))
9559 	    *cost += extra_cost->ldst.loadv;
9560 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
9561 	    *cost += extra_cost->ldst.load;
9562 	  else if (mode == SFmode)
9563 	    *cost += extra_cost->ldst.loadf;
9564 	  else if (mode == DFmode)
9565 	    *cost += extra_cost->ldst.loadd;
9566 
9567 	  *cost +=
9568 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
9569 						     0, speed));
9570 	}
9571 
9572       return true;
9573 
9574     case NEG:
9575       op0 = XEXP (x, 0);
9576 
9577       if (VECTOR_MODE_P (mode))
9578 	{
9579 	  if (speed)
9580 	    {
9581 	      /* FNEG.  */
9582 	      *cost += extra_cost->vect.alu;
9583 	    }
9584 	  return false;
9585 	}
9586 
9587       if (GET_MODE_CLASS (mode) == MODE_INT)
9588 	{
9589           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9590               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9591             {
9592               /* CSETM.  */
9593 	      *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
9594               return true;
9595             }
9596 
9597 	  /* Cost this as SUB wzr, X.  */
9598           op0 = CONST0_RTX (mode);
9599           op1 = XEXP (x, 0);
9600           goto cost_minus;
9601         }
9602 
9603       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9604         {
9605           /* Support (neg(fma...)) as a single instruction only if
9606              sign of zeros is unimportant.  This matches the decision
9607              making in aarch64.md.  */
9608           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
9609             {
9610 	      /* FNMADD.  */
9611 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
9612               return true;
9613             }
9614 	  if (GET_CODE (op0) == MULT)
9615 	    {
9616 	      /* FNMUL.  */
9617 	      *cost = rtx_cost (op0, mode, NEG, 0, speed);
9618 	      return true;
9619 	    }
9620 	  if (speed)
9621 	    /* FNEG.  */
9622 	    *cost += extra_cost->fp[mode == DFmode].neg;
9623           return false;
9624         }
9625 
9626       return false;
9627 
9628     case CLRSB:
9629     case CLZ:
9630       if (speed)
9631 	{
9632 	  if (VECTOR_MODE_P (mode))
9633 	    *cost += extra_cost->vect.alu;
9634 	  else
9635 	    *cost += extra_cost->alu.clz;
9636 	}
9637 
9638       return false;
9639 
9640     case COMPARE:
9641       op0 = XEXP (x, 0);
9642       op1 = XEXP (x, 1);
9643 
9644       if (op1 == const0_rtx
9645 	  && GET_CODE (op0) == AND)
9646 	{
9647 	  x = op0;
9648 	  mode = GET_MODE (op0);
9649 	  goto cost_logic;
9650 	}
9651 
9652       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
9653         {
9654           /* TODO: A write to the CC flags possibly costs extra, this
9655 	     needs encoding in the cost tables.  */
9656 
9657 	  mode = GET_MODE (op0);
9658           /* ANDS.  */
9659           if (GET_CODE (op0) == AND)
9660             {
9661               x = op0;
9662               goto cost_logic;
9663             }
9664 
9665           if (GET_CODE (op0) == PLUS)
9666             {
9667 	      /* ADDS (and CMN alias).  */
9668               x = op0;
9669               goto cost_plus;
9670             }
9671 
9672           if (GET_CODE (op0) == MINUS)
9673             {
9674 	      /* SUBS.  */
9675               x = op0;
9676               goto cost_minus;
9677             }
9678 
9679 	  if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
9680 	      && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
9681 	      && CONST_INT_P (XEXP (op0, 2)))
9682 	    {
9683 	      /* COMPARE of ZERO_EXTRACT form of TST-immediate.
9684 		 Handle it here directly rather than going to cost_logic
9685 		 since we know the immediate generated for the TST is valid
9686 		 so we can avoid creating an intermediate rtx for it only
9687 		 for costing purposes.  */
9688 	      if (speed)
9689 		*cost += extra_cost->alu.logical;
9690 
9691 	      *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
9692 				 ZERO_EXTRACT, 0, speed);
9693 	      return true;
9694 	    }
9695 
9696           if (GET_CODE (op1) == NEG)
9697             {
9698 	      /* CMN.  */
9699 	      if (speed)
9700 		*cost += extra_cost->alu.arith;
9701 
9702 	      *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
9703 	      *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
9704               return true;
9705             }
9706 
9707           /* CMP.
9708 
9709 	     Compare can freely swap the order of operands, and
9710              canonicalization puts the more complex operation first.
9711              But the integer MINUS logic expects the shift/extend
9712              operation in op1.  */
9713           if (! (REG_P (op0)
9714                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
9715           {
9716             op0 = XEXP (x, 1);
9717             op1 = XEXP (x, 0);
9718           }
9719           goto cost_minus;
9720         }
9721 
9722       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
9723         {
9724 	  /* FCMP.  */
9725 	  if (speed)
9726 	    *cost += extra_cost->fp[mode == DFmode].compare;
9727 
9728           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
9729             {
9730 	      *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
9731               /* FCMP supports constant 0.0 for no extra cost. */
9732               return true;
9733             }
9734           return false;
9735         }
9736 
9737       if (VECTOR_MODE_P (mode))
9738 	{
9739 	  /* Vector compare.  */
9740 	  if (speed)
9741 	    *cost += extra_cost->vect.alu;
9742 
9743 	  if (aarch64_float_const_zero_rtx_p (op1))
9744 	    {
9745 	      /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
9746 		 cost.  */
9747 	      return true;
9748 	    }
9749 	  return false;
9750 	}
9751       return false;
9752 
9753     case MINUS:
9754       {
9755 	op0 = XEXP (x, 0);
9756 	op1 = XEXP (x, 1);
9757 
9758 cost_minus:
9759 	*cost += rtx_cost (op0, mode, MINUS, 0, speed);
9760 
9761 	/* Detect valid immediates.  */
9762 	if ((GET_MODE_CLASS (mode) == MODE_INT
9763 	     || (GET_MODE_CLASS (mode) == MODE_CC
9764 		 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
9765 	    && CONST_INT_P (op1)
9766 	    && aarch64_uimm12_shift (INTVAL (op1)))
9767 	  {
9768 	    if (speed)
9769 	      /* SUB(S) (immediate).  */
9770 	      *cost += extra_cost->alu.arith;
9771 	    return true;
9772 	  }
9773 
9774 	/* Look for SUB (extended register).  */
9775 	if (is_a <scalar_int_mode> (mode, &int_mode)
9776 	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
9777 	  {
9778 	    if (speed)
9779 	      *cost += extra_cost->alu.extend_arith;
9780 
9781 	    op1 = aarch64_strip_extend (op1, true);
9782 	    *cost += rtx_cost (op1, VOIDmode,
9783 			       (enum rtx_code) GET_CODE (op1), 0, speed);
9784 	    return true;
9785 	  }
9786 
9787 	rtx new_op1 = aarch64_strip_extend (op1, false);
9788 
9789 	/* Cost this as an FMA-alike operation.  */
9790 	if ((GET_CODE (new_op1) == MULT
9791 	     || aarch64_shift_p (GET_CODE (new_op1)))
9792 	    && code != COMPARE)
9793 	  {
9794 	    *cost += aarch64_rtx_mult_cost (new_op1, MULT,
9795 					    (enum rtx_code) code,
9796 					    speed);
9797 	    return true;
9798 	  }
9799 
9800 	*cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
9801 
9802 	if (speed)
9803 	  {
9804 	    if (VECTOR_MODE_P (mode))
9805 	      {
9806 		/* Vector SUB.  */
9807 		*cost += extra_cost->vect.alu;
9808 	      }
9809 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
9810 	      {
9811 		/* SUB(S).  */
9812 		*cost += extra_cost->alu.arith;
9813 	      }
9814 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9815 	      {
9816 		/* FSUB.  */
9817 		*cost += extra_cost->fp[mode == DFmode].addsub;
9818 	      }
9819 	  }
9820 	return true;
9821       }
9822 
9823     case PLUS:
9824       {
9825 	rtx new_op0;
9826 
9827 	op0 = XEXP (x, 0);
9828 	op1 = XEXP (x, 1);
9829 
9830 cost_plus:
9831 	if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
9832 	    || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
9833 	  {
9834 	    /* CSINC.  */
9835 	    *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
9836 	    *cost += rtx_cost (op1, mode, PLUS, 1, speed);
9837 	    return true;
9838 	  }
9839 
9840 	if (GET_MODE_CLASS (mode) == MODE_INT
9841 	    && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
9842 		|| aarch64_sve_addvl_addpl_immediate (op1, mode)))
9843 	  {
9844 	    *cost += rtx_cost (op0, mode, PLUS, 0, speed);
9845 
9846 	    if (speed)
9847 	      /* ADD (immediate).  */
9848 	      *cost += extra_cost->alu.arith;
9849 	    return true;
9850 	  }
9851 
9852 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
9853 
9854 	/* Look for ADD (extended register).  */
9855 	if (is_a <scalar_int_mode> (mode, &int_mode)
9856 	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
9857 	  {
9858 	    if (speed)
9859 	      *cost += extra_cost->alu.extend_arith;
9860 
9861 	    op0 = aarch64_strip_extend (op0, true);
9862 	    *cost += rtx_cost (op0, VOIDmode,
9863 			       (enum rtx_code) GET_CODE (op0), 0, speed);
9864 	    return true;
9865 	  }
9866 
9867 	/* Strip any extend, leave shifts behind as we will
9868 	   cost them through mult_cost.  */
9869 	new_op0 = aarch64_strip_extend (op0, false);
9870 
9871 	if (GET_CODE (new_op0) == MULT
9872 	    || aarch64_shift_p (GET_CODE (new_op0)))
9873 	  {
9874 	    *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
9875 					    speed);
9876 	    return true;
9877 	  }
9878 
9879 	*cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
9880 
9881 	if (speed)
9882 	  {
9883 	    if (VECTOR_MODE_P (mode))
9884 	      {
9885 		/* Vector ADD.  */
9886 		*cost += extra_cost->vect.alu;
9887 	      }
9888 	    else if (GET_MODE_CLASS (mode) == MODE_INT)
9889 	      {
9890 		/* ADD.  */
9891 		*cost += extra_cost->alu.arith;
9892 	      }
9893 	    else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9894 	      {
9895 		/* FADD.  */
9896 		*cost += extra_cost->fp[mode == DFmode].addsub;
9897 	      }
9898 	  }
9899 	return true;
9900       }
9901 
9902     case BSWAP:
9903       *cost = COSTS_N_INSNS (1);
9904 
9905       if (speed)
9906 	{
9907 	  if (VECTOR_MODE_P (mode))
9908 	    *cost += extra_cost->vect.alu;
9909 	  else
9910 	    *cost += extra_cost->alu.rev;
9911 	}
9912       return false;
9913 
9914     case IOR:
9915       if (aarch_rev16_p (x))
9916         {
9917           *cost = COSTS_N_INSNS (1);
9918 
9919 	  if (speed)
9920 	    {
9921 	      if (VECTOR_MODE_P (mode))
9922 		*cost += extra_cost->vect.alu;
9923 	      else
9924 		*cost += extra_cost->alu.rev;
9925 	    }
9926 	  return true;
9927         }
9928 
9929       if (aarch64_extr_rtx_p (x, &op0, &op1))
9930         {
9931 	  *cost += rtx_cost (op0, mode, IOR, 0, speed);
9932 	  *cost += rtx_cost (op1, mode, IOR, 1, speed);
9933           if (speed)
9934             *cost += extra_cost->alu.shift;
9935 
9936           return true;
9937         }
9938     /* Fall through.  */
9939     case XOR:
9940     case AND:
9941     cost_logic:
9942       op0 = XEXP (x, 0);
9943       op1 = XEXP (x, 1);
9944 
9945       if (VECTOR_MODE_P (mode))
9946 	{
9947 	  if (speed)
9948 	    *cost += extra_cost->vect.alu;
9949 	  return true;
9950 	}
9951 
9952       if (code == AND
9953           && GET_CODE (op0) == MULT
9954           && CONST_INT_P (XEXP (op0, 1))
9955           && CONST_INT_P (op1)
9956           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9957                                INTVAL (op1)) != 0)
9958         {
9959           /* This is a UBFM/SBFM.  */
9960 	  *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9961 	  if (speed)
9962 	    *cost += extra_cost->alu.bfx;
9963           return true;
9964         }
9965 
9966       if (is_int_mode (mode, &int_mode))
9967 	{
9968 	  if (CONST_INT_P (op1))
9969 	    {
9970 	      /* We have a mask + shift version of a UBFIZ
9971 		 i.e. the *andim_ashift<mode>_bfiz pattern.  */
9972 	      if (GET_CODE (op0) == ASHIFT
9973 		  && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9974 							 XEXP (op0, 1)))
9975 		{
9976 		  *cost += rtx_cost (XEXP (op0, 0), int_mode,
9977 				     (enum rtx_code) code, 0, speed);
9978 		  if (speed)
9979 		    *cost += extra_cost->alu.bfx;
9980 
9981 		  return true;
9982 		}
9983 	      else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9984 		{
9985 		/* We possibly get the immediate for free, this is not
9986 		   modelled.  */
9987 		  *cost += rtx_cost (op0, int_mode,
9988 				     (enum rtx_code) code, 0, speed);
9989 		  if (speed)
9990 		    *cost += extra_cost->alu.logical;
9991 
9992 		  return true;
9993 		}
9994 	    }
9995 	  else
9996 	    {
9997 	      rtx new_op0 = op0;
9998 
9999 	      /* Handle ORN, EON, or BIC.  */
10000 	      if (GET_CODE (op0) == NOT)
10001 		op0 = XEXP (op0, 0);
10002 
10003 	      new_op0 = aarch64_strip_shift (op0);
10004 
10005 	      /* If we had a shift on op0 then this is a logical-shift-
10006 		 by-register/immediate operation.  Otherwise, this is just
10007 		 a logical operation.  */
10008 	      if (speed)
10009 		{
10010 		  if (new_op0 != op0)
10011 		    {
10012 		      /* Shift by immediate.  */
10013 		      if (CONST_INT_P (XEXP (op0, 1)))
10014 			*cost += extra_cost->alu.log_shift;
10015 		      else
10016 			*cost += extra_cost->alu.log_shift_reg;
10017 		    }
10018 		  else
10019 		    *cost += extra_cost->alu.logical;
10020 		}
10021 
10022 	      /* In both cases we want to cost both operands.  */
10023 	      *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
10024 				 0, speed);
10025 	      *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
10026 				 1, speed);
10027 
10028 	      return true;
10029 	    }
10030 	}
10031       return false;
10032 
10033     case NOT:
10034       x = XEXP (x, 0);
10035       op0 = aarch64_strip_shift (x);
10036 
10037       if (VECTOR_MODE_P (mode))
10038 	{
10039 	  /* Vector NOT.  */
10040 	  *cost += extra_cost->vect.alu;
10041 	  return false;
10042 	}
10043 
10044       /* MVN-shifted-reg.  */
10045       if (op0 != x)
10046         {
10047 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10048 
10049           if (speed)
10050             *cost += extra_cost->alu.log_shift;
10051 
10052           return true;
10053         }
10054       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
10055          Handle the second form here taking care that 'a' in the above can
10056          be a shift.  */
10057       else if (GET_CODE (op0) == XOR)
10058         {
10059           rtx newop0 = XEXP (op0, 0);
10060           rtx newop1 = XEXP (op0, 1);
10061           rtx op0_stripped = aarch64_strip_shift (newop0);
10062 
10063 	  *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
10064 	  *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
10065 
10066           if (speed)
10067             {
10068               if (op0_stripped != newop0)
10069                 *cost += extra_cost->alu.log_shift;
10070               else
10071                 *cost += extra_cost->alu.logical;
10072             }
10073 
10074           return true;
10075         }
10076       /* MVN.  */
10077       if (speed)
10078 	*cost += extra_cost->alu.logical;
10079 
10080       return false;
10081 
10082     case ZERO_EXTEND:
10083 
10084       op0 = XEXP (x, 0);
10085       /* If a value is written in SI mode, then zero extended to DI
10086 	 mode, the operation will in general be free as a write to
10087 	 a 'w' register implicitly zeroes the upper bits of an 'x'
10088 	 register.  However, if this is
10089 
10090 	   (set (reg) (zero_extend (reg)))
10091 
10092 	 we must cost the explicit register move.  */
10093       if (mode == DImode
10094 	  && GET_MODE (op0) == SImode
10095 	  && outer == SET)
10096 	{
10097 	  int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
10098 
10099 	/* If OP_COST is non-zero, then the cost of the zero extend
10100 	   is effectively the cost of the inner operation.  Otherwise
10101 	   we have a MOV instruction and we take the cost from the MOV
10102 	   itself.  This is true independently of whether we are
10103 	   optimizing for space or time.  */
10104 	  if (op_cost)
10105 	    *cost = op_cost;
10106 
10107 	  return true;
10108 	}
10109       else if (MEM_P (op0))
10110 	{
10111 	  /* All loads can zero extend to any size for free.  */
10112 	  *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
10113 	  return true;
10114 	}
10115 
10116       op0 = aarch64_extend_bitfield_pattern_p (x);
10117       if (op0)
10118 	{
10119 	  *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
10120 	  if (speed)
10121 	    *cost += extra_cost->alu.bfx;
10122 	  return true;
10123 	}
10124 
10125       if (speed)
10126 	{
10127 	  if (VECTOR_MODE_P (mode))
10128 	    {
10129 	      /* UMOV.  */
10130 	      *cost += extra_cost->vect.alu;
10131 	    }
10132 	  else
10133 	    {
10134 	      /* We generate an AND instead of UXTB/UXTH.  */
10135 	      *cost += extra_cost->alu.logical;
10136 	    }
10137 	}
10138       return false;
10139 
10140     case SIGN_EXTEND:
10141       if (MEM_P (XEXP (x, 0)))
10142 	{
10143 	  /* LDRSH.  */
10144 	  if (speed)
10145 	    {
10146 	      rtx address = XEXP (XEXP (x, 0), 0);
10147 	      *cost += extra_cost->ldst.load_sign_extend;
10148 
10149 	      *cost +=
10150 		COSTS_N_INSNS (aarch64_address_cost (address, mode,
10151 						     0, speed));
10152 	    }
10153 	  return true;
10154 	}
10155 
10156       op0 = aarch64_extend_bitfield_pattern_p (x);
10157       if (op0)
10158 	{
10159 	  *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
10160 	  if (speed)
10161 	    *cost += extra_cost->alu.bfx;
10162 	  return true;
10163 	}
10164 
10165       if (speed)
10166 	{
10167 	  if (VECTOR_MODE_P (mode))
10168 	    *cost += extra_cost->vect.alu;
10169 	  else
10170 	    *cost += extra_cost->alu.extend;
10171 	}
10172       return false;
10173 
10174     case ASHIFT:
10175       op0 = XEXP (x, 0);
10176       op1 = XEXP (x, 1);
10177 
10178       if (CONST_INT_P (op1))
10179         {
10180 	  if (speed)
10181 	    {
10182 	      if (VECTOR_MODE_P (mode))
10183 		{
10184 		  /* Vector shift (immediate).  */
10185 		  *cost += extra_cost->vect.alu;
10186 		}
10187 	      else
10188 		{
10189 		  /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
10190 		     aliases.  */
10191 		  *cost += extra_cost->alu.shift;
10192 		}
10193 	    }
10194 
10195           /* We can incorporate zero/sign extend for free.  */
10196           if (GET_CODE (op0) == ZERO_EXTEND
10197               || GET_CODE (op0) == SIGN_EXTEND)
10198             op0 = XEXP (op0, 0);
10199 
10200 	  *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
10201           return true;
10202         }
10203       else
10204         {
10205 	  if (VECTOR_MODE_P (mode))
10206 	    {
10207 	      if (speed)
10208 		/* Vector shift (register).  */
10209 		*cost += extra_cost->vect.alu;
10210 	    }
10211 	  else
10212 	    {
10213 	      if (speed)
10214 		/* LSLV.  */
10215 		*cost += extra_cost->alu.shift_reg;
10216 
10217 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10218 		  && CONST_INT_P (XEXP (op1, 1))
10219 		  && known_eq (INTVAL (XEXP (op1, 1)),
10220 			       GET_MODE_BITSIZE (mode) - 1))
10221 		{
10222 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10223 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
10224 		     don't recurse into it.  */
10225 		  return true;
10226 		}
10227 	    }
10228 	  return false;  /* All arguments need to be in registers.  */
10229         }
10230 
10231     case ROTATE:
10232     case ROTATERT:
10233     case LSHIFTRT:
10234     case ASHIFTRT:
10235       op0 = XEXP (x, 0);
10236       op1 = XEXP (x, 1);
10237 
10238       if (CONST_INT_P (op1))
10239 	{
10240 	  /* ASR (immediate) and friends.  */
10241 	  if (speed)
10242 	    {
10243 	      if (VECTOR_MODE_P (mode))
10244 		*cost += extra_cost->vect.alu;
10245 	      else
10246 		*cost += extra_cost->alu.shift;
10247 	    }
10248 
10249 	  *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
10250 	  return true;
10251 	}
10252       else
10253 	{
10254 	  if (VECTOR_MODE_P (mode))
10255 	    {
10256 	      if (speed)
10257 		/* Vector shift (register).  */
10258 		*cost += extra_cost->vect.alu;
10259 	    }
10260 	  else
10261 	    {
10262 	      if (speed)
10263 		/* ASR (register) and friends.  */
10264 		*cost += extra_cost->alu.shift_reg;
10265 
10266 	      if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
10267 		  && CONST_INT_P (XEXP (op1, 1))
10268 		  && known_eq (INTVAL (XEXP (op1, 1)),
10269 			       GET_MODE_BITSIZE (mode) - 1))
10270 		{
10271 		  *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
10272 		  /* We already demanded XEXP (op1, 0) to be REG_P, so
10273 		     don't recurse into it.  */
10274 		  return true;
10275 		}
10276 	    }
10277 	  return false;  /* All arguments need to be in registers.  */
10278 	}
10279 
10280     case SYMBOL_REF:
10281 
10282       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
10283 	  || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
10284 	{
10285 	  /* LDR.  */
10286 	  if (speed)
10287 	    *cost += extra_cost->ldst.load;
10288 	}
10289       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
10290 	       || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
10291 	{
10292 	  /* ADRP, followed by ADD.  */
10293 	  *cost += COSTS_N_INSNS (1);
10294 	  if (speed)
10295 	    *cost += 2 * extra_cost->alu.arith;
10296 	}
10297       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
10298 	       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10299 	{
10300 	  /* ADR.  */
10301 	  if (speed)
10302 	    *cost += extra_cost->alu.arith;
10303 	}
10304 
10305       if (flag_pic)
10306 	{
10307 	  /* One extra load instruction, after accessing the GOT.  */
10308 	  *cost += COSTS_N_INSNS (1);
10309 	  if (speed)
10310 	    *cost += extra_cost->ldst.load;
10311 	}
10312       return true;
10313 
10314     case HIGH:
10315     case LO_SUM:
10316       /* ADRP/ADD (immediate).  */
10317       if (speed)
10318 	*cost += extra_cost->alu.arith;
10319       return true;
10320 
10321     case ZERO_EXTRACT:
10322     case SIGN_EXTRACT:
10323       /* UBFX/SBFX.  */
10324       if (speed)
10325 	{
10326 	  if (VECTOR_MODE_P (mode))
10327 	    *cost += extra_cost->vect.alu;
10328 	  else
10329 	    *cost += extra_cost->alu.bfx;
10330 	}
10331 
10332       /* We can trust that the immediates used will be correct (there
10333 	 are no by-register forms), so we need only cost op0.  */
10334       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
10335       return true;
10336 
10337     case MULT:
10338       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
10339       /* aarch64_rtx_mult_cost always handles recursion to its
10340 	 operands.  */
10341       return true;
10342 
10343     case MOD:
10344     /* We can expand signed mod by power of 2 using a NEGS, two parallel
10345        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
10346        an unconditional negate.  This case should only ever be reached through
10347        the set_smod_pow2_cheap check in expmed.c.  */
10348       if (CONST_INT_P (XEXP (x, 1))
10349 	  && exact_log2 (INTVAL (XEXP (x, 1))) > 0
10350 	  && (mode == SImode || mode == DImode))
10351 	{
10352 	  /* We expand to 4 instructions.  Reset the baseline.  */
10353 	  *cost = COSTS_N_INSNS (4);
10354 
10355 	  if (speed)
10356 	    *cost += 2 * extra_cost->alu.logical
10357 		     + 2 * extra_cost->alu.arith;
10358 
10359 	  return true;
10360 	}
10361 
10362     /* Fall-through.  */
10363     case UMOD:
10364       if (speed)
10365 	{
10366 	  /* Slighly prefer UMOD over SMOD.  */
10367 	  if (VECTOR_MODE_P (mode))
10368 	    *cost += extra_cost->vect.alu;
10369 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
10370 	    *cost += (extra_cost->mult[mode == DImode].add
10371 		      + extra_cost->mult[mode == DImode].idiv
10372 		      + (code == MOD ? 1 : 0));
10373 	}
10374       return false;  /* All arguments need to be in registers.  */
10375 
10376     case DIV:
10377     case UDIV:
10378     case SQRT:
10379       if (speed)
10380 	{
10381 	  if (VECTOR_MODE_P (mode))
10382 	    *cost += extra_cost->vect.alu;
10383 	  else if (GET_MODE_CLASS (mode) == MODE_INT)
10384 	    /* There is no integer SQRT, so only DIV and UDIV can get
10385 	       here.  */
10386 	    *cost += (extra_cost->mult[mode == DImode].idiv
10387 		     /* Slighly prefer UDIV over SDIV.  */
10388 		     + (code == DIV ? 1 : 0));
10389 	  else
10390 	    *cost += extra_cost->fp[mode == DFmode].div;
10391 	}
10392       return false;  /* All arguments need to be in registers.  */
10393 
10394     case IF_THEN_ELSE:
10395       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
10396 					 XEXP (x, 2), cost, speed);
10397 
10398     case EQ:
10399     case NE:
10400     case GT:
10401     case GTU:
10402     case LT:
10403     case LTU:
10404     case GE:
10405     case GEU:
10406     case LE:
10407     case LEU:
10408 
10409       return false; /* All arguments must be in registers.  */
10410 
10411     case FMA:
10412       op0 = XEXP (x, 0);
10413       op1 = XEXP (x, 1);
10414       op2 = XEXP (x, 2);
10415 
10416       if (speed)
10417 	{
10418 	  if (VECTOR_MODE_P (mode))
10419 	    *cost += extra_cost->vect.alu;
10420 	  else
10421 	    *cost += extra_cost->fp[mode == DFmode].fma;
10422 	}
10423 
10424       /* FMSUB, FNMADD, and FNMSUB are free.  */
10425       if (GET_CODE (op0) == NEG)
10426         op0 = XEXP (op0, 0);
10427 
10428       if (GET_CODE (op2) == NEG)
10429         op2 = XEXP (op2, 0);
10430 
10431       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
10432 	 and the by-element operand as operand 0.  */
10433       if (GET_CODE (op1) == NEG)
10434         op1 = XEXP (op1, 0);
10435 
10436       /* Catch vector-by-element operations.  The by-element operand can
10437 	 either be (vec_duplicate (vec_select (x))) or just
10438 	 (vec_select (x)), depending on whether we are multiplying by
10439 	 a vector or a scalar.
10440 
10441 	 Canonicalization is not very good in these cases, FMA4 will put the
10442 	 by-element operand as operand 0, FNMA4 will have it as operand 1.  */
10443       if (GET_CODE (op0) == VEC_DUPLICATE)
10444 	op0 = XEXP (op0, 0);
10445       else if (GET_CODE (op1) == VEC_DUPLICATE)
10446 	op1 = XEXP (op1, 0);
10447 
10448       if (GET_CODE (op0) == VEC_SELECT)
10449 	op0 = XEXP (op0, 0);
10450       else if (GET_CODE (op1) == VEC_SELECT)
10451 	op1 = XEXP (op1, 0);
10452 
10453       /* If the remaining parameters are not registers,
10454          get the cost to put them into registers.  */
10455       *cost += rtx_cost (op0, mode, FMA, 0, speed);
10456       *cost += rtx_cost (op1, mode, FMA, 1, speed);
10457       *cost += rtx_cost (op2, mode, FMA, 2, speed);
10458       return true;
10459 
10460     case FLOAT:
10461     case UNSIGNED_FLOAT:
10462       if (speed)
10463 	*cost += extra_cost->fp[mode == DFmode].fromint;
10464       return false;
10465 
10466     case FLOAT_EXTEND:
10467       if (speed)
10468 	{
10469 	  if (VECTOR_MODE_P (mode))
10470 	    {
10471 	      /*Vector truncate.  */
10472 	      *cost += extra_cost->vect.alu;
10473 	    }
10474 	  else
10475 	    *cost += extra_cost->fp[mode == DFmode].widen;
10476 	}
10477       return false;
10478 
10479     case FLOAT_TRUNCATE:
10480       if (speed)
10481 	{
10482 	  if (VECTOR_MODE_P (mode))
10483 	    {
10484 	      /*Vector conversion.  */
10485 	      *cost += extra_cost->vect.alu;
10486 	    }
10487 	  else
10488 	    *cost += extra_cost->fp[mode == DFmode].narrow;
10489 	}
10490       return false;
10491 
10492     case FIX:
10493     case UNSIGNED_FIX:
10494       x = XEXP (x, 0);
10495       /* Strip the rounding part.  They will all be implemented
10496          by the fcvt* family of instructions anyway.  */
10497       if (GET_CODE (x) == UNSPEC)
10498         {
10499           unsigned int uns_code = XINT (x, 1);
10500 
10501           if (uns_code == UNSPEC_FRINTA
10502               || uns_code == UNSPEC_FRINTM
10503               || uns_code == UNSPEC_FRINTN
10504               || uns_code == UNSPEC_FRINTP
10505               || uns_code == UNSPEC_FRINTZ)
10506             x = XVECEXP (x, 0, 0);
10507         }
10508 
10509       if (speed)
10510 	{
10511 	  if (VECTOR_MODE_P (mode))
10512 	    *cost += extra_cost->vect.alu;
10513 	  else
10514 	    *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
10515 	}
10516 
10517       /* We can combine fmul by a power of 2 followed by a fcvt into a single
10518 	 fixed-point fcvt.  */
10519       if (GET_CODE (x) == MULT
10520 	  && ((VECTOR_MODE_P (mode)
10521 	       && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
10522 	      || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
10523 	{
10524 	  *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
10525 			     0, speed);
10526 	  return true;
10527 	}
10528 
10529       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
10530       return true;
10531 
10532     case ABS:
10533       if (VECTOR_MODE_P (mode))
10534 	{
10535 	  /* ABS (vector).  */
10536 	  if (speed)
10537 	    *cost += extra_cost->vect.alu;
10538 	}
10539       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
10540 	{
10541 	  op0 = XEXP (x, 0);
10542 
10543 	  /* FABD, which is analogous to FADD.  */
10544 	  if (GET_CODE (op0) == MINUS)
10545 	    {
10546 	      *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
10547 	      *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
10548 	      if (speed)
10549 		*cost += extra_cost->fp[mode == DFmode].addsub;
10550 
10551 	      return true;
10552 	    }
10553 	  /* Simple FABS is analogous to FNEG.  */
10554 	  if (speed)
10555 	    *cost += extra_cost->fp[mode == DFmode].neg;
10556 	}
10557       else
10558 	{
10559 	  /* Integer ABS will either be split to
10560 	     two arithmetic instructions, or will be an ABS
10561 	     (scalar), which we don't model.  */
10562 	  *cost = COSTS_N_INSNS (2);
10563 	  if (speed)
10564 	    *cost += 2 * extra_cost->alu.arith;
10565 	}
10566       return false;
10567 
10568     case SMAX:
10569     case SMIN:
10570       if (speed)
10571 	{
10572 	  if (VECTOR_MODE_P (mode))
10573 	    *cost += extra_cost->vect.alu;
10574 	  else
10575 	    {
10576 	      /* FMAXNM/FMINNM/FMAX/FMIN.
10577 	         TODO: This may not be accurate for all implementations, but
10578 	         we do not model this in the cost tables.  */
10579 	      *cost += extra_cost->fp[mode == DFmode].addsub;
10580 	    }
10581 	}
10582       return false;
10583 
10584     case UNSPEC:
10585       /* The floating point round to integer frint* instructions.  */
10586       if (aarch64_frint_unspec_p (XINT (x, 1)))
10587         {
10588           if (speed)
10589             *cost += extra_cost->fp[mode == DFmode].roundint;
10590 
10591           return false;
10592         }
10593 
10594       if (XINT (x, 1) == UNSPEC_RBIT)
10595         {
10596           if (speed)
10597             *cost += extra_cost->alu.rev;
10598 
10599           return false;
10600         }
10601       break;
10602 
10603     case TRUNCATE:
10604 
10605       /* Decompose <su>muldi3_highpart.  */
10606       if (/* (truncate:DI  */
10607 	  mode == DImode
10608 	  /*   (lshiftrt:TI  */
10609           && GET_MODE (XEXP (x, 0)) == TImode
10610           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
10611 	  /*      (mult:TI  */
10612           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
10613 	  /*        (ANY_EXTEND:TI (reg:DI))
10614 	            (ANY_EXTEND:TI (reg:DI)))  */
10615           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
10616                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
10617               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
10618                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
10619           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
10620           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
10621 	  /*     (const_int 64)  */
10622           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10623           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
10624         {
10625           /* UMULH/SMULH.  */
10626 	  if (speed)
10627 	    *cost += extra_cost->mult[mode == DImode].extend;
10628 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
10629 			     mode, MULT, 0, speed);
10630 	  *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
10631 			     mode, MULT, 1, speed);
10632           return true;
10633         }
10634 
10635       /* Fall through.  */
10636     default:
10637       break;
10638     }
10639 
10640   if (dump_file
10641       && flag_aarch64_verbose_cost)
10642     fprintf (dump_file,
10643       "\nFailed to cost RTX.  Assuming default cost.\n");
10644 
10645   return true;
10646 }
10647 
10648 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
10649    calculated for X.  This cost is stored in *COST.  Returns true
10650    if the total cost of X was calculated.  */
10651 static bool
10652 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
10653 		   int param, int *cost, bool speed)
10654 {
10655   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
10656 
10657   if (dump_file
10658       && flag_aarch64_verbose_cost)
10659     {
10660       print_rtl_single (dump_file, x);
10661       fprintf (dump_file, "\n%s cost: %d (%s)\n",
10662 	       speed ? "Hot" : "Cold",
10663 	       *cost, result ? "final" : "partial");
10664     }
10665 
10666   return result;
10667 }
10668 
10669 static int
10670 aarch64_register_move_cost (machine_mode mode,
10671 			    reg_class_t from_i, reg_class_t to_i)
10672 {
10673   enum reg_class from = (enum reg_class) from_i;
10674   enum reg_class to = (enum reg_class) to_i;
10675   const struct cpu_regmove_cost *regmove_cost
10676     = aarch64_tune_params.regmove_cost;
10677 
10678   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
10679   if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
10680     to = GENERAL_REGS;
10681 
10682   if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
10683     from = GENERAL_REGS;
10684 
10685   /* Moving between GPR and stack cost is the same as GP2GP.  */
10686   if ((from == GENERAL_REGS && to == STACK_REG)
10687       || (to == GENERAL_REGS && from == STACK_REG))
10688     return regmove_cost->GP2GP;
10689 
10690   /* To/From the stack register, we move via the gprs.  */
10691   if (to == STACK_REG || from == STACK_REG)
10692     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
10693             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
10694 
10695   if (known_eq (GET_MODE_SIZE (mode), 16))
10696     {
10697       /* 128-bit operations on general registers require 2 instructions.  */
10698       if (from == GENERAL_REGS && to == GENERAL_REGS)
10699 	return regmove_cost->GP2GP * 2;
10700       else if (from == GENERAL_REGS)
10701 	return regmove_cost->GP2FP * 2;
10702       else if (to == GENERAL_REGS)
10703 	return regmove_cost->FP2GP * 2;
10704 
10705       /* When AdvSIMD instructions are disabled it is not possible to move
10706 	 a 128-bit value directly between Q registers.  This is handled in
10707 	 secondary reload.  A general register is used as a scratch to move
10708 	 the upper DI value and the lower DI value is moved directly,
10709 	 hence the cost is the sum of three moves. */
10710       if (! TARGET_SIMD)
10711 	return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
10712 
10713       return regmove_cost->FP2FP;
10714     }
10715 
10716   if (from == GENERAL_REGS && to == GENERAL_REGS)
10717     return regmove_cost->GP2GP;
10718   else if (from == GENERAL_REGS)
10719     return regmove_cost->GP2FP;
10720   else if (to == GENERAL_REGS)
10721     return regmove_cost->FP2GP;
10722 
10723   return regmove_cost->FP2FP;
10724 }
10725 
10726 static int
10727 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
10728 			  reg_class_t rclass ATTRIBUTE_UNUSED,
10729 			  bool in ATTRIBUTE_UNUSED)
10730 {
10731   return aarch64_tune_params.memmov_cost;
10732 }
10733 
10734 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
10735    to optimize 1.0/sqrt.  */
10736 
10737 static bool
10738 use_rsqrt_p (machine_mode mode)
10739 {
10740   return (!flag_trapping_math
10741 	  && flag_unsafe_math_optimizations
10742 	  && ((aarch64_tune_params.approx_modes->recip_sqrt
10743 	       & AARCH64_APPROX_MODE (mode))
10744 	      || flag_mrecip_low_precision_sqrt));
10745 }
10746 
10747 /* Function to decide when to use the approximate reciprocal square root
10748    builtin.  */
10749 
10750 static tree
10751 aarch64_builtin_reciprocal (tree fndecl)
10752 {
10753   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
10754 
10755   if (!use_rsqrt_p (mode))
10756     return NULL_TREE;
10757   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
10758 }
10759 
10760 /* Emit instruction sequence to compute either the approximate square root
10761    or its approximate reciprocal, depending on the flag RECP, and return
10762    whether the sequence was emitted or not.  */
10763 
10764 bool
10765 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
10766 {
10767   machine_mode mode = GET_MODE (dst);
10768 
10769   if (GET_MODE_INNER (mode) == HFmode)
10770     {
10771       gcc_assert (!recp);
10772       return false;
10773     }
10774 
10775   if (!recp)
10776     {
10777       if (!(flag_mlow_precision_sqrt
10778 	    || (aarch64_tune_params.approx_modes->sqrt
10779 		& AARCH64_APPROX_MODE (mode))))
10780 	return false;
10781 
10782       if (flag_finite_math_only
10783 	  || flag_trapping_math
10784 	  || !flag_unsafe_math_optimizations
10785 	  || optimize_function_for_size_p (cfun))
10786 	return false;
10787     }
10788   else
10789     /* Caller assumes we cannot fail.  */
10790     gcc_assert (use_rsqrt_p (mode));
10791 
10792   machine_mode mmsk = mode_for_int_vector (mode).require ();
10793   rtx xmsk = gen_reg_rtx (mmsk);
10794   if (!recp)
10795     /* When calculating the approximate square root, compare the
10796        argument with 0.0 and create a mask.  */
10797     emit_insn (gen_rtx_SET (xmsk,
10798 			    gen_rtx_NEG (mmsk,
10799 					 gen_rtx_EQ (mmsk, src,
10800 						     CONST0_RTX (mode)))));
10801 
10802   /* Estimate the approximate reciprocal square root.  */
10803   rtx xdst = gen_reg_rtx (mode);
10804   emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
10805 
10806   /* Iterate over the series twice for SF and thrice for DF.  */
10807   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10808 
10809   /* Optionally iterate over the series once less for faster performance
10810      while sacrificing the accuracy.  */
10811   if ((recp && flag_mrecip_low_precision_sqrt)
10812       || (!recp && flag_mlow_precision_sqrt))
10813     iterations--;
10814 
10815   /* Iterate over the series to calculate the approximate reciprocal square
10816      root.  */
10817   rtx x1 = gen_reg_rtx (mode);
10818   while (iterations--)
10819     {
10820       rtx x2 = gen_reg_rtx (mode);
10821       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
10822 
10823       emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
10824 
10825       if (iterations > 0)
10826 	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
10827     }
10828 
10829   if (!recp)
10830     {
10831       /* Qualify the approximate reciprocal square root when the argument is
10832 	 0.0 by squashing the intermediary result to 0.0.  */
10833       rtx xtmp = gen_reg_rtx (mmsk);
10834       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
10835 					      gen_rtx_SUBREG (mmsk, xdst, 0)));
10836       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
10837 
10838       /* Calculate the approximate square root.  */
10839       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
10840     }
10841 
10842   /* Finalize the approximation.  */
10843   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
10844 
10845   return true;
10846 }
10847 
10848 /* Emit the instruction sequence to compute the approximation for the division
10849    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
10850 
10851 bool
10852 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10853 {
10854   machine_mode mode = GET_MODE (quo);
10855 
10856   if (GET_MODE_INNER (mode) == HFmode)
10857     return false;
10858 
10859   bool use_approx_division_p = (flag_mlow_precision_div
10860 			        || (aarch64_tune_params.approx_modes->division
10861 				    & AARCH64_APPROX_MODE (mode)));
10862 
10863   if (!flag_finite_math_only
10864       || flag_trapping_math
10865       || !flag_unsafe_math_optimizations
10866       || optimize_function_for_size_p (cfun)
10867       || !use_approx_division_p)
10868     return false;
10869 
10870   if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10871     return false;
10872 
10873   /* Estimate the approximate reciprocal.  */
10874   rtx xrcp = gen_reg_rtx (mode);
10875   emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
10876 
10877   /* Iterate over the series twice for SF and thrice for DF.  */
10878   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10879 
10880   /* Optionally iterate over the series once less for faster performance,
10881      while sacrificing the accuracy.  */
10882   if (flag_mlow_precision_div)
10883     iterations--;
10884 
10885   /* Iterate over the series to calculate the approximate reciprocal.  */
10886   rtx xtmp = gen_reg_rtx (mode);
10887   while (iterations--)
10888     {
10889       emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
10890 
10891       if (iterations > 0)
10892 	emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10893     }
10894 
10895   if (num != CONST1_RTX (mode))
10896     {
10897       /* As the approximate reciprocal of DEN is already calculated, only
10898 	 calculate the approximate division when NUM is not 1.0.  */
10899       rtx xnum = force_reg (mode, num);
10900       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10901     }
10902 
10903   /* Finalize the approximation.  */
10904   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10905   return true;
10906 }
10907 
10908 /* Return the number of instructions that can be issued per cycle.  */
10909 static int
10910 aarch64_sched_issue_rate (void)
10911 {
10912   return aarch64_tune_params.issue_rate;
10913 }
10914 
10915 static int
10916 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10917 {
10918   int issue_rate = aarch64_sched_issue_rate ();
10919 
10920   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10921 }
10922 
10923 
10924 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10925    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
10926    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
10927 
10928 static int
10929 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10930 						    int ready_index)
10931 {
10932   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10933 }
10934 
10935 
10936 /* Vectorizer cost model target hooks.  */
10937 
10938 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
10939 static int
10940 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10941 				    tree vectype,
10942 				    int misalign ATTRIBUTE_UNUSED)
10943 {
10944   unsigned elements;
10945   const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10946   bool fp = false;
10947 
10948   if (vectype != NULL)
10949     fp = FLOAT_TYPE_P (vectype);
10950 
10951   switch (type_of_cost)
10952     {
10953       case scalar_stmt:
10954 	return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10955 
10956       case scalar_load:
10957 	return costs->scalar_load_cost;
10958 
10959       case scalar_store:
10960 	return costs->scalar_store_cost;
10961 
10962       case vector_stmt:
10963 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10964 
10965       case vector_load:
10966 	return costs->vec_align_load_cost;
10967 
10968       case vector_store:
10969 	return costs->vec_store_cost;
10970 
10971       case vec_to_scalar:
10972 	return costs->vec_to_scalar_cost;
10973 
10974       case scalar_to_vec:
10975 	return costs->scalar_to_vec_cost;
10976 
10977       case unaligned_load:
10978       case vector_gather_load:
10979 	return costs->vec_unalign_load_cost;
10980 
10981       case unaligned_store:
10982       case vector_scatter_store:
10983 	return costs->vec_unalign_store_cost;
10984 
10985       case cond_branch_taken:
10986 	return costs->cond_taken_branch_cost;
10987 
10988       case cond_branch_not_taken:
10989 	return costs->cond_not_taken_branch_cost;
10990 
10991       case vec_perm:
10992 	return costs->vec_permute_cost;
10993 
10994       case vec_promote_demote:
10995 	return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10996 
10997       case vec_construct:
10998 	elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10999 	return elements / 2 + 1;
11000 
11001       default:
11002 	gcc_unreachable ();
11003     }
11004 }
11005 
11006 /* Implement targetm.vectorize.add_stmt_cost.  */
11007 static unsigned
11008 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
11009 		       struct _stmt_vec_info *stmt_info, int misalign,
11010 		       enum vect_cost_model_location where)
11011 {
11012   unsigned *cost = (unsigned *) data;
11013   unsigned retval = 0;
11014 
11015   if (flag_vect_cost_model)
11016     {
11017       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
11018       int stmt_cost =
11019 	    aarch64_builtin_vectorization_cost (kind, vectype, misalign);
11020 
11021       /* Statements in an inner loop relative to the loop being
11022 	 vectorized are weighted more heavily.  The value here is
11023 	 arbitrary and could potentially be improved with analysis.  */
11024       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
11025 	count *= 50; /*  FIXME  */
11026 
11027       retval = (unsigned) (count * stmt_cost);
11028       cost[where] += retval;
11029     }
11030 
11031   return retval;
11032 }
11033 
11034 static void initialize_aarch64_code_model (struct gcc_options *);
11035 
11036 /* Parse the TO_PARSE string and put the architecture struct that it
11037    selects into RES and the architectural features into ISA_FLAGS.
11038    Return an aarch64_parse_opt_result describing the parse result.
11039    If there is an error parsing, RES and ISA_FLAGS are left unchanged.
11040    When the TO_PARSE string contains an invalid extension,
11041    a copy of the string is created and stored to INVALID_EXTENSION.  */
11042 
11043 static enum aarch64_parse_opt_result
11044 aarch64_parse_arch (const char *to_parse, const struct processor **res,
11045 		    unsigned long *isa_flags, std::string *invalid_extension)
11046 {
11047   const char *ext;
11048   const struct processor *arch;
11049   size_t len;
11050 
11051   ext = strchr (to_parse, '+');
11052 
11053   if (ext != NULL)
11054     len = ext - to_parse;
11055   else
11056     len = strlen (to_parse);
11057 
11058   if (len == 0)
11059     return AARCH64_PARSE_MISSING_ARG;
11060 
11061 
11062   /* Loop through the list of supported ARCHes to find a match.  */
11063   for (arch = all_architectures; arch->name != NULL; arch++)
11064     {
11065       if (strlen (arch->name) == len
11066 	  && strncmp (arch->name, to_parse, len) == 0)
11067 	{
11068 	  unsigned long isa_temp = arch->flags;
11069 
11070 	  if (ext != NULL)
11071 	    {
11072 	      /* TO_PARSE string contains at least one extension.  */
11073 	      enum aarch64_parse_opt_result ext_res
11074 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11075 
11076 	      if (ext_res != AARCH64_PARSE_OK)
11077 		return ext_res;
11078 	    }
11079 	  /* Extension parsing was successful.  Confirm the result
11080 	     arch and ISA flags.  */
11081 	  *res = arch;
11082 	  *isa_flags = isa_temp;
11083 	  return AARCH64_PARSE_OK;
11084 	}
11085     }
11086 
11087   /* ARCH name not found in list.  */
11088   return AARCH64_PARSE_INVALID_ARG;
11089 }
11090 
11091 /* Parse the TO_PARSE string and put the result tuning in RES and the
11092    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
11093    describing the parse result.  If there is an error parsing, RES and
11094    ISA_FLAGS are left unchanged.
11095    When the TO_PARSE string contains an invalid extension,
11096    a copy of the string is created and stored to INVALID_EXTENSION.  */
11097 
11098 static enum aarch64_parse_opt_result
11099 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
11100 		   unsigned long *isa_flags, std::string *invalid_extension)
11101 {
11102   const char *ext;
11103   const struct processor *cpu;
11104   size_t len;
11105 
11106   ext = strchr (to_parse, '+');
11107 
11108   if (ext != NULL)
11109     len = ext - to_parse;
11110   else
11111     len = strlen (to_parse);
11112 
11113   if (len == 0)
11114     return AARCH64_PARSE_MISSING_ARG;
11115 
11116 
11117   /* Loop through the list of supported CPUs to find a match.  */
11118   for (cpu = all_cores; cpu->name != NULL; cpu++)
11119     {
11120       if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
11121 	{
11122 	  unsigned long isa_temp = cpu->flags;
11123 
11124 
11125 	  if (ext != NULL)
11126 	    {
11127 	      /* TO_PARSE string contains at least one extension.  */
11128 	      enum aarch64_parse_opt_result ext_res
11129 		= aarch64_parse_extension (ext, &isa_temp, invalid_extension);
11130 
11131 	      if (ext_res != AARCH64_PARSE_OK)
11132 		return ext_res;
11133 	    }
11134 	  /* Extension parsing was successfull.  Confirm the result
11135 	     cpu and ISA flags.  */
11136 	  *res = cpu;
11137 	  *isa_flags = isa_temp;
11138 	  return AARCH64_PARSE_OK;
11139 	}
11140     }
11141 
11142   /* CPU name not found in list.  */
11143   return AARCH64_PARSE_INVALID_ARG;
11144 }
11145 
11146 /* Parse the TO_PARSE string and put the cpu it selects into RES.
11147    Return an aarch64_parse_opt_result describing the parse result.
11148    If the parsing fails the RES does not change.  */
11149 
11150 static enum aarch64_parse_opt_result
11151 aarch64_parse_tune (const char *to_parse, const struct processor **res)
11152 {
11153   const struct processor *cpu;
11154 
11155   /* Loop through the list of supported CPUs to find a match.  */
11156   for (cpu = all_cores; cpu->name != NULL; cpu++)
11157     {
11158       if (strcmp (cpu->name, to_parse) == 0)
11159 	{
11160 	  *res = cpu;
11161 	  return AARCH64_PARSE_OK;
11162 	}
11163     }
11164 
11165   /* CPU name not found in list.  */
11166   return AARCH64_PARSE_INVALID_ARG;
11167 }
11168 
11169 /* Parse TOKEN, which has length LENGTH to see if it is an option
11170    described in FLAG.  If it is, return the index bit for that fusion type.
11171    If not, error (printing OPTION_NAME) and return zero.  */
11172 
11173 static unsigned int
11174 aarch64_parse_one_option_token (const char *token,
11175 				size_t length,
11176 				const struct aarch64_flag_desc *flag,
11177 				const char *option_name)
11178 {
11179   for (; flag->name != NULL; flag++)
11180     {
11181       if (length == strlen (flag->name)
11182 	  && !strncmp (flag->name, token, length))
11183 	return flag->flag;
11184     }
11185 
11186   error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
11187   return 0;
11188 }
11189 
11190 /* Parse OPTION which is a comma-separated list of flags to enable.
11191    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
11192    default state we inherit from the CPU tuning structures.  OPTION_NAME
11193    gives the top-level option we are parsing in the -moverride string,
11194    for use in error messages.  */
11195 
11196 static unsigned int
11197 aarch64_parse_boolean_options (const char *option,
11198 			       const struct aarch64_flag_desc *flags,
11199 			       unsigned int initial_state,
11200 			       const char *option_name)
11201 {
11202   const char separator = '.';
11203   const char* specs = option;
11204   const char* ntoken = option;
11205   unsigned int found_flags = initial_state;
11206 
11207   while ((ntoken = strchr (specs, separator)))
11208     {
11209       size_t token_length = ntoken - specs;
11210       unsigned token_ops = aarch64_parse_one_option_token (specs,
11211 							   token_length,
11212 							   flags,
11213 							   option_name);
11214       /* If we find "none" (or, for simplicity's sake, an error) anywhere
11215 	 in the token stream, reset the supported operations.  So:
11216 
11217 	   adrp+add.cmp+branch.none.adrp+add
11218 
11219 	   would have the result of turning on only adrp+add fusion.  */
11220       if (!token_ops)
11221 	found_flags = 0;
11222 
11223       found_flags |= token_ops;
11224       specs = ++ntoken;
11225     }
11226 
11227   /* We ended with a comma, print something.  */
11228   if (!(*specs))
11229     {
11230       error ("%s string ill-formed\n", option_name);
11231       return 0;
11232     }
11233 
11234   /* We still have one more token to parse.  */
11235   size_t token_length = strlen (specs);
11236   unsigned token_ops = aarch64_parse_one_option_token (specs,
11237 						       token_length,
11238 						       flags,
11239 						       option_name);
11240    if (!token_ops)
11241      found_flags = 0;
11242 
11243   found_flags |= token_ops;
11244   return found_flags;
11245 }
11246 
11247 /* Support for overriding instruction fusion.  */
11248 
11249 static void
11250 aarch64_parse_fuse_string (const char *fuse_string,
11251 			    struct tune_params *tune)
11252 {
11253   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
11254 						     aarch64_fusible_pairs,
11255 						     tune->fusible_ops,
11256 						     "fuse=");
11257 }
11258 
11259 /* Support for overriding other tuning flags.  */
11260 
11261 static void
11262 aarch64_parse_tune_string (const char *tune_string,
11263 			    struct tune_params *tune)
11264 {
11265   tune->extra_tuning_flags
11266     = aarch64_parse_boolean_options (tune_string,
11267 				     aarch64_tuning_flags,
11268 				     tune->extra_tuning_flags,
11269 				     "tune=");
11270 }
11271 
11272 /* Parse the sve_width tuning moverride string in TUNE_STRING.
11273    Accept the valid SVE vector widths allowed by
11274    aarch64_sve_vector_bits_enum and use it to override sve_width
11275    in TUNE.  */
11276 
11277 static void
11278 aarch64_parse_sve_width_string (const char *tune_string,
11279 				struct tune_params *tune)
11280 {
11281   int width = -1;
11282 
11283   int n = sscanf (tune_string, "%d", &width);
11284   if (n == EOF)
11285     {
11286       error ("invalid format for sve_width");
11287       return;
11288     }
11289   switch (width)
11290     {
11291     case SVE_128:
11292     case SVE_256:
11293     case SVE_512:
11294     case SVE_1024:
11295     case SVE_2048:
11296       break;
11297     default:
11298       error ("invalid sve_width value: %d", width);
11299     }
11300   tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
11301 }
11302 
11303 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
11304    we understand.  If it is, extract the option string and handoff to
11305    the appropriate function.  */
11306 
11307 void
11308 aarch64_parse_one_override_token (const char* token,
11309 				  size_t length,
11310 				  struct tune_params *tune)
11311 {
11312   const struct aarch64_tuning_override_function *fn
11313     = aarch64_tuning_override_functions;
11314 
11315   const char *option_part = strchr (token, '=');
11316   if (!option_part)
11317     {
11318       error ("tuning string missing in option (%s)", token);
11319       return;
11320     }
11321 
11322   /* Get the length of the option name.  */
11323   length = option_part - token;
11324   /* Skip the '=' to get to the option string.  */
11325   option_part++;
11326 
11327   for (; fn->name != NULL; fn++)
11328     {
11329       if (!strncmp (fn->name, token, length))
11330 	{
11331 	  fn->parse_override (option_part, tune);
11332 	  return;
11333 	}
11334     }
11335 
11336   error ("unknown tuning option (%s)",token);
11337   return;
11338 }
11339 
11340 /* A checking mechanism for the implementation of the tls size.  */
11341 
11342 static void
11343 initialize_aarch64_tls_size (struct gcc_options *opts)
11344 {
11345   if (aarch64_tls_size == 0)
11346     aarch64_tls_size = 24;
11347 
11348   switch (opts->x_aarch64_cmodel_var)
11349     {
11350     case AARCH64_CMODEL_TINY:
11351       /* Both the default and maximum TLS size allowed under tiny is 1M which
11352 	 needs two instructions to address, so we clamp the size to 24.  */
11353       if (aarch64_tls_size > 24)
11354 	aarch64_tls_size = 24;
11355       break;
11356     case AARCH64_CMODEL_SMALL:
11357       /* The maximum TLS size allowed under small is 4G.  */
11358       if (aarch64_tls_size > 32)
11359 	aarch64_tls_size = 32;
11360       break;
11361     case AARCH64_CMODEL_LARGE:
11362       /* The maximum TLS size allowed under large is 16E.
11363 	 FIXME: 16E should be 64bit, we only support 48bit offset now.  */
11364       if (aarch64_tls_size > 48)
11365 	aarch64_tls_size = 48;
11366       break;
11367     default:
11368       gcc_unreachable ();
11369     }
11370 
11371   return;
11372 }
11373 
11374 /* Parse STRING looking for options in the format:
11375      string	:: option:string
11376      option	:: name=substring
11377      name	:: {a-z}
11378      substring	:: defined by option.  */
11379 
11380 static void
11381 aarch64_parse_override_string (const char* input_string,
11382 			       struct tune_params* tune)
11383 {
11384   const char separator = ':';
11385   size_t string_length = strlen (input_string) + 1;
11386   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
11387   char *string = string_root;
11388   strncpy (string, input_string, string_length);
11389   string[string_length - 1] = '\0';
11390 
11391   char* ntoken = string;
11392 
11393   while ((ntoken = strchr (string, separator)))
11394     {
11395       size_t token_length = ntoken - string;
11396       /* Make this substring look like a string.  */
11397       *ntoken = '\0';
11398       aarch64_parse_one_override_token (string, token_length, tune);
11399       string = ++ntoken;
11400     }
11401 
11402   /* One last option to parse.  */
11403   aarch64_parse_one_override_token (string, strlen (string), tune);
11404   free (string_root);
11405 }
11406 
11407 
11408 static void
11409 aarch64_override_options_after_change_1 (struct gcc_options *opts)
11410 {
11411   if (accepted_branch_protection_string)
11412     {
11413       opts->x_aarch64_branch_protection_string
11414 	= xstrdup (accepted_branch_protection_string);
11415     }
11416 
11417   /* PR 70044: We have to be careful about being called multiple times for the
11418      same function.  This means all changes should be repeatable.  */
11419 
11420   /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
11421      Disable the frame pointer flag so the mid-end will not use a frame
11422      pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
11423      Set x_flag_omit_frame_pointer to the special value 2 to differentiate
11424      between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2).  */
11425   aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
11426   if (opts->x_flag_omit_frame_pointer == 0)
11427     opts->x_flag_omit_frame_pointer = 2;
11428 
11429   /* If not optimizing for size, set the default
11430      alignment to what the target wants.  */
11431   if (!opts->x_optimize_size)
11432     {
11433       if (opts->x_flag_align_loops && !opts->x_str_align_loops)
11434 	opts->x_str_align_loops = aarch64_tune_params.loop_align;
11435       if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
11436 	opts->x_str_align_jumps = aarch64_tune_params.jump_align;
11437       if (opts->x_flag_align_functions && !opts->x_str_align_functions)
11438 	opts->x_str_align_functions = aarch64_tune_params.function_align;
11439     }
11440 
11441   /* We default to no pc-relative literal loads.  */
11442 
11443   aarch64_pcrelative_literal_loads = false;
11444 
11445   /* If -mpc-relative-literal-loads is set on the command line, this
11446      implies that the user asked for PC relative literal loads.  */
11447   if (opts->x_pcrelative_literal_loads == 1)
11448     aarch64_pcrelative_literal_loads = true;
11449 
11450   /* In the tiny memory model it makes no sense to disallow PC relative
11451      literal pool loads.  */
11452   if (aarch64_cmodel == AARCH64_CMODEL_TINY
11453       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
11454     aarch64_pcrelative_literal_loads = true;
11455 
11456   /* When enabling the lower precision Newton series for the square root, also
11457      enable it for the reciprocal square root, since the latter is an
11458      intermediary step for the former.  */
11459   if (flag_mlow_precision_sqrt)
11460     flag_mrecip_low_precision_sqrt = true;
11461 }
11462 
11463 /* 'Unpack' up the internal tuning structs and update the options
11464     in OPTS.  The caller must have set up selected_tune and selected_arch
11465     as all the other target-specific codegen decisions are
11466     derived from them.  */
11467 
11468 void
11469 aarch64_override_options_internal (struct gcc_options *opts)
11470 {
11471   aarch64_tune_flags = selected_tune->flags;
11472   aarch64_tune = selected_tune->sched_core;
11473   /* Make a copy of the tuning parameters attached to the core, which
11474      we may later overwrite.  */
11475   aarch64_tune_params = *(selected_tune->tune);
11476   aarch64_architecture_version = selected_arch->architecture_version;
11477 
11478   if (opts->x_aarch64_override_tune_string)
11479     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
11480 				  &aarch64_tune_params);
11481 
11482   /* This target defaults to strict volatile bitfields.  */
11483   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
11484     opts->x_flag_strict_volatile_bitfields = 1;
11485 
11486   if (aarch64_stack_protector_guard == SSP_GLOBAL
11487       && opts->x_aarch64_stack_protector_guard_offset_str)
11488     {
11489       error ("incompatible options %<-mstack-protector-guard=global%> and "
11490 	     "%<-mstack-protector-guard-offset=%s%>",
11491 	     aarch64_stack_protector_guard_offset_str);
11492     }
11493 
11494   if (aarch64_stack_protector_guard == SSP_SYSREG
11495       && !(opts->x_aarch64_stack_protector_guard_offset_str
11496 	   && opts->x_aarch64_stack_protector_guard_reg_str))
11497     {
11498       error ("both %<-mstack-protector-guard-offset%> and "
11499 	     "%<-mstack-protector-guard-reg%> must be used "
11500 	     "with %<-mstack-protector-guard=sysreg%>");
11501     }
11502 
11503   if (opts->x_aarch64_stack_protector_guard_reg_str)
11504     {
11505       if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
11506 	  error ("specify a system register with a small string length.");
11507     }
11508 
11509   if (opts->x_aarch64_stack_protector_guard_offset_str)
11510     {
11511       char *end;
11512       const char *str = aarch64_stack_protector_guard_offset_str;
11513       errno = 0;
11514       long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
11515       if (!*str || *end || errno)
11516 	error ("%qs is not a valid offset in %qs", str,
11517 	       "-mstack-protector-guard-offset=");
11518       aarch64_stack_protector_guard_offset = offs;
11519     }
11520 
11521   initialize_aarch64_code_model (opts);
11522   initialize_aarch64_tls_size (opts);
11523 
11524   int queue_depth = 0;
11525   switch (aarch64_tune_params.autoprefetcher_model)
11526     {
11527       case tune_params::AUTOPREFETCHER_OFF:
11528 	queue_depth = -1;
11529 	break;
11530       case tune_params::AUTOPREFETCHER_WEAK:
11531 	queue_depth = 0;
11532 	break;
11533       case tune_params::AUTOPREFETCHER_STRONG:
11534 	queue_depth = max_insn_queue_index + 1;
11535 	break;
11536       default:
11537 	gcc_unreachable ();
11538     }
11539 
11540   /* We don't mind passing in global_options_set here as we don't use
11541      the *options_set structs anyway.  */
11542   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
11543 			 queue_depth,
11544 			 opts->x_param_values,
11545 			 global_options_set.x_param_values);
11546 
11547   /* Set up parameters to be used in prefetching algorithm.  Do not
11548      override the defaults unless we are tuning for a core we have
11549      researched values for.  */
11550   if (aarch64_tune_params.prefetch->num_slots > 0)
11551     maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
11552 			   aarch64_tune_params.prefetch->num_slots,
11553 			   opts->x_param_values,
11554 			   global_options_set.x_param_values);
11555   if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
11556     maybe_set_param_value (PARAM_L1_CACHE_SIZE,
11557 			   aarch64_tune_params.prefetch->l1_cache_size,
11558 			   opts->x_param_values,
11559 			   global_options_set.x_param_values);
11560   if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
11561     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
11562 			   aarch64_tune_params.prefetch->l1_cache_line_size,
11563 			   opts->x_param_values,
11564 			   global_options_set.x_param_values);
11565   if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
11566     maybe_set_param_value (PARAM_L2_CACHE_SIZE,
11567 			   aarch64_tune_params.prefetch->l2_cache_size,
11568 			   opts->x_param_values,
11569 			   global_options_set.x_param_values);
11570   if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
11571     maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
11572 			   0,
11573 			   opts->x_param_values,
11574 			   global_options_set.x_param_values);
11575   if (aarch64_tune_params.prefetch->minimum_stride >= 0)
11576     maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
11577 			   aarch64_tune_params.prefetch->minimum_stride,
11578 			   opts->x_param_values,
11579 			   global_options_set.x_param_values);
11580 
11581   /* Use the alternative scheduling-pressure algorithm by default.  */
11582   maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
11583 			 opts->x_param_values,
11584 			 global_options_set.x_param_values);
11585 
11586   /* If the user hasn't changed it via configure then set the default to 64 KB
11587      for the backend.  */
11588   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE,
11589 			 DEFAULT_STK_CLASH_GUARD_SIZE == 0
11590 			   ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE,
11591 			 opts->x_param_values,
11592 			 global_options_set.x_param_values);
11593 
11594   /* Validate the guard size.  */
11595   int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE);
11596 
11597   /* Enforce that interval is the same size as size so the mid-end does the
11598      right thing.  */
11599   maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL,
11600 			 guard_size,
11601 			 opts->x_param_values,
11602 			 global_options_set.x_param_values);
11603 
11604   /* The maybe_set calls won't update the value if the user has explicitly set
11605      one.  Which means we need to validate that probing interval and guard size
11606      are equal.  */
11607   int probe_interval
11608     = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
11609   if (guard_size != probe_interval)
11610     error ("stack clash guard size %<%d%> must be equal to probing interval "
11611 	   "%<%d%>", guard_size, probe_interval);
11612 
11613   /* Enable sw prefetching at specified optimization level for
11614      CPUS that have prefetch.  Lower optimization level threshold by 1
11615      when profiling is enabled.  */
11616   if (opts->x_flag_prefetch_loop_arrays < 0
11617       && !opts->x_optimize_size
11618       && aarch64_tune_params.prefetch->default_opt_level >= 0
11619       && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
11620     opts->x_flag_prefetch_loop_arrays = 1;
11621 
11622   if (opts->x_aarch64_arch_string == NULL)
11623     opts->x_aarch64_arch_string = selected_arch->name;
11624   if (opts->x_aarch64_cpu_string == NULL)
11625     opts->x_aarch64_cpu_string = selected_cpu->name;
11626   if (opts->x_aarch64_tune_string == NULL)
11627     opts->x_aarch64_tune_string = selected_tune->name;
11628 
11629   aarch64_override_options_after_change_1 (opts);
11630 }
11631 
11632 /* Print a hint with a suggestion for a core or architecture name that
11633    most closely resembles what the user passed in STR.  ARCH is true if
11634    the user is asking for an architecture name.  ARCH is false if the user
11635    is asking for a core name.  */
11636 
11637 static void
11638 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
11639 {
11640   auto_vec<const char *> candidates;
11641   const struct processor *entry = arch ? all_architectures : all_cores;
11642   for (; entry->name != NULL; entry++)
11643     candidates.safe_push (entry->name);
11644 
11645 #ifdef HAVE_LOCAL_CPU_DETECT
11646   /* Add also "native" as possible value.  */
11647   if (arch)
11648     candidates.safe_push ("native");
11649 #endif
11650 
11651   char *s;
11652   const char *hint = candidates_list_and_hint (str, s, candidates);
11653   if (hint)
11654     inform (input_location, "valid arguments are: %s;"
11655 			     " did you mean %qs?", s, hint);
11656   else
11657     inform (input_location, "valid arguments are: %s", s);
11658 
11659   XDELETEVEC (s);
11660 }
11661 
11662 /* Print a hint with a suggestion for a core name that most closely resembles
11663    what the user passed in STR.  */
11664 
11665 inline static void
11666 aarch64_print_hint_for_core (const char *str)
11667 {
11668   aarch64_print_hint_for_core_or_arch (str, false);
11669 }
11670 
11671 /* Print a hint with a suggestion for an architecture name that most closely
11672    resembles what the user passed in STR.  */
11673 
11674 inline static void
11675 aarch64_print_hint_for_arch (const char *str)
11676 {
11677   aarch64_print_hint_for_core_or_arch (str, true);
11678 }
11679 
11680 
11681 /* Print a hint with a suggestion for an extension name
11682    that most closely resembles what the user passed in STR.  */
11683 
11684 void
11685 aarch64_print_hint_for_extensions (const std::string &str)
11686 {
11687   auto_vec<const char *> candidates;
11688   aarch64_get_all_extension_candidates (&candidates);
11689   char *s;
11690   const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
11691   if (hint)
11692     inform (input_location, "valid arguments are: %s;"
11693 			     " did you mean %qs?", s, hint);
11694   else
11695     inform (input_location, "valid arguments are: %s;", s);
11696 
11697   XDELETEVEC (s);
11698 }
11699 
11700 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
11701    specified in STR and throw errors if appropriate.  Put the results if
11702    they are valid in RES and ISA_FLAGS.  Return whether the option is
11703    valid.  */
11704 
11705 static bool
11706 aarch64_validate_mcpu (const char *str, const struct processor **res,
11707 		       unsigned long *isa_flags)
11708 {
11709   std::string invalid_extension;
11710   enum aarch64_parse_opt_result parse_res
11711     = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
11712 
11713   if (parse_res == AARCH64_PARSE_OK)
11714     return true;
11715 
11716   switch (parse_res)
11717     {
11718       case AARCH64_PARSE_MISSING_ARG:
11719 	error ("missing cpu name in %<-mcpu=%s%>", str);
11720 	break;
11721       case AARCH64_PARSE_INVALID_ARG:
11722 	error ("unknown value %qs for %<-mcpu%>", str);
11723 	aarch64_print_hint_for_core (str);
11724 	break;
11725       case AARCH64_PARSE_INVALID_FEATURE:
11726 	error ("invalid feature modifier %qs in %<-mcpu=%s%>",
11727 	       invalid_extension.c_str (), str);
11728 	aarch64_print_hint_for_extensions (invalid_extension);
11729 	break;
11730       default:
11731 	gcc_unreachable ();
11732     }
11733 
11734   return false;
11735 }
11736 
11737 /* Parses CONST_STR for branch protection features specified in
11738    aarch64_branch_protect_types, and set any global variables required.  Returns
11739    the parsing result and assigns LAST_STR to the last processed token from
11740    CONST_STR so that it can be used for error reporting.  */
11741 
11742 static enum
11743 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
11744 							  char** last_str)
11745 {
11746   char *str_root = xstrdup (const_str);
11747   char* token_save = NULL;
11748   char *str = strtok_r (str_root, "+", &token_save);
11749   enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
11750   if (!str)
11751     res = AARCH64_PARSE_MISSING_ARG;
11752   else
11753     {
11754       char *next_str = strtok_r (NULL, "+", &token_save);
11755       /* Reset the branch protection features to their defaults.  */
11756       aarch64_handle_no_branch_protection (NULL, NULL);
11757 
11758       while (str && res == AARCH64_PARSE_OK)
11759 	{
11760 	  const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
11761 	  bool found = false;
11762 	  /* Search for this type.  */
11763 	  while (type && type->name && !found && res == AARCH64_PARSE_OK)
11764 	    {
11765 	      if (strcmp (str, type->name) == 0)
11766 		{
11767 		  found = true;
11768 		  res = type->handler (str, next_str);
11769 		  str = next_str;
11770 		  next_str = strtok_r (NULL, "+", &token_save);
11771 		}
11772 	      else
11773 		type++;
11774 	    }
11775 	  if (found && res == AARCH64_PARSE_OK)
11776 	    {
11777 	      bool found_subtype = true;
11778 	      /* Loop through each token until we find one that isn't a
11779 		 subtype.  */
11780 	      while (found_subtype)
11781 		{
11782 		  found_subtype = false;
11783 		  const aarch64_branch_protect_type *subtype = type->subtypes;
11784 		  /* Search for the subtype.  */
11785 		  while (str && subtype && subtype->name && !found_subtype
11786 			  && res == AARCH64_PARSE_OK)
11787 		    {
11788 		      if (strcmp (str, subtype->name) == 0)
11789 			{
11790 			  found_subtype = true;
11791 			  res = subtype->handler (str, next_str);
11792 			  str = next_str;
11793 			  next_str = strtok_r (NULL, "+", &token_save);
11794 			}
11795 		      else
11796 			subtype++;
11797 		    }
11798 		}
11799 	    }
11800 	  else if (!found)
11801 	    res = AARCH64_PARSE_INVALID_ARG;
11802 	}
11803     }
11804   /* Copy the last processed token into the argument to pass it back.
11805     Used by option and attribute validation to print the offending token.  */
11806   if (last_str)
11807     {
11808       if (str) strcpy (*last_str, str);
11809       else *last_str = NULL;
11810     }
11811   if (res == AARCH64_PARSE_OK)
11812     {
11813       /* If needed, alloc the accepted string then copy in const_str.
11814 	Used by override_option_after_change_1.  */
11815       if (!accepted_branch_protection_string)
11816 	accepted_branch_protection_string = (char *) xmalloc (
11817 						      BRANCH_PROTECT_STR_MAX
11818 							+ 1);
11819       strncpy (accepted_branch_protection_string, const_str,
11820 		BRANCH_PROTECT_STR_MAX + 1);
11821       /* Forcibly null-terminate.  */
11822       accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
11823     }
11824   return res;
11825 }
11826 
11827 static bool
11828 aarch64_validate_mbranch_protection (const char *const_str)
11829 {
11830   char *str = (char *) xmalloc (strlen (const_str));
11831   enum aarch64_parse_opt_result res =
11832     aarch64_parse_branch_protection (const_str, &str);
11833   if (res == AARCH64_PARSE_INVALID_ARG)
11834     error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str);
11835   else if (res == AARCH64_PARSE_MISSING_ARG)
11836     error ("missing arg for %<-mbranch-protection=%>");
11837   free (str);
11838   return res == AARCH64_PARSE_OK;
11839 }
11840 
11841 /* Validate a command-line -march option.  Parse the arch and extensions
11842    (if any) specified in STR and throw errors if appropriate.  Put the
11843    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
11844    option is valid.  */
11845 
11846 static bool
11847 aarch64_validate_march (const char *str, const struct processor **res,
11848 			 unsigned long *isa_flags)
11849 {
11850   std::string invalid_extension;
11851   enum aarch64_parse_opt_result parse_res
11852     = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
11853 
11854   if (parse_res == AARCH64_PARSE_OK)
11855     return true;
11856 
11857   switch (parse_res)
11858     {
11859       case AARCH64_PARSE_MISSING_ARG:
11860 	error ("missing arch name in %<-march=%s%>", str);
11861 	break;
11862       case AARCH64_PARSE_INVALID_ARG:
11863 	error ("unknown value %qs for %<-march%>", str);
11864 	aarch64_print_hint_for_arch (str);
11865 	break;
11866       case AARCH64_PARSE_INVALID_FEATURE:
11867 	error ("invalid feature modifier %qs in %<-march=%s%>",
11868 	       invalid_extension.c_str (), str);
11869 	aarch64_print_hint_for_extensions (invalid_extension);
11870 	break;
11871       default:
11872 	gcc_unreachable ();
11873     }
11874 
11875   return false;
11876 }
11877 
11878 /* Validate a command-line -mtune option.  Parse the cpu
11879    specified in STR and throw errors if appropriate.  Put the
11880    result, if it is valid, in RES.  Return whether the option is
11881    valid.  */
11882 
11883 static bool
11884 aarch64_validate_mtune (const char *str, const struct processor **res)
11885 {
11886   enum aarch64_parse_opt_result parse_res
11887     = aarch64_parse_tune (str, res);
11888 
11889   if (parse_res == AARCH64_PARSE_OK)
11890     return true;
11891 
11892   switch (parse_res)
11893     {
11894       case AARCH64_PARSE_MISSING_ARG:
11895 	error ("missing cpu name in %<-mtune=%s%>", str);
11896 	break;
11897       case AARCH64_PARSE_INVALID_ARG:
11898 	error ("unknown value %qs for %<-mtune%>", str);
11899 	aarch64_print_hint_for_core (str);
11900 	break;
11901       default:
11902 	gcc_unreachable ();
11903     }
11904   return false;
11905 }
11906 
11907 /* Return the CPU corresponding to the enum CPU.
11908    If it doesn't specify a cpu, return the default.  */
11909 
11910 static const struct processor *
11911 aarch64_get_tune_cpu (enum aarch64_processor cpu)
11912 {
11913   if (cpu != aarch64_none)
11914     return &all_cores[cpu];
11915 
11916   /* The & 0x3f is to extract the bottom 6 bits that encode the
11917      default cpu as selected by the --with-cpu GCC configure option
11918      in config.gcc.
11919      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
11920      flags mechanism should be reworked to make it more sane.  */
11921   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11922 }
11923 
11924 /* Return the architecture corresponding to the enum ARCH.
11925    If it doesn't specify a valid architecture, return the default.  */
11926 
11927 static const struct processor *
11928 aarch64_get_arch (enum aarch64_arch arch)
11929 {
11930   if (arch != aarch64_no_arch)
11931     return &all_architectures[arch];
11932 
11933   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
11934 
11935   return &all_architectures[cpu->arch];
11936 }
11937 
11938 /* Return the VG value associated with -msve-vector-bits= value VALUE.  */
11939 
11940 static poly_uint16
11941 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
11942 {
11943   /* For now generate vector-length agnostic code for -msve-vector-bits=128.
11944      This ensures we can clearly distinguish SVE and Advanced SIMD modes when
11945      deciding which .md file patterns to use and when deciding whether
11946      something is a legitimate address or constant.  */
11947   if (value == SVE_SCALABLE || value == SVE_128)
11948     return poly_uint16 (2, 2);
11949   else
11950     return (int) value / 64;
11951 }
11952 
11953 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
11954    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
11955    tuning structs.  In particular it must set selected_tune and
11956    aarch64_isa_flags that define the available ISA features and tuning
11957    decisions.  It must also set selected_arch as this will be used to
11958    output the .arch asm tags for each function.  */
11959 
11960 static void
11961 aarch64_override_options (void)
11962 {
11963   unsigned long cpu_isa = 0;
11964   unsigned long arch_isa = 0;
11965   aarch64_isa_flags = 0;
11966 
11967   bool valid_cpu = true;
11968   bool valid_tune = true;
11969   bool valid_arch = true;
11970 
11971   selected_cpu = NULL;
11972   selected_arch = NULL;
11973   selected_tune = NULL;
11974 
11975   if (aarch64_branch_protection_string)
11976     aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
11977 
11978   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
11979      If either of -march or -mtune is given, they override their
11980      respective component of -mcpu.  */
11981   if (aarch64_cpu_string)
11982     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
11983 					&cpu_isa);
11984 
11985   if (aarch64_arch_string)
11986     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
11987 					  &arch_isa);
11988 
11989   if (aarch64_tune_string)
11990     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
11991 
11992 #ifdef SUBTARGET_OVERRIDE_OPTIONS
11993   SUBTARGET_OVERRIDE_OPTIONS;
11994 #endif
11995 
11996   /* If the user did not specify a processor, choose the default
11997      one for them.  This will be the CPU set during configuration using
11998      --with-cpu, otherwise it is "generic".  */
11999   if (!selected_cpu)
12000     {
12001       if (selected_arch)
12002 	{
12003 	  selected_cpu = &all_cores[selected_arch->ident];
12004 	  aarch64_isa_flags = arch_isa;
12005 	  explicit_arch = selected_arch->arch;
12006 	}
12007       else
12008 	{
12009 	  /* Get default configure-time CPU.  */
12010 	  selected_cpu = aarch64_get_tune_cpu (aarch64_none);
12011 	  aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
12012 	}
12013 
12014       if (selected_tune)
12015 	explicit_tune_core = selected_tune->ident;
12016     }
12017   /* If both -mcpu and -march are specified check that they are architecturally
12018      compatible, warn if they're not and prefer the -march ISA flags.  */
12019   else if (selected_arch)
12020     {
12021       if (selected_arch->arch != selected_cpu->arch)
12022 	{
12023 	  warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
12024 		       all_architectures[selected_cpu->arch].name,
12025 		       selected_arch->name);
12026 	}
12027       aarch64_isa_flags = arch_isa;
12028       explicit_arch = selected_arch->arch;
12029       explicit_tune_core = selected_tune ? selected_tune->ident
12030 					  : selected_cpu->ident;
12031     }
12032   else
12033     {
12034       /* -mcpu but no -march.  */
12035       aarch64_isa_flags = cpu_isa;
12036       explicit_tune_core = selected_tune ? selected_tune->ident
12037 					  : selected_cpu->ident;
12038       gcc_assert (selected_cpu);
12039       selected_arch = &all_architectures[selected_cpu->arch];
12040       explicit_arch = selected_arch->arch;
12041     }
12042 
12043   /* Set the arch as well as we will need it when outputing
12044      the .arch directive in assembly.  */
12045   if (!selected_arch)
12046     {
12047       gcc_assert (selected_cpu);
12048       selected_arch = &all_architectures[selected_cpu->arch];
12049     }
12050 
12051   if (!selected_tune)
12052     selected_tune = selected_cpu;
12053 
12054   if (aarch64_enable_bti == 2)
12055     {
12056 #ifdef TARGET_ENABLE_BTI
12057       aarch64_enable_bti = 1;
12058 #else
12059       aarch64_enable_bti = 0;
12060 #endif
12061     }
12062 
12063   /* Return address signing is currently not supported for ILP32 targets.  For
12064      LP64 targets use the configured option in the absence of a command-line
12065      option for -mbranch-protection.  */
12066   if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
12067     {
12068 #ifdef TARGET_ENABLE_PAC_RET
12069       aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
12070 #else
12071       aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
12072 #endif
12073     }
12074 
12075 #ifndef HAVE_AS_MABI_OPTION
12076   /* The compiler may have been configured with 2.23.* binutils, which does
12077      not have support for ILP32.  */
12078   if (TARGET_ILP32)
12079     error ("assembler does not support %<-mabi=ilp32%>");
12080 #endif
12081 
12082   /* Convert -msve-vector-bits to a VG count.  */
12083   aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
12084 
12085   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
12086     sorry ("return address signing is only supported for %<-mabi=lp64%>");
12087 
12088   /* Make sure we properly set up the explicit options.  */
12089   if ((aarch64_cpu_string && valid_cpu)
12090        || (aarch64_tune_string && valid_tune))
12091     gcc_assert (explicit_tune_core != aarch64_none);
12092 
12093   if ((aarch64_cpu_string && valid_cpu)
12094        || (aarch64_arch_string && valid_arch))
12095     gcc_assert (explicit_arch != aarch64_no_arch);
12096 
12097   /* The pass to insert speculation tracking runs before
12098      shrink-wrapping and the latter does not know how to update the
12099      tracking status.  So disable it in this case.  */
12100   if (aarch64_track_speculation)
12101     flag_shrink_wrap = 0;
12102 
12103   aarch64_override_options_internal (&global_options);
12104 
12105   /* Save these options as the default ones in case we push and pop them later
12106      while processing functions with potential target attributes.  */
12107   target_option_default_node = target_option_current_node
12108       = build_target_option_node (&global_options);
12109 }
12110 
12111 /* Implement targetm.override_options_after_change.  */
12112 
12113 static void
12114 aarch64_override_options_after_change (void)
12115 {
12116   aarch64_override_options_after_change_1 (&global_options);
12117 }
12118 
12119 static struct machine_function *
12120 aarch64_init_machine_status (void)
12121 {
12122   struct machine_function *machine;
12123   machine = ggc_cleared_alloc<machine_function> ();
12124   return machine;
12125 }
12126 
12127 void
12128 aarch64_init_expanders (void)
12129 {
12130   init_machine_status = aarch64_init_machine_status;
12131 }
12132 
12133 /* A checking mechanism for the implementation of the various code models.  */
12134 static void
12135 initialize_aarch64_code_model (struct gcc_options *opts)
12136 {
12137    if (opts->x_flag_pic)
12138      {
12139        switch (opts->x_aarch64_cmodel_var)
12140 	 {
12141 	 case AARCH64_CMODEL_TINY:
12142 	   aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
12143 	   break;
12144 	 case AARCH64_CMODEL_SMALL:
12145 #ifdef HAVE_AS_SMALL_PIC_RELOCS
12146 	   aarch64_cmodel = (flag_pic == 2
12147 			     ? AARCH64_CMODEL_SMALL_PIC
12148 			     : AARCH64_CMODEL_SMALL_SPIC);
12149 #else
12150 	   aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
12151 #endif
12152 	   break;
12153 	 case AARCH64_CMODEL_LARGE:
12154 	   sorry ("code model %qs with %<-f%s%>", "large",
12155 		  opts->x_flag_pic > 1 ? "PIC" : "pic");
12156 	   break;
12157 	 default:
12158 	   gcc_unreachable ();
12159 	 }
12160      }
12161    else
12162      aarch64_cmodel = opts->x_aarch64_cmodel_var;
12163 }
12164 
12165 /* Implement TARGET_OPTION_SAVE.  */
12166 
12167 static void
12168 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
12169 {
12170   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
12171   ptr->x_aarch64_branch_protection_string
12172     = opts->x_aarch64_branch_protection_string;
12173 }
12174 
12175 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
12176    using the information saved in PTR.  */
12177 
12178 static void
12179 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
12180 {
12181   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
12182   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12183   opts->x_explicit_arch = ptr->x_explicit_arch;
12184   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
12185   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
12186   opts->x_aarch64_branch_protection_string
12187     = ptr->x_aarch64_branch_protection_string;
12188   if (opts->x_aarch64_branch_protection_string)
12189     {
12190       aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
12191 					NULL);
12192     }
12193 
12194   aarch64_override_options_internal (opts);
12195 }
12196 
12197 /* Implement TARGET_OPTION_PRINT.  */
12198 
12199 static void
12200 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
12201 {
12202   const struct processor *cpu
12203     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
12204   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
12205   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
12206   std::string extension
12207     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
12208 
12209   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
12210   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
12211 	   arch->name, extension.c_str ());
12212 }
12213 
12214 static GTY(()) tree aarch64_previous_fndecl;
12215 
12216 void
12217 aarch64_reset_previous_fndecl (void)
12218 {
12219   aarch64_previous_fndecl = NULL;
12220 }
12221 
12222 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
12223    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
12224    make sure optab availability predicates are recomputed when necessary.  */
12225 
12226 void
12227 aarch64_save_restore_target_globals (tree new_tree)
12228 {
12229   if (TREE_TARGET_GLOBALS (new_tree))
12230     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
12231   else if (new_tree == target_option_default_node)
12232     restore_target_globals (&default_target_globals);
12233   else
12234     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
12235 }
12236 
12237 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
12238    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
12239    of the function, if such exists.  This function may be called multiple
12240    times on a single function so use aarch64_previous_fndecl to avoid
12241    setting up identical state.  */
12242 
12243 static void
12244 aarch64_set_current_function (tree fndecl)
12245 {
12246   if (!fndecl || fndecl == aarch64_previous_fndecl)
12247     return;
12248 
12249   tree old_tree = (aarch64_previous_fndecl
12250 		   ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
12251 		   : NULL_TREE);
12252 
12253   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12254 
12255   /* If current function has no attributes but the previous one did,
12256      use the default node.  */
12257   if (!new_tree && old_tree)
12258     new_tree = target_option_default_node;
12259 
12260   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
12261      the default have been handled by aarch64_save_restore_target_globals from
12262      aarch64_pragma_target_parse.  */
12263   if (old_tree == new_tree)
12264     return;
12265 
12266   aarch64_previous_fndecl = fndecl;
12267 
12268   /* First set the target options.  */
12269   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
12270 
12271   aarch64_save_restore_target_globals (new_tree);
12272 }
12273 
12274 /* Enum describing the various ways we can handle attributes.
12275    In many cases we can reuse the generic option handling machinery.  */
12276 
12277 enum aarch64_attr_opt_type
12278 {
12279   aarch64_attr_mask,	/* Attribute should set a bit in target_flags.  */
12280   aarch64_attr_bool,	/* Attribute sets or unsets a boolean variable.  */
12281   aarch64_attr_enum,	/* Attribute sets an enum variable.  */
12282   aarch64_attr_custom	/* Attribute requires a custom handling function.  */
12283 };
12284 
12285 /* All the information needed to handle a target attribute.
12286    NAME is the name of the attribute.
12287    ATTR_TYPE specifies the type of behavior of the attribute as described
12288    in the definition of enum aarch64_attr_opt_type.
12289    ALLOW_NEG is true if the attribute supports a "no-" form.
12290    HANDLER is the function that takes the attribute string as an argument
12291    It is needed only when the ATTR_TYPE is aarch64_attr_custom.
12292    OPT_NUM is the enum specifying the option that the attribute modifies.
12293    This is needed for attributes that mirror the behavior of a command-line
12294    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
12295    aarch64_attr_enum.  */
12296 
12297 struct aarch64_attribute_info
12298 {
12299   const char *name;
12300   enum aarch64_attr_opt_type attr_type;
12301   bool allow_neg;
12302   bool (*handler) (const char *);
12303   enum opt_code opt_num;
12304 };
12305 
12306 /* Handle the ARCH_STR argument to the arch= target attribute.  */
12307 
12308 static bool
12309 aarch64_handle_attr_arch (const char *str)
12310 {
12311   const struct processor *tmp_arch = NULL;
12312   std::string invalid_extension;
12313   enum aarch64_parse_opt_result parse_res
12314     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
12315 
12316   if (parse_res == AARCH64_PARSE_OK)
12317     {
12318       gcc_assert (tmp_arch);
12319       selected_arch = tmp_arch;
12320       explicit_arch = selected_arch->arch;
12321       return true;
12322     }
12323 
12324   switch (parse_res)
12325     {
12326       case AARCH64_PARSE_MISSING_ARG:
12327 	error ("missing name in %<target(\"arch=\")%> pragma or attribute");
12328 	break;
12329       case AARCH64_PARSE_INVALID_ARG:
12330 	error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
12331 	aarch64_print_hint_for_arch (str);
12332 	break;
12333       case AARCH64_PARSE_INVALID_FEATURE:
12334 	error ("invalid feature modifier %s of value (\"%s\") in "
12335 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12336 	aarch64_print_hint_for_extensions (invalid_extension);
12337 	break;
12338       default:
12339 	gcc_unreachable ();
12340     }
12341 
12342   return false;
12343 }
12344 
12345 /* Handle the argument CPU_STR to the cpu= target attribute.  */
12346 
12347 static bool
12348 aarch64_handle_attr_cpu (const char *str)
12349 {
12350   const struct processor *tmp_cpu = NULL;
12351   std::string invalid_extension;
12352   enum aarch64_parse_opt_result parse_res
12353     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
12354 
12355   if (parse_res == AARCH64_PARSE_OK)
12356     {
12357       gcc_assert (tmp_cpu);
12358       selected_tune = tmp_cpu;
12359       explicit_tune_core = selected_tune->ident;
12360 
12361       selected_arch = &all_architectures[tmp_cpu->arch];
12362       explicit_arch = selected_arch->arch;
12363       return true;
12364     }
12365 
12366   switch (parse_res)
12367     {
12368       case AARCH64_PARSE_MISSING_ARG:
12369 	error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
12370 	break;
12371       case AARCH64_PARSE_INVALID_ARG:
12372 	error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
12373 	aarch64_print_hint_for_core (str);
12374 	break;
12375       case AARCH64_PARSE_INVALID_FEATURE:
12376 	error ("invalid feature modifier %s of value (\"%s\") in "
12377 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12378 	aarch64_print_hint_for_extensions (invalid_extension);
12379 	break;
12380       default:
12381 	gcc_unreachable ();
12382     }
12383 
12384   return false;
12385 }
12386 
12387 /* Handle the argument STR to the branch-protection= attribute.  */
12388 
12389  static bool
12390  aarch64_handle_attr_branch_protection (const char* str)
12391  {
12392   char *err_str = (char *) xmalloc (strlen (str));
12393   enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
12394 								      &err_str);
12395   bool success = false;
12396   switch (res)
12397     {
12398      case AARCH64_PARSE_MISSING_ARG:
12399        error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
12400 	      " attribute");
12401        break;
12402      case AARCH64_PARSE_INVALID_ARG:
12403        error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
12404 	      "=\")%> pragma or attribute", err_str);
12405        break;
12406      case AARCH64_PARSE_OK:
12407        success = true;
12408       /* Fall through.  */
12409      case AARCH64_PARSE_INVALID_FEATURE:
12410        break;
12411      default:
12412        gcc_unreachable ();
12413     }
12414   free (err_str);
12415   return success;
12416  }
12417 
12418 /* Handle the argument STR to the tune= target attribute.  */
12419 
12420 static bool
12421 aarch64_handle_attr_tune (const char *str)
12422 {
12423   const struct processor *tmp_tune = NULL;
12424   enum aarch64_parse_opt_result parse_res
12425     = aarch64_parse_tune (str, &tmp_tune);
12426 
12427   if (parse_res == AARCH64_PARSE_OK)
12428     {
12429       gcc_assert (tmp_tune);
12430       selected_tune = tmp_tune;
12431       explicit_tune_core = selected_tune->ident;
12432       return true;
12433     }
12434 
12435   switch (parse_res)
12436     {
12437       case AARCH64_PARSE_INVALID_ARG:
12438 	error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
12439 	aarch64_print_hint_for_core (str);
12440 	break;
12441       default:
12442 	gcc_unreachable ();
12443     }
12444 
12445   return false;
12446 }
12447 
12448 /* Parse an architecture extensions target attribute string specified in STR.
12449    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
12450    if successful.  Update aarch64_isa_flags to reflect the ISA features
12451    modified.  */
12452 
12453 static bool
12454 aarch64_handle_attr_isa_flags (char *str)
12455 {
12456   enum aarch64_parse_opt_result parse_res;
12457   unsigned long isa_flags = aarch64_isa_flags;
12458 
12459   /* We allow "+nothing" in the beginning to clear out all architectural
12460      features if the user wants to handpick specific features.  */
12461   if (strncmp ("+nothing", str, 8) == 0)
12462     {
12463       isa_flags = 0;
12464       str += 8;
12465     }
12466 
12467   std::string invalid_extension;
12468   parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
12469 
12470   if (parse_res == AARCH64_PARSE_OK)
12471     {
12472       aarch64_isa_flags = isa_flags;
12473       return true;
12474     }
12475 
12476   switch (parse_res)
12477     {
12478       case AARCH64_PARSE_MISSING_ARG:
12479 	error ("missing value in %<target()%> pragma or attribute");
12480 	break;
12481 
12482       case AARCH64_PARSE_INVALID_FEATURE:
12483 	error ("invalid feature modifier %s of value (\"%s\") in "
12484 	       "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
12485 	break;
12486 
12487       default:
12488 	gcc_unreachable ();
12489     }
12490 
12491  return false;
12492 }
12493 
12494 /* The target attributes that we support.  On top of these we also support just
12495    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
12496    handled explicitly in aarch64_process_one_target_attr.  */
12497 
12498 static const struct aarch64_attribute_info aarch64_attributes[] =
12499 {
12500   { "general-regs-only", aarch64_attr_mask, false, NULL,
12501      OPT_mgeneral_regs_only },
12502   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
12503      OPT_mfix_cortex_a53_835769 },
12504   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
12505      OPT_mfix_cortex_a53_843419 },
12506   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
12507   { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
12508   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
12509      OPT_momit_leaf_frame_pointer },
12510   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
12511   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
12512      OPT_march_ },
12513   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
12514   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
12515      OPT_mtune_ },
12516   { "branch-protection", aarch64_attr_custom, false,
12517      aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
12518   { "sign-return-address", aarch64_attr_enum, false, NULL,
12519      OPT_msign_return_address_ },
12520   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
12521 };
12522 
12523 /* Parse ARG_STR which contains the definition of one target attribute.
12524    Show appropriate errors if any or return true if the attribute is valid.  */
12525 
12526 static bool
12527 aarch64_process_one_target_attr (char *arg_str)
12528 {
12529   bool invert = false;
12530 
12531   size_t len = strlen (arg_str);
12532 
12533   if (len == 0)
12534     {
12535       error ("malformed %<target()%> pragma or attribute");
12536       return false;
12537     }
12538 
12539   char *str_to_check = (char *) alloca (len + 1);
12540   strcpy (str_to_check, arg_str);
12541 
12542   /* Skip leading whitespace.  */
12543   while (*str_to_check == ' ' || *str_to_check == '\t')
12544     str_to_check++;
12545 
12546   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
12547      It is easier to detect and handle it explicitly here rather than going
12548      through the machinery for the rest of the target attributes in this
12549      function.  */
12550   if (*str_to_check == '+')
12551     return aarch64_handle_attr_isa_flags (str_to_check);
12552 
12553   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
12554     {
12555       invert = true;
12556       str_to_check += 3;
12557     }
12558   char *arg = strchr (str_to_check, '=');
12559 
12560   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
12561      and point ARG to "foo".  */
12562   if (arg)
12563     {
12564       *arg = '\0';
12565       arg++;
12566     }
12567   const struct aarch64_attribute_info *p_attr;
12568   bool found = false;
12569   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
12570     {
12571       /* If the names don't match up, or the user has given an argument
12572 	 to an attribute that doesn't accept one, or didn't give an argument
12573 	 to an attribute that expects one, fail to match.  */
12574       if (strcmp (str_to_check, p_attr->name) != 0)
12575 	continue;
12576 
12577       found = true;
12578       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
12579 			      || p_attr->attr_type == aarch64_attr_enum;
12580 
12581       if (attr_need_arg_p ^ (arg != NULL))
12582 	{
12583 	  error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
12584 	  return false;
12585 	}
12586 
12587       /* If the name matches but the attribute does not allow "no-" versions
12588 	 then we can't match.  */
12589       if (invert && !p_attr->allow_neg)
12590 	{
12591 	  error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
12592 	  return false;
12593 	}
12594 
12595       switch (p_attr->attr_type)
12596 	{
12597 	/* Has a custom handler registered.
12598 	   For example, cpu=, arch=, tune=.  */
12599 	  case aarch64_attr_custom:
12600 	    gcc_assert (p_attr->handler);
12601 	    if (!p_attr->handler (arg))
12602 	      return false;
12603 	    break;
12604 
12605 	  /* Either set or unset a boolean option.  */
12606 	  case aarch64_attr_bool:
12607 	    {
12608 	      struct cl_decoded_option decoded;
12609 
12610 	      generate_option (p_attr->opt_num, NULL, !invert,
12611 			       CL_TARGET, &decoded);
12612 	      aarch64_handle_option (&global_options, &global_options_set,
12613 				      &decoded, input_location);
12614 	      break;
12615 	    }
12616 	  /* Set or unset a bit in the target_flags.  aarch64_handle_option
12617 	     should know what mask to apply given the option number.  */
12618 	  case aarch64_attr_mask:
12619 	    {
12620 	      struct cl_decoded_option decoded;
12621 	      /* We only need to specify the option number.
12622 		 aarch64_handle_option will know which mask to apply.  */
12623 	      decoded.opt_index = p_attr->opt_num;
12624 	      decoded.value = !invert;
12625 	      aarch64_handle_option (&global_options, &global_options_set,
12626 				      &decoded, input_location);
12627 	      break;
12628 	    }
12629 	  /* Use the option setting machinery to set an option to an enum.  */
12630 	  case aarch64_attr_enum:
12631 	    {
12632 	      gcc_assert (arg);
12633 	      bool valid;
12634 	      int value;
12635 	      valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
12636 					      &value, CL_TARGET);
12637 	      if (valid)
12638 		{
12639 		  set_option (&global_options, NULL, p_attr->opt_num, value,
12640 			      NULL, DK_UNSPECIFIED, input_location,
12641 			      global_dc);
12642 		}
12643 	      else
12644 		{
12645 		  error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
12646 		}
12647 	      break;
12648 	    }
12649 	  default:
12650 	    gcc_unreachable ();
12651 	}
12652     }
12653 
12654   /* If we reached here we either have found an attribute and validated
12655      it or didn't match any.  If we matched an attribute but its arguments
12656      were malformed we will have returned false already.  */
12657   return found;
12658 }
12659 
12660 /* Count how many times the character C appears in
12661    NULL-terminated string STR.  */
12662 
12663 static unsigned int
12664 num_occurences_in_str (char c, char *str)
12665 {
12666   unsigned int res = 0;
12667   while (*str != '\0')
12668     {
12669       if (*str == c)
12670 	res++;
12671 
12672       str++;
12673     }
12674 
12675   return res;
12676 }
12677 
12678 /* Parse the tree in ARGS that contains the target attribute information
12679    and update the global target options space.  */
12680 
12681 bool
12682 aarch64_process_target_attr (tree args)
12683 {
12684   if (TREE_CODE (args) == TREE_LIST)
12685     {
12686       do
12687 	{
12688 	  tree head = TREE_VALUE (args);
12689 	  if (head)
12690 	    {
12691 	      if (!aarch64_process_target_attr (head))
12692 		return false;
12693 	    }
12694 	  args = TREE_CHAIN (args);
12695 	} while (args);
12696 
12697       return true;
12698     }
12699 
12700   if (TREE_CODE (args) != STRING_CST)
12701     {
12702       error ("attribute %<target%> argument not a string");
12703       return false;
12704     }
12705 
12706   size_t len = strlen (TREE_STRING_POINTER (args));
12707   char *str_to_check = (char *) alloca (len + 1);
12708   strcpy (str_to_check, TREE_STRING_POINTER (args));
12709 
12710   if (len == 0)
12711     {
12712       error ("malformed %<target()%> pragma or attribute");
12713       return false;
12714     }
12715 
12716   /* Used to catch empty spaces between commas i.e.
12717      attribute ((target ("attr1,,attr2"))).  */
12718   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
12719 
12720   /* Handle multiple target attributes separated by ','.  */
12721   char *token = strtok_r (str_to_check, ",", &str_to_check);
12722 
12723   unsigned int num_attrs = 0;
12724   while (token)
12725     {
12726       num_attrs++;
12727       if (!aarch64_process_one_target_attr (token))
12728 	{
12729 	  error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
12730 	  return false;
12731 	}
12732 
12733       token = strtok_r (NULL, ",", &str_to_check);
12734     }
12735 
12736   if (num_attrs != num_commas + 1)
12737     {
12738       error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
12739       return false;
12740     }
12741 
12742   return true;
12743 }
12744 
12745 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
12746    process attribute ((target ("..."))).  */
12747 
12748 static bool
12749 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
12750 {
12751   struct cl_target_option cur_target;
12752   bool ret;
12753   tree old_optimize;
12754   tree new_target, new_optimize;
12755   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
12756 
12757   /* If what we're processing is the current pragma string then the
12758      target option node is already stored in target_option_current_node
12759      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
12760      having to re-parse the string.  This is especially useful to keep
12761      arm_neon.h compile times down since that header contains a lot
12762      of intrinsics enclosed in pragmas.  */
12763   if (!existing_target && args == current_target_pragma)
12764     {
12765       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
12766       return true;
12767     }
12768   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12769 
12770   old_optimize = build_optimization_node (&global_options);
12771   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
12772 
12773   /* If the function changed the optimization levels as well as setting
12774      target options, start with the optimizations specified.  */
12775   if (func_optimize && func_optimize != old_optimize)
12776     cl_optimization_restore (&global_options,
12777 			     TREE_OPTIMIZATION (func_optimize));
12778 
12779   /* Save the current target options to restore at the end.  */
12780   cl_target_option_save (&cur_target, &global_options);
12781 
12782   /* If fndecl already has some target attributes applied to it, unpack
12783      them so that we add this attribute on top of them, rather than
12784      overwriting them.  */
12785   if (existing_target)
12786     {
12787       struct cl_target_option *existing_options
12788 	= TREE_TARGET_OPTION (existing_target);
12789 
12790       if (existing_options)
12791 	cl_target_option_restore (&global_options, existing_options);
12792     }
12793   else
12794     cl_target_option_restore (&global_options,
12795 			TREE_TARGET_OPTION (target_option_current_node));
12796 
12797   ret = aarch64_process_target_attr (args);
12798 
12799   /* Set up any additional state.  */
12800   if (ret)
12801     {
12802       aarch64_override_options_internal (&global_options);
12803       /* Initialize SIMD builtins if we haven't already.
12804 	 Set current_target_pragma to NULL for the duration so that
12805 	 the builtin initialization code doesn't try to tag the functions
12806 	 being built with the attributes specified by any current pragma, thus
12807 	 going into an infinite recursion.  */
12808       if (TARGET_SIMD)
12809 	{
12810 	  tree saved_current_target_pragma = current_target_pragma;
12811 	  current_target_pragma = NULL;
12812 	  aarch64_init_simd_builtins ();
12813 	  current_target_pragma = saved_current_target_pragma;
12814 	}
12815       new_target = build_target_option_node (&global_options);
12816     }
12817   else
12818     new_target = NULL;
12819 
12820   new_optimize = build_optimization_node (&global_options);
12821 
12822   if (fndecl && ret)
12823     {
12824       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
12825 
12826       if (old_optimize != new_optimize)
12827 	DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
12828     }
12829 
12830   cl_target_option_restore (&global_options, &cur_target);
12831 
12832   if (old_optimize != new_optimize)
12833     cl_optimization_restore (&global_options,
12834 			     TREE_OPTIMIZATION (old_optimize));
12835   return ret;
12836 }
12837 
12838 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
12839    tri-bool options (yes, no, don't care) and the default value is
12840    DEF, determine whether to reject inlining.  */
12841 
12842 static bool
12843 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
12844 				     int dont_care, int def)
12845 {
12846   /* If the callee doesn't care, always allow inlining.  */
12847   if (callee == dont_care)
12848     return true;
12849 
12850   /* If the caller doesn't care, always allow inlining.  */
12851   if (caller == dont_care)
12852     return true;
12853 
12854   /* Otherwise, allow inlining if either the callee and caller values
12855      agree, or if the callee is using the default value.  */
12856   return (callee == caller || callee == def);
12857 }
12858 
12859 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
12860    to inline CALLEE into CALLER based on target-specific info.
12861    Make sure that the caller and callee have compatible architectural
12862    features.  Then go through the other possible target attributes
12863    and see if they can block inlining.  Try not to reject always_inline
12864    callees unless they are incompatible architecturally.  */
12865 
12866 static bool
12867 aarch64_can_inline_p (tree caller, tree callee)
12868 {
12869   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
12870   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
12871 
12872   struct cl_target_option *caller_opts
12873 	= TREE_TARGET_OPTION (caller_tree ? caller_tree
12874 					   : target_option_default_node);
12875 
12876   struct cl_target_option *callee_opts
12877 	= TREE_TARGET_OPTION (callee_tree ? callee_tree
12878 					   : target_option_default_node);
12879 
12880   /* Callee's ISA flags should be a subset of the caller's.  */
12881   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
12882        != callee_opts->x_aarch64_isa_flags)
12883     return false;
12884 
12885   /* Allow non-strict aligned functions inlining into strict
12886      aligned ones.  */
12887   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
12888        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
12889       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
12890 	   && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
12891     return false;
12892 
12893   bool always_inline = lookup_attribute ("always_inline",
12894 					  DECL_ATTRIBUTES (callee));
12895 
12896   /* If the architectural features match up and the callee is always_inline
12897      then the other attributes don't matter.  */
12898   if (always_inline)
12899     return true;
12900 
12901   if (caller_opts->x_aarch64_cmodel_var
12902       != callee_opts->x_aarch64_cmodel_var)
12903     return false;
12904 
12905   if (caller_opts->x_aarch64_tls_dialect
12906       != callee_opts->x_aarch64_tls_dialect)
12907     return false;
12908 
12909   /* Honour explicit requests to workaround errata.  */
12910   if (!aarch64_tribools_ok_for_inlining_p (
12911 	  caller_opts->x_aarch64_fix_a53_err835769,
12912 	  callee_opts->x_aarch64_fix_a53_err835769,
12913 	  2, TARGET_FIX_ERR_A53_835769_DEFAULT))
12914     return false;
12915 
12916   if (!aarch64_tribools_ok_for_inlining_p (
12917 	  caller_opts->x_aarch64_fix_a53_err843419,
12918 	  callee_opts->x_aarch64_fix_a53_err843419,
12919 	  2, TARGET_FIX_ERR_A53_843419))
12920     return false;
12921 
12922   /* If the user explicitly specified -momit-leaf-frame-pointer for the
12923      caller and calle and they don't match up, reject inlining.  */
12924   if (!aarch64_tribools_ok_for_inlining_p (
12925 	  caller_opts->x_flag_omit_leaf_frame_pointer,
12926 	  callee_opts->x_flag_omit_leaf_frame_pointer,
12927 	  2, 1))
12928     return false;
12929 
12930   /* If the callee has specific tuning overrides, respect them.  */
12931   if (callee_opts->x_aarch64_override_tune_string != NULL
12932       && caller_opts->x_aarch64_override_tune_string == NULL)
12933     return false;
12934 
12935   /* If the user specified tuning override strings for the
12936      caller and callee and they don't match up, reject inlining.
12937      We just do a string compare here, we don't analyze the meaning
12938      of the string, as it would be too costly for little gain.  */
12939   if (callee_opts->x_aarch64_override_tune_string
12940       && caller_opts->x_aarch64_override_tune_string
12941       && (strcmp (callee_opts->x_aarch64_override_tune_string,
12942 		  caller_opts->x_aarch64_override_tune_string) != 0))
12943     return false;
12944 
12945   return true;
12946 }
12947 
12948 /* Return true if SYMBOL_REF X binds locally.  */
12949 
12950 static bool
12951 aarch64_symbol_binds_local_p (const_rtx x)
12952 {
12953   return (SYMBOL_REF_DECL (x)
12954 	  ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
12955 	  : SYMBOL_REF_LOCAL_P (x));
12956 }
12957 
12958 /* Return true if SYMBOL_REF X is thread local */
12959 static bool
12960 aarch64_tls_symbol_p (rtx x)
12961 {
12962   if (! TARGET_HAVE_TLS)
12963     return false;
12964 
12965   if (GET_CODE (x) != SYMBOL_REF)
12966     return false;
12967 
12968   return SYMBOL_REF_TLS_MODEL (x) != 0;
12969 }
12970 
12971 /* Classify a TLS symbol into one of the TLS kinds.  */
12972 enum aarch64_symbol_type
12973 aarch64_classify_tls_symbol (rtx x)
12974 {
12975   enum tls_model tls_kind = tls_symbolic_operand_type (x);
12976 
12977   switch (tls_kind)
12978     {
12979     case TLS_MODEL_GLOBAL_DYNAMIC:
12980     case TLS_MODEL_LOCAL_DYNAMIC:
12981       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
12982 
12983     case TLS_MODEL_INITIAL_EXEC:
12984       switch (aarch64_cmodel)
12985 	{
12986 	case AARCH64_CMODEL_TINY:
12987 	case AARCH64_CMODEL_TINY_PIC:
12988 	  return SYMBOL_TINY_TLSIE;
12989 	default:
12990 	  return SYMBOL_SMALL_TLSIE;
12991 	}
12992 
12993     case TLS_MODEL_LOCAL_EXEC:
12994       if (aarch64_tls_size == 12)
12995 	return SYMBOL_TLSLE12;
12996       else if (aarch64_tls_size == 24)
12997 	return SYMBOL_TLSLE24;
12998       else if (aarch64_tls_size == 32)
12999 	return SYMBOL_TLSLE32;
13000       else if (aarch64_tls_size == 48)
13001 	return SYMBOL_TLSLE48;
13002       else
13003 	gcc_unreachable ();
13004 
13005     case TLS_MODEL_EMULATED:
13006     case TLS_MODEL_NONE:
13007       return SYMBOL_FORCE_TO_MEM;
13008 
13009     default:
13010       gcc_unreachable ();
13011     }
13012 }
13013 
13014 /* Return the correct method for accessing X + OFFSET, where X is either
13015    a SYMBOL_REF or LABEL_REF.  */
13016 
13017 enum aarch64_symbol_type
13018 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
13019 {
13020   if (GET_CODE (x) == LABEL_REF)
13021     {
13022       switch (aarch64_cmodel)
13023 	{
13024 	case AARCH64_CMODEL_LARGE:
13025 	  return SYMBOL_FORCE_TO_MEM;
13026 
13027 	case AARCH64_CMODEL_TINY_PIC:
13028 	case AARCH64_CMODEL_TINY:
13029 	  return SYMBOL_TINY_ABSOLUTE;
13030 
13031 	case AARCH64_CMODEL_SMALL_SPIC:
13032 	case AARCH64_CMODEL_SMALL_PIC:
13033 	case AARCH64_CMODEL_SMALL:
13034 	  return SYMBOL_SMALL_ABSOLUTE;
13035 
13036 	default:
13037 	  gcc_unreachable ();
13038 	}
13039     }
13040 
13041   if (GET_CODE (x) == SYMBOL_REF)
13042     {
13043       if (aarch64_tls_symbol_p (x))
13044 	return aarch64_classify_tls_symbol (x);
13045 
13046       switch (aarch64_cmodel)
13047 	{
13048 	case AARCH64_CMODEL_TINY:
13049 	  /* When we retrieve symbol + offset address, we have to make sure
13050 	     the offset does not cause overflow of the final address.  But
13051 	     we have no way of knowing the address of symbol at compile time
13052 	     so we can't accurately say if the distance between the PC and
13053 	     symbol + offset is outside the addressible range of +/-1M in the
13054 	     TINY code model.  So we rely on images not being greater than
13055 	     1M and cap the offset at 1M and anything beyond 1M will have to
13056 	     be loaded using an alternative mechanism.  Furthermore if the
13057 	     symbol is a weak reference to something that isn't known to
13058 	     resolve to a symbol in this module, then force to memory.  */
13059 	  if ((SYMBOL_REF_WEAK (x)
13060 	       && !aarch64_symbol_binds_local_p (x))
13061 	      || !IN_RANGE (offset, -1048575, 1048575))
13062 	    return SYMBOL_FORCE_TO_MEM;
13063 	  return SYMBOL_TINY_ABSOLUTE;
13064 
13065 	case AARCH64_CMODEL_SMALL:
13066 	  /* Same reasoning as the tiny code model, but the offset cap here is
13067 	     4G.  */
13068 	  if ((SYMBOL_REF_WEAK (x)
13069 	       && !aarch64_symbol_binds_local_p (x))
13070 	      || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
13071 			    HOST_WIDE_INT_C (4294967264)))
13072 	    return SYMBOL_FORCE_TO_MEM;
13073 	  return SYMBOL_SMALL_ABSOLUTE;
13074 
13075 	case AARCH64_CMODEL_TINY_PIC:
13076 	  if (!aarch64_symbol_binds_local_p (x))
13077 	    return SYMBOL_TINY_GOT;
13078 	  return SYMBOL_TINY_ABSOLUTE;
13079 
13080 	case AARCH64_CMODEL_SMALL_SPIC:
13081 	case AARCH64_CMODEL_SMALL_PIC:
13082 	  if (!aarch64_symbol_binds_local_p (x))
13083 	    return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
13084 		    ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
13085 	  return SYMBOL_SMALL_ABSOLUTE;
13086 
13087 	case AARCH64_CMODEL_LARGE:
13088 	  /* This is alright even in PIC code as the constant
13089 	     pool reference is always PC relative and within
13090 	     the same translation unit.  */
13091 	  if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
13092 	    return SYMBOL_SMALL_ABSOLUTE;
13093 	  else
13094 	    return SYMBOL_FORCE_TO_MEM;
13095 
13096 	default:
13097 	  gcc_unreachable ();
13098 	}
13099     }
13100 
13101   /* By default push everything into the constant pool.  */
13102   return SYMBOL_FORCE_TO_MEM;
13103 }
13104 
13105 bool
13106 aarch64_constant_address_p (rtx x)
13107 {
13108   return (CONSTANT_P (x) && memory_address_p (DImode, x));
13109 }
13110 
13111 bool
13112 aarch64_legitimate_pic_operand_p (rtx x)
13113 {
13114   if (GET_CODE (x) == SYMBOL_REF
13115       || (GET_CODE (x) == CONST
13116 	  && GET_CODE (XEXP (x, 0)) == PLUS
13117 	  && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
13118      return false;
13119 
13120   return true;
13121 }
13122 
13123 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook.  Return true for constants
13124    that should be rematerialized rather than spilled.  */
13125 
13126 static bool
13127 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
13128 {
13129   /* Support CSE and rematerialization of common constants.  */
13130   if (CONST_INT_P (x)
13131       || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
13132       || GET_CODE (x) == CONST_VECTOR)
13133     return true;
13134 
13135   /* Do not allow vector struct mode constants for Advanced SIMD.
13136      We could support 0 and -1 easily, but they need support in
13137      aarch64-simd.md.  */
13138   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13139   if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13140     return false;
13141 
13142   /* Only accept variable-length vector constants if they can be
13143      handled directly.
13144 
13145      ??? It would be possible to handle rematerialization of other
13146      constants via secondary reloads.  */
13147   if (vec_flags & VEC_ANY_SVE)
13148     return aarch64_simd_valid_immediate (x, NULL);
13149 
13150   if (GET_CODE (x) == HIGH)
13151     x = XEXP (x, 0);
13152 
13153   /* Accept polynomial constants that can be calculated by using the
13154      destination of a move as the sole temporary.  Constants that
13155      require a second temporary cannot be rematerialized (they can't be
13156      forced to memory and also aren't legitimate constants).  */
13157   poly_int64 offset;
13158   if (poly_int_rtx_p (x, &offset))
13159     return aarch64_offset_temporaries (false, offset) <= 1;
13160 
13161   /* If an offset is being added to something else, we need to allow the
13162      base to be moved into the destination register, meaning that there
13163      are no free temporaries for the offset.  */
13164   x = strip_offset (x, &offset);
13165   if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
13166     return false;
13167 
13168   /* Do not allow const (plus (anchor_symbol, const_int)).  */
13169   if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
13170     return false;
13171 
13172   /* Treat symbols as constants.  Avoid TLS symbols as they are complex,
13173      so spilling them is better than rematerialization.  */
13174   if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
13175     return true;
13176 
13177   /* Label references are always constant.  */
13178   if (GET_CODE (x) == LABEL_REF)
13179     return true;
13180 
13181   return false;
13182 }
13183 
13184 rtx
13185 aarch64_load_tp (rtx target)
13186 {
13187   if (!target
13188       || GET_MODE (target) != Pmode
13189       || !register_operand (target, Pmode))
13190     target = gen_reg_rtx (Pmode);
13191 
13192   /* Can return in any reg.  */
13193   emit_insn (gen_aarch64_load_tp_hard (target));
13194   return target;
13195 }
13196 
13197 /* On AAPCS systems, this is the "struct __va_list".  */
13198 static GTY(()) tree va_list_type;
13199 
13200 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
13201    Return the type to use as __builtin_va_list.
13202 
13203    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
13204 
13205    struct __va_list
13206    {
13207      void *__stack;
13208      void *__gr_top;
13209      void *__vr_top;
13210      int   __gr_offs;
13211      int   __vr_offs;
13212    };  */
13213 
13214 static tree
13215 aarch64_build_builtin_va_list (void)
13216 {
13217   tree va_list_name;
13218   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13219 
13220   /* Create the type.  */
13221   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
13222   /* Give it the required name.  */
13223   va_list_name = build_decl (BUILTINS_LOCATION,
13224 			     TYPE_DECL,
13225 			     get_identifier ("__va_list"),
13226 			     va_list_type);
13227   DECL_ARTIFICIAL (va_list_name) = 1;
13228   TYPE_NAME (va_list_type) = va_list_name;
13229   TYPE_STUB_DECL (va_list_type) = va_list_name;
13230 
13231   /* Create the fields.  */
13232   f_stack = build_decl (BUILTINS_LOCATION,
13233 			FIELD_DECL, get_identifier ("__stack"),
13234 			ptr_type_node);
13235   f_grtop = build_decl (BUILTINS_LOCATION,
13236 			FIELD_DECL, get_identifier ("__gr_top"),
13237 			ptr_type_node);
13238   f_vrtop = build_decl (BUILTINS_LOCATION,
13239 			FIELD_DECL, get_identifier ("__vr_top"),
13240 			ptr_type_node);
13241   f_groff = build_decl (BUILTINS_LOCATION,
13242 			FIELD_DECL, get_identifier ("__gr_offs"),
13243 			integer_type_node);
13244   f_vroff = build_decl (BUILTINS_LOCATION,
13245 			FIELD_DECL, get_identifier ("__vr_offs"),
13246 			integer_type_node);
13247 
13248   /* Tell tree-stdarg pass about our internal offset fields.
13249      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
13250      purpose to identify whether the code is updating va_list internal
13251      offset fields through irregular way.  */
13252   va_list_gpr_counter_field = f_groff;
13253   va_list_fpr_counter_field = f_vroff;
13254 
13255   DECL_ARTIFICIAL (f_stack) = 1;
13256   DECL_ARTIFICIAL (f_grtop) = 1;
13257   DECL_ARTIFICIAL (f_vrtop) = 1;
13258   DECL_ARTIFICIAL (f_groff) = 1;
13259   DECL_ARTIFICIAL (f_vroff) = 1;
13260 
13261   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
13262   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
13263   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
13264   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
13265   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
13266 
13267   TYPE_FIELDS (va_list_type) = f_stack;
13268   DECL_CHAIN (f_stack) = f_grtop;
13269   DECL_CHAIN (f_grtop) = f_vrtop;
13270   DECL_CHAIN (f_vrtop) = f_groff;
13271   DECL_CHAIN (f_groff) = f_vroff;
13272 
13273   /* Compute its layout.  */
13274   layout_type (va_list_type);
13275 
13276   return va_list_type;
13277 }
13278 
13279 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
13280 static void
13281 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
13282 {
13283   const CUMULATIVE_ARGS *cum;
13284   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13285   tree stack, grtop, vrtop, groff, vroff;
13286   tree t;
13287   int gr_save_area_size = cfun->va_list_gpr_size;
13288   int vr_save_area_size = cfun->va_list_fpr_size;
13289   int vr_offset;
13290 
13291   cum = &crtl->args.info;
13292   if (cfun->va_list_gpr_size)
13293     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
13294 			     cfun->va_list_gpr_size);
13295   if (cfun->va_list_fpr_size)
13296     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
13297 			     * UNITS_PER_VREG, cfun->va_list_fpr_size);
13298 
13299   if (!TARGET_FLOAT)
13300     {
13301       gcc_assert (cum->aapcs_nvrn == 0);
13302       vr_save_area_size = 0;
13303     }
13304 
13305   f_stack = TYPE_FIELDS (va_list_type_node);
13306   f_grtop = DECL_CHAIN (f_stack);
13307   f_vrtop = DECL_CHAIN (f_grtop);
13308   f_groff = DECL_CHAIN (f_vrtop);
13309   f_vroff = DECL_CHAIN (f_groff);
13310 
13311   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
13312 		  NULL_TREE);
13313   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
13314 		  NULL_TREE);
13315   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
13316 		  NULL_TREE);
13317   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
13318 		  NULL_TREE);
13319   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
13320 		  NULL_TREE);
13321 
13322   /* Emit code to initialize STACK, which points to the next varargs stack
13323      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
13324      by named arguments.  STACK is 8-byte aligned.  */
13325   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
13326   if (cum->aapcs_stack_size > 0)
13327     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
13328   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
13329   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13330 
13331   /* Emit code to initialize GRTOP, the top of the GR save area.
13332      virtual_incoming_args_rtx should have been 16 byte aligned.  */
13333   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
13334   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
13335   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13336 
13337   /* Emit code to initialize VRTOP, the top of the VR save area.
13338      This address is gr_save_area_bytes below GRTOP, rounded
13339      down to the next 16-byte boundary.  */
13340   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
13341   vr_offset = ROUND_UP (gr_save_area_size,
13342 			STACK_BOUNDARY / BITS_PER_UNIT);
13343 
13344   if (vr_offset)
13345     t = fold_build_pointer_plus_hwi (t, -vr_offset);
13346   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
13347   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13348 
13349   /* Emit code to initialize GROFF, the offset from GRTOP of the
13350      next GPR argument.  */
13351   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
13352 	      build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
13353   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13354 
13355   /* Likewise emit code to initialize VROFF, the offset from FTOP
13356      of the next VR argument.  */
13357   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
13358 	      build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
13359   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
13360 }
13361 
13362 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
13363 
13364 static tree
13365 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
13366 			      gimple_seq *post_p ATTRIBUTE_UNUSED)
13367 {
13368   tree addr;
13369   bool indirect_p;
13370   bool is_ha;		/* is HFA or HVA.  */
13371   bool dw_align;	/* double-word align.  */
13372   machine_mode ag_mode = VOIDmode;
13373   int nregs;
13374   machine_mode mode;
13375 
13376   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
13377   tree stack, f_top, f_off, off, arg, roundup, on_stack;
13378   HOST_WIDE_INT size, rsize, adjust, align;
13379   tree t, u, cond1, cond2;
13380 
13381   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
13382   if (indirect_p)
13383     type = build_pointer_type (type);
13384 
13385   mode = TYPE_MODE (type);
13386 
13387   f_stack = TYPE_FIELDS (va_list_type_node);
13388   f_grtop = DECL_CHAIN (f_stack);
13389   f_vrtop = DECL_CHAIN (f_grtop);
13390   f_groff = DECL_CHAIN (f_vrtop);
13391   f_vroff = DECL_CHAIN (f_groff);
13392 
13393   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
13394 		  f_stack, NULL_TREE);
13395   size = int_size_in_bytes (type);
13396 
13397   bool abi_break;
13398   align
13399     = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
13400 
13401   dw_align = false;
13402   adjust = 0;
13403   if (aarch64_vfp_is_call_or_return_candidate (mode,
13404 					       type,
13405 					       &ag_mode,
13406 					       &nregs,
13407 					       &is_ha))
13408     {
13409       /* No frontends can create types with variable-sized modes, so we
13410 	 shouldn't be asked to pass or return them.  */
13411       unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
13412 
13413       /* TYPE passed in fp/simd registers.  */
13414       if (!TARGET_FLOAT)
13415 	aarch64_err_no_fpadvsimd (mode);
13416 
13417       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
13418 		      unshare_expr (valist), f_vrtop, NULL_TREE);
13419       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
13420 		      unshare_expr (valist), f_vroff, NULL_TREE);
13421 
13422       rsize = nregs * UNITS_PER_VREG;
13423 
13424       if (is_ha)
13425 	{
13426 	  if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
13427 	    adjust = UNITS_PER_VREG - ag_size;
13428 	}
13429       else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13430 	       && size < UNITS_PER_VREG)
13431 	{
13432 	  adjust = UNITS_PER_VREG - size;
13433 	}
13434     }
13435   else
13436     {
13437       /* TYPE passed in general registers.  */
13438       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
13439 		      unshare_expr (valist), f_grtop, NULL_TREE);
13440       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
13441 		      unshare_expr (valist), f_groff, NULL_TREE);
13442       rsize = ROUND_UP (size, UNITS_PER_WORD);
13443       nregs = rsize / UNITS_PER_WORD;
13444 
13445       if (align > 8)
13446 	{
13447 	  if (abi_break && warn_psabi)
13448 	    inform (input_location, "parameter passing for argument of type "
13449 		    "%qT changed in GCC 9.1", type);
13450 	  dw_align = true;
13451 	}
13452 
13453       if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13454 	  && size < UNITS_PER_WORD)
13455 	{
13456 	  adjust = UNITS_PER_WORD  - size;
13457 	}
13458     }
13459 
13460   /* Get a local temporary for the field value.  */
13461   off = get_initialized_tmp_var (f_off, pre_p, NULL);
13462 
13463   /* Emit code to branch if off >= 0.  */
13464   t = build2 (GE_EXPR, boolean_type_node, off,
13465 	      build_int_cst (TREE_TYPE (off), 0));
13466   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
13467 
13468   if (dw_align)
13469     {
13470       /* Emit: offs = (offs + 15) & -16.  */
13471       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13472 		  build_int_cst (TREE_TYPE (off), 15));
13473       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
13474 		  build_int_cst (TREE_TYPE (off), -16));
13475       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
13476     }
13477   else
13478     roundup = NULL;
13479 
13480   /* Update ap.__[g|v]r_offs  */
13481   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
13482 	      build_int_cst (TREE_TYPE (off), rsize));
13483   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
13484 
13485   /* String up.  */
13486   if (roundup)
13487     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13488 
13489   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
13490   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
13491 	      build_int_cst (TREE_TYPE (f_off), 0));
13492   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
13493 
13494   /* String up: make sure the assignment happens before the use.  */
13495   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
13496   COND_EXPR_ELSE (cond1) = t;
13497 
13498   /* Prepare the trees handling the argument that is passed on the stack;
13499      the top level node will store in ON_STACK.  */
13500   arg = get_initialized_tmp_var (stack, pre_p, NULL);
13501   if (align > 8)
13502     {
13503       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
13504       t = fold_build_pointer_plus_hwi (arg, 15);
13505       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13506 		  build_int_cst (TREE_TYPE (t), -16));
13507       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
13508     }
13509   else
13510     roundup = NULL;
13511   /* Advance ap.__stack  */
13512   t = fold_build_pointer_plus_hwi (arg, size + 7);
13513   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
13514 	      build_int_cst (TREE_TYPE (t), -8));
13515   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
13516   /* String up roundup and advance.  */
13517   if (roundup)
13518     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
13519   /* String up with arg */
13520   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
13521   /* Big-endianness related address adjustment.  */
13522   if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
13523       && size < UNITS_PER_WORD)
13524   {
13525     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
13526 		size_int (UNITS_PER_WORD - size));
13527     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
13528   }
13529 
13530   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
13531   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
13532 
13533   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
13534   t = off;
13535   if (adjust)
13536     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
13537 		build_int_cst (TREE_TYPE (off), adjust));
13538 
13539   t = fold_convert (sizetype, t);
13540   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
13541 
13542   if (is_ha)
13543     {
13544       /* type ha; // treat as "struct {ftype field[n];}"
13545          ... [computing offs]
13546          for (i = 0; i <nregs; ++i, offs += 16)
13547 	   ha.field[i] = *((ftype *)(ap.__vr_top + offs));
13548 	 return ha;  */
13549       int i;
13550       tree tmp_ha, field_t, field_ptr_t;
13551 
13552       /* Declare a local variable.  */
13553       tmp_ha = create_tmp_var_raw (type, "ha");
13554       gimple_add_tmp_var (tmp_ha);
13555 
13556       /* Establish the base type.  */
13557       switch (ag_mode)
13558 	{
13559 	case E_SFmode:
13560 	  field_t = float_type_node;
13561 	  field_ptr_t = float_ptr_type_node;
13562 	  break;
13563 	case E_DFmode:
13564 	  field_t = double_type_node;
13565 	  field_ptr_t = double_ptr_type_node;
13566 	  break;
13567 	case E_TFmode:
13568 	  field_t = long_double_type_node;
13569 	  field_ptr_t = long_double_ptr_type_node;
13570 	  break;
13571 	case E_HFmode:
13572 	  field_t = aarch64_fp16_type_node;
13573 	  field_ptr_t = aarch64_fp16_ptr_type_node;
13574 	  break;
13575 	case E_V2SImode:
13576 	case E_V4SImode:
13577 	    {
13578 	      tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
13579 	      field_t = build_vector_type_for_mode (innertype, ag_mode);
13580 	      field_ptr_t = build_pointer_type (field_t);
13581 	    }
13582 	  break;
13583 	default:
13584 	  gcc_assert (0);
13585 	}
13586 
13587       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
13588       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
13589       addr = t;
13590       t = fold_convert (field_ptr_t, addr);
13591       t = build2 (MODIFY_EXPR, field_t,
13592 		  build1 (INDIRECT_REF, field_t, tmp_ha),
13593 		  build1 (INDIRECT_REF, field_t, t));
13594 
13595       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
13596       for (i = 1; i < nregs; ++i)
13597 	{
13598 	  addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
13599 	  u = fold_convert (field_ptr_t, addr);
13600 	  u = build2 (MODIFY_EXPR, field_t,
13601 		      build2 (MEM_REF, field_t, tmp_ha,
13602 			      build_int_cst (field_ptr_t,
13603 					     (i *
13604 					      int_size_in_bytes (field_t)))),
13605 		      build1 (INDIRECT_REF, field_t, u));
13606 	  t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
13607 	}
13608 
13609       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
13610       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
13611     }
13612 
13613   COND_EXPR_ELSE (cond2) = t;
13614   addr = fold_convert (build_pointer_type (type), cond1);
13615   addr = build_va_arg_indirect_ref (addr);
13616 
13617   if (indirect_p)
13618     addr = build_va_arg_indirect_ref (addr);
13619 
13620   return addr;
13621 }
13622 
13623 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
13624 
13625 static void
13626 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
13627 				tree type, int *pretend_size ATTRIBUTE_UNUSED,
13628 				int no_rtl)
13629 {
13630   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
13631   CUMULATIVE_ARGS local_cum;
13632   int gr_saved = cfun->va_list_gpr_size;
13633   int vr_saved = cfun->va_list_fpr_size;
13634 
13635   /* The caller has advanced CUM up to, but not beyond, the last named
13636      argument.  Advance a local copy of CUM past the last "real" named
13637      argument, to find out how many registers are left over.  */
13638   local_cum = *cum;
13639   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
13640 
13641   /* Found out how many registers we need to save.
13642      Honor tree-stdvar analysis results.  */
13643   if (cfun->va_list_gpr_size)
13644     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
13645 		    cfun->va_list_gpr_size / UNITS_PER_WORD);
13646   if (cfun->va_list_fpr_size)
13647     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
13648 		    cfun->va_list_fpr_size / UNITS_PER_VREG);
13649 
13650   if (!TARGET_FLOAT)
13651     {
13652       gcc_assert (local_cum.aapcs_nvrn == 0);
13653       vr_saved = 0;
13654     }
13655 
13656   if (!no_rtl)
13657     {
13658       if (gr_saved > 0)
13659 	{
13660 	  rtx ptr, mem;
13661 
13662 	  /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
13663 	  ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
13664 			       - gr_saved * UNITS_PER_WORD);
13665 	  mem = gen_frame_mem (BLKmode, ptr);
13666 	  set_mem_alias_set (mem, get_varargs_alias_set ());
13667 
13668 	  move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
13669 			       mem, gr_saved);
13670 	}
13671       if (vr_saved > 0)
13672 	{
13673 	  /* We can't use move_block_from_reg, because it will use
13674 	     the wrong mode, storing D regs only.  */
13675 	  machine_mode mode = TImode;
13676 	  int off, i, vr_start;
13677 
13678 	  /* Set OFF to the offset from virtual_incoming_args_rtx of
13679 	     the first vector register.  The VR save area lies below
13680 	     the GR one, and is aligned to 16 bytes.  */
13681 	  off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
13682 			   STACK_BOUNDARY / BITS_PER_UNIT);
13683 	  off -= vr_saved * UNITS_PER_VREG;
13684 
13685 	  vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
13686 	  for (i = 0; i < vr_saved; ++i)
13687 	    {
13688 	      rtx ptr, mem;
13689 
13690 	      ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
13691 	      mem = gen_frame_mem (mode, ptr);
13692 	      set_mem_alias_set (mem, get_varargs_alias_set ());
13693 	      aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
13694 	      off += UNITS_PER_VREG;
13695 	    }
13696 	}
13697     }
13698 
13699   /* We don't save the size into *PRETEND_SIZE because we want to avoid
13700      any complication of having crtl->args.pretend_args_size changed.  */
13701   cfun->machine->frame.saved_varargs_size
13702     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
13703 		 STACK_BOUNDARY / BITS_PER_UNIT)
13704        + vr_saved * UNITS_PER_VREG);
13705 }
13706 
13707 static void
13708 aarch64_conditional_register_usage (void)
13709 {
13710   int i;
13711   if (!TARGET_FLOAT)
13712     {
13713       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
13714 	{
13715 	  fixed_regs[i] = 1;
13716 	  call_used_regs[i] = 1;
13717 	}
13718     }
13719   if (!TARGET_SVE)
13720     for (i = P0_REGNUM; i <= P15_REGNUM; i++)
13721       {
13722 	fixed_regs[i] = 1;
13723 	call_used_regs[i] = 1;
13724       }
13725 
13726   /* When tracking speculation, we need a couple of call-clobbered registers
13727      to track the speculation state.  It would be nice to just use
13728      IP0 and IP1, but currently there are numerous places that just
13729      assume these registers are free for other uses (eg pointer
13730      authentication).  */
13731   if (aarch64_track_speculation)
13732     {
13733       fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
13734       call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
13735       fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13736       call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
13737     }
13738 }
13739 
13740 /* Walk down the type tree of TYPE counting consecutive base elements.
13741    If *MODEP is VOIDmode, then set it to the first valid floating point
13742    type.  If a non-floating point type is found, or if a floating point
13743    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
13744    otherwise return the count in the sub-tree.  */
13745 static int
13746 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
13747 {
13748   machine_mode mode;
13749   HOST_WIDE_INT size;
13750 
13751   switch (TREE_CODE (type))
13752     {
13753     case REAL_TYPE:
13754       mode = TYPE_MODE (type);
13755       if (mode != DFmode && mode != SFmode
13756 	  && mode != TFmode && mode != HFmode)
13757 	return -1;
13758 
13759       if (*modep == VOIDmode)
13760 	*modep = mode;
13761 
13762       if (*modep == mode)
13763 	return 1;
13764 
13765       break;
13766 
13767     case COMPLEX_TYPE:
13768       mode = TYPE_MODE (TREE_TYPE (type));
13769       if (mode != DFmode && mode != SFmode
13770 	  && mode != TFmode && mode != HFmode)
13771 	return -1;
13772 
13773       if (*modep == VOIDmode)
13774 	*modep = mode;
13775 
13776       if (*modep == mode)
13777 	return 2;
13778 
13779       break;
13780 
13781     case VECTOR_TYPE:
13782       /* Use V2SImode and V4SImode as representatives of all 64-bit
13783 	 and 128-bit vector types.  */
13784       size = int_size_in_bytes (type);
13785       switch (size)
13786 	{
13787 	case 8:
13788 	  mode = V2SImode;
13789 	  break;
13790 	case 16:
13791 	  mode = V4SImode;
13792 	  break;
13793 	default:
13794 	  return -1;
13795 	}
13796 
13797       if (*modep == VOIDmode)
13798 	*modep = mode;
13799 
13800       /* Vector modes are considered to be opaque: two vectors are
13801 	 equivalent for the purposes of being homogeneous aggregates
13802 	 if they are the same size.  */
13803       if (*modep == mode)
13804 	return 1;
13805 
13806       break;
13807 
13808     case ARRAY_TYPE:
13809       {
13810 	int count;
13811 	tree index = TYPE_DOMAIN (type);
13812 
13813 	/* Can't handle incomplete types nor sizes that are not
13814 	   fixed.  */
13815 	if (!COMPLETE_TYPE_P (type)
13816 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13817 	  return -1;
13818 
13819 	count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
13820 	if (count == -1
13821 	    || !index
13822 	    || !TYPE_MAX_VALUE (index)
13823 	    || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
13824 	    || !TYPE_MIN_VALUE (index)
13825 	    || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
13826 	    || count < 0)
13827 	  return -1;
13828 
13829 	count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
13830 		      - tree_to_uhwi (TYPE_MIN_VALUE (index)));
13831 
13832 	/* There must be no padding.  */
13833 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13834 		      count * GET_MODE_BITSIZE (*modep)))
13835 	  return -1;
13836 
13837 	return count;
13838       }
13839 
13840     case RECORD_TYPE:
13841       {
13842 	int count = 0;
13843 	int sub_count;
13844 	tree field;
13845 
13846 	/* Can't handle incomplete types nor sizes that are not
13847 	   fixed.  */
13848 	if (!COMPLETE_TYPE_P (type)
13849 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13850 	  return -1;
13851 
13852 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13853 	  {
13854 	    if (TREE_CODE (field) != FIELD_DECL)
13855 	      continue;
13856 
13857 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13858 	    if (sub_count < 0)
13859 	      return -1;
13860 	    count += sub_count;
13861 	  }
13862 
13863 	/* There must be no padding.  */
13864 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13865 		      count * GET_MODE_BITSIZE (*modep)))
13866 	  return -1;
13867 
13868 	return count;
13869       }
13870 
13871     case UNION_TYPE:
13872     case QUAL_UNION_TYPE:
13873       {
13874 	/* These aren't very interesting except in a degenerate case.  */
13875 	int count = 0;
13876 	int sub_count;
13877 	tree field;
13878 
13879 	/* Can't handle incomplete types nor sizes that are not
13880 	   fixed.  */
13881 	if (!COMPLETE_TYPE_P (type)
13882 	    || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13883 	  return -1;
13884 
13885 	for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
13886 	  {
13887 	    if (TREE_CODE (field) != FIELD_DECL)
13888 	      continue;
13889 
13890 	    sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
13891 	    if (sub_count < 0)
13892 	      return -1;
13893 	    count = count > sub_count ? count : sub_count;
13894 	  }
13895 
13896 	/* There must be no padding.  */
13897 	if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
13898 		      count * GET_MODE_BITSIZE (*modep)))
13899 	  return -1;
13900 
13901 	return count;
13902       }
13903 
13904     default:
13905       break;
13906     }
13907 
13908   return -1;
13909 }
13910 
13911 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
13912    type as described in AAPCS64 \S 4.1.2.
13913 
13914    See the comment above aarch64_composite_type_p for the notes on MODE.  */
13915 
13916 static bool
13917 aarch64_short_vector_p (const_tree type,
13918 			machine_mode mode)
13919 {
13920   poly_int64 size = -1;
13921 
13922   if (type && TREE_CODE (type) == VECTOR_TYPE)
13923     size = int_size_in_bytes (type);
13924   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
13925 	    || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
13926     size = GET_MODE_SIZE (mode);
13927 
13928   return known_eq (size, 8) || known_eq (size, 16);
13929 }
13930 
13931 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
13932    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
13933    array types.  The C99 floating-point complex types are also considered
13934    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
13935    types, which are GCC extensions and out of the scope of AAPCS64, are
13936    treated as composite types here as well.
13937 
13938    Note that MODE itself is not sufficient in determining whether a type
13939    is such a composite type or not.  This is because
13940    stor-layout.c:compute_record_mode may have already changed the MODE
13941    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
13942    structure with only one field may have its MODE set to the mode of the
13943    field.  Also an integer mode whose size matches the size of the
13944    RECORD_TYPE type may be used to substitute the original mode
13945    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
13946    solely relied on.  */
13947 
13948 static bool
13949 aarch64_composite_type_p (const_tree type,
13950 			  machine_mode mode)
13951 {
13952   if (aarch64_short_vector_p (type, mode))
13953     return false;
13954 
13955   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
13956     return true;
13957 
13958   if (mode == BLKmode
13959       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
13960       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
13961     return true;
13962 
13963   return false;
13964 }
13965 
13966 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
13967    shall be passed or returned in simd/fp register(s) (providing these
13968    parameter passing registers are available).
13969 
13970    Upon successful return, *COUNT returns the number of needed registers,
13971    *BASE_MODE returns the mode of the individual register and when IS_HAF
13972    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
13973    floating-point aggregate or a homogeneous short-vector aggregate.  */
13974 
13975 static bool
13976 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
13977 					 const_tree type,
13978 					 machine_mode *base_mode,
13979 					 int *count,
13980 					 bool *is_ha)
13981 {
13982   machine_mode new_mode = VOIDmode;
13983   bool composite_p = aarch64_composite_type_p (type, mode);
13984 
13985   if (is_ha != NULL) *is_ha = false;
13986 
13987   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
13988       || aarch64_short_vector_p (type, mode))
13989     {
13990       *count = 1;
13991       new_mode = mode;
13992     }
13993   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
13994     {
13995       if (is_ha != NULL) *is_ha = true;
13996       *count = 2;
13997       new_mode = GET_MODE_INNER (mode);
13998     }
13999   else if (type && composite_p)
14000     {
14001       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
14002 
14003       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
14004 	{
14005 	  if (is_ha != NULL) *is_ha = true;
14006 	  *count = ag_count;
14007 	}
14008       else
14009 	return false;
14010     }
14011   else
14012     return false;
14013 
14014   *base_mode = new_mode;
14015   return true;
14016 }
14017 
14018 /* Implement TARGET_STRUCT_VALUE_RTX.  */
14019 
14020 static rtx
14021 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
14022 			  int incoming ATTRIBUTE_UNUSED)
14023 {
14024   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
14025 }
14026 
14027 /* Implements target hook vector_mode_supported_p.  */
14028 static bool
14029 aarch64_vector_mode_supported_p (machine_mode mode)
14030 {
14031   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14032   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
14033 }
14034 
14035 /* Return appropriate SIMD container
14036    for MODE within a vector of WIDTH bits.  */
14037 static machine_mode
14038 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
14039 {
14040   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
14041     switch (mode)
14042       {
14043       case E_DFmode:
14044 	return VNx2DFmode;
14045       case E_SFmode:
14046 	return VNx4SFmode;
14047       case E_HFmode:
14048 	return VNx8HFmode;
14049       case E_DImode:
14050 	return VNx2DImode;
14051       case E_SImode:
14052 	return VNx4SImode;
14053       case E_HImode:
14054 	return VNx8HImode;
14055       case E_QImode:
14056 	return VNx16QImode;
14057       default:
14058 	return word_mode;
14059       }
14060 
14061   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
14062   if (TARGET_SIMD)
14063     {
14064       if (known_eq (width, 128))
14065 	switch (mode)
14066 	  {
14067 	  case E_DFmode:
14068 	    return V2DFmode;
14069 	  case E_SFmode:
14070 	    return V4SFmode;
14071 	  case E_HFmode:
14072 	    return V8HFmode;
14073 	  case E_SImode:
14074 	    return V4SImode;
14075 	  case E_HImode:
14076 	    return V8HImode;
14077 	  case E_QImode:
14078 	    return V16QImode;
14079 	  case E_DImode:
14080 	    return V2DImode;
14081 	  default:
14082 	    break;
14083 	  }
14084       else
14085 	switch (mode)
14086 	  {
14087 	  case E_SFmode:
14088 	    return V2SFmode;
14089 	  case E_HFmode:
14090 	    return V4HFmode;
14091 	  case E_SImode:
14092 	    return V2SImode;
14093 	  case E_HImode:
14094 	    return V4HImode;
14095 	  case E_QImode:
14096 	    return V8QImode;
14097 	  default:
14098 	    break;
14099 	  }
14100     }
14101   return word_mode;
14102 }
14103 
14104 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
14105 static machine_mode
14106 aarch64_preferred_simd_mode (scalar_mode mode)
14107 {
14108   poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
14109   return aarch64_simd_container_mode (mode, bits);
14110 }
14111 
14112 /* Return a list of possible vector sizes for the vectorizer
14113    to iterate over.  */
14114 static void
14115 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
14116 {
14117   if (TARGET_SVE)
14118     sizes->safe_push (BYTES_PER_SVE_VECTOR);
14119   sizes->safe_push (16);
14120   sizes->safe_push (8);
14121 }
14122 
14123 /* Implement TARGET_MANGLE_TYPE.  */
14124 
14125 static const char *
14126 aarch64_mangle_type (const_tree type)
14127 {
14128   /* The AArch64 ABI documents say that "__va_list" has to be
14129      mangled as if it is in the "std" namespace.  */
14130   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
14131     return "St9__va_list";
14132 
14133   /* Half-precision float.  */
14134   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
14135     return "Dh";
14136 
14137   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
14138      builtin types.  */
14139   if (TYPE_NAME (type) != NULL)
14140     return aarch64_mangle_builtin_type (type);
14141 
14142   /* Use the default mangling.  */
14143   return NULL;
14144 }
14145 
14146 /* Find the first rtx_insn before insn that will generate an assembly
14147    instruction.  */
14148 
14149 static rtx_insn *
14150 aarch64_prev_real_insn (rtx_insn *insn)
14151 {
14152   if (!insn)
14153     return NULL;
14154 
14155   do
14156     {
14157       insn = prev_real_insn (insn);
14158     }
14159   while (insn && recog_memoized (insn) < 0);
14160 
14161   return insn;
14162 }
14163 
14164 static bool
14165 is_madd_op (enum attr_type t1)
14166 {
14167   unsigned int i;
14168   /* A number of these may be AArch32 only.  */
14169   enum attr_type mlatypes[] = {
14170     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
14171     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
14172     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
14173   };
14174 
14175   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
14176     {
14177       if (t1 == mlatypes[i])
14178 	return true;
14179     }
14180 
14181   return false;
14182 }
14183 
14184 /* Check if there is a register dependency between a load and the insn
14185    for which we hold recog_data.  */
14186 
14187 static bool
14188 dep_between_memop_and_curr (rtx memop)
14189 {
14190   rtx load_reg;
14191   int opno;
14192 
14193   gcc_assert (GET_CODE (memop) == SET);
14194 
14195   if (!REG_P (SET_DEST (memop)))
14196     return false;
14197 
14198   load_reg = SET_DEST (memop);
14199   for (opno = 1; opno < recog_data.n_operands; opno++)
14200     {
14201       rtx operand = recog_data.operand[opno];
14202       if (REG_P (operand)
14203           && reg_overlap_mentioned_p (load_reg, operand))
14204         return true;
14205 
14206     }
14207   return false;
14208 }
14209 
14210 
14211 /* When working around the Cortex-A53 erratum 835769,
14212    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
14213    instruction and has a preceding memory instruction such that a NOP
14214    should be inserted between them.  */
14215 
14216 bool
14217 aarch64_madd_needs_nop (rtx_insn* insn)
14218 {
14219   enum attr_type attr_type;
14220   rtx_insn *prev;
14221   rtx body;
14222 
14223   if (!TARGET_FIX_ERR_A53_835769)
14224     return false;
14225 
14226   if (!INSN_P (insn) || recog_memoized (insn) < 0)
14227     return false;
14228 
14229   attr_type = get_attr_type (insn);
14230   if (!is_madd_op (attr_type))
14231     return false;
14232 
14233   prev = aarch64_prev_real_insn (insn);
14234   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
14235      Restore recog state to INSN to avoid state corruption.  */
14236   extract_constrain_insn_cached (insn);
14237 
14238   if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
14239     return false;
14240 
14241   body = single_set (prev);
14242 
14243   /* If the previous insn is a memory op and there is no dependency between
14244      it and the DImode madd, emit a NOP between them.  If body is NULL then we
14245      have a complex memory operation, probably a load/store pair.
14246      Be conservative for now and emit a NOP.  */
14247   if (GET_MODE (recog_data.operand[0]) == DImode
14248       && (!body || !dep_between_memop_and_curr (body)))
14249     return true;
14250 
14251   return false;
14252 
14253 }
14254 
14255 
14256 /* Implement FINAL_PRESCAN_INSN.  */
14257 
14258 void
14259 aarch64_final_prescan_insn (rtx_insn *insn)
14260 {
14261   if (aarch64_madd_needs_nop (insn))
14262     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
14263 }
14264 
14265 
14266 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
14267    instruction.  */
14268 
14269 bool
14270 aarch64_sve_index_immediate_p (rtx base_or_step)
14271 {
14272   return (CONST_INT_P (base_or_step)
14273 	  && IN_RANGE (INTVAL (base_or_step), -16, 15));
14274 }
14275 
14276 /* Return true if X is a valid immediate for the SVE ADD and SUB
14277    instructions.  Negate X first if NEGATE_P is true.  */
14278 
14279 bool
14280 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
14281 {
14282   rtx elt;
14283 
14284   if (!const_vec_duplicate_p (x, &elt)
14285       || !CONST_INT_P (elt))
14286     return false;
14287 
14288   HOST_WIDE_INT val = INTVAL (elt);
14289   if (negate_p)
14290     val = -val;
14291   val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
14292 
14293   if (val & 0xff)
14294     return IN_RANGE (val, 0, 0xff);
14295   return IN_RANGE (val, 0, 0xff00);
14296 }
14297 
14298 /* Return true if X is a valid immediate operand for an SVE logical
14299    instruction such as AND.  */
14300 
14301 bool
14302 aarch64_sve_bitmask_immediate_p (rtx x)
14303 {
14304   rtx elt;
14305 
14306   return (const_vec_duplicate_p (x, &elt)
14307 	  && CONST_INT_P (elt)
14308 	  && aarch64_bitmask_imm (INTVAL (elt),
14309 				  GET_MODE_INNER (GET_MODE (x))));
14310 }
14311 
14312 /* Return true if X is a valid immediate for the SVE DUP and CPY
14313    instructions.  */
14314 
14315 bool
14316 aarch64_sve_dup_immediate_p (rtx x)
14317 {
14318   rtx elt;
14319 
14320   if (!const_vec_duplicate_p (x, &elt)
14321       || !CONST_INT_P (elt))
14322     return false;
14323 
14324   HOST_WIDE_INT val = INTVAL (elt);
14325   if (val & 0xff)
14326     return IN_RANGE (val, -0x80, 0x7f);
14327   return IN_RANGE (val, -0x8000, 0x7f00);
14328 }
14329 
14330 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
14331    SIGNED_P says whether the operand is signed rather than unsigned.  */
14332 
14333 bool
14334 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
14335 {
14336   rtx elt;
14337 
14338   return (const_vec_duplicate_p (x, &elt)
14339 	  && CONST_INT_P (elt)
14340 	  && (signed_p
14341 	      ? IN_RANGE (INTVAL (elt), -16, 15)
14342 	      : IN_RANGE (INTVAL (elt), 0, 127)));
14343 }
14344 
14345 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
14346    instruction.  Negate X first if NEGATE_P is true.  */
14347 
14348 bool
14349 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
14350 {
14351   rtx elt;
14352   REAL_VALUE_TYPE r;
14353 
14354   if (!const_vec_duplicate_p (x, &elt)
14355       || GET_CODE (elt) != CONST_DOUBLE)
14356     return false;
14357 
14358   r = *CONST_DOUBLE_REAL_VALUE (elt);
14359 
14360   if (negate_p)
14361     r = real_value_negate (&r);
14362 
14363   if (real_equal (&r, &dconst1))
14364     return true;
14365   if (real_equal (&r, &dconsthalf))
14366     return true;
14367   return false;
14368 }
14369 
14370 /* Return true if X is a valid immediate operand for an SVE FMUL
14371    instruction.  */
14372 
14373 bool
14374 aarch64_sve_float_mul_immediate_p (rtx x)
14375 {
14376   rtx elt;
14377 
14378   /* GCC will never generate a multiply with an immediate of 2, so there is no
14379      point testing for it (even though it is a valid constant).  */
14380   return (const_vec_duplicate_p (x, &elt)
14381 	  && GET_CODE (elt) == CONST_DOUBLE
14382 	  && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
14383 }
14384 
14385 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
14386    for the Advanced SIMD operation described by WHICH and INSN.  If INFO
14387    is nonnull, use it to describe valid immediates.  */
14388 static bool
14389 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
14390 				    simd_immediate_info *info,
14391 				    enum simd_immediate_check which,
14392 				    simd_immediate_info::insn_type insn)
14393 {
14394   /* Try a 4-byte immediate with LSL.  */
14395   for (unsigned int shift = 0; shift < 32; shift += 8)
14396     if ((val32 & (0xff << shift)) == val32)
14397       {
14398 	if (info)
14399 	  *info = simd_immediate_info (SImode, val32 >> shift, insn,
14400 				       simd_immediate_info::LSL, shift);
14401 	return true;
14402       }
14403 
14404   /* Try a 2-byte immediate with LSL.  */
14405   unsigned int imm16 = val32 & 0xffff;
14406   if (imm16 == (val32 >> 16))
14407     for (unsigned int shift = 0; shift < 16; shift += 8)
14408       if ((imm16 & (0xff << shift)) == imm16)
14409 	{
14410 	  if (info)
14411 	    *info = simd_immediate_info (HImode, imm16 >> shift, insn,
14412 					 simd_immediate_info::LSL, shift);
14413 	  return true;
14414 	}
14415 
14416   /* Try a 4-byte immediate with MSL, except for cases that MVN
14417      can handle.  */
14418   if (which == AARCH64_CHECK_MOV)
14419     for (unsigned int shift = 8; shift < 24; shift += 8)
14420       {
14421 	unsigned int low = (1 << shift) - 1;
14422 	if (((val32 & (0xff << shift)) | low) == val32)
14423 	  {
14424 	    if (info)
14425 	      *info = simd_immediate_info (SImode, val32 >> shift, insn,
14426 					   simd_immediate_info::MSL, shift);
14427 	    return true;
14428 	  }
14429       }
14430 
14431   return false;
14432 }
14433 
14434 /* Return true if replicating VAL64 is a valid immediate for the
14435    Advanced SIMD operation described by WHICH.  If INFO is nonnull,
14436    use it to describe valid immediates.  */
14437 static bool
14438 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
14439 				 simd_immediate_info *info,
14440 				 enum simd_immediate_check which)
14441 {
14442   unsigned int val32 = val64 & 0xffffffff;
14443   unsigned int val16 = val64 & 0xffff;
14444   unsigned int val8 = val64 & 0xff;
14445 
14446   if (val32 == (val64 >> 32))
14447     {
14448       if ((which & AARCH64_CHECK_ORR) != 0
14449 	  && aarch64_advsimd_valid_immediate_hs (val32, info, which,
14450 						 simd_immediate_info::MOV))
14451 	return true;
14452 
14453       if ((which & AARCH64_CHECK_BIC) != 0
14454 	  && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
14455 						 simd_immediate_info::MVN))
14456 	return true;
14457 
14458       /* Try using a replicated byte.  */
14459       if (which == AARCH64_CHECK_MOV
14460 	  && val16 == (val32 >> 16)
14461 	  && val8 == (val16 >> 8))
14462 	{
14463 	  if (info)
14464 	    *info = simd_immediate_info (QImode, val8);
14465 	  return true;
14466 	}
14467     }
14468 
14469   /* Try using a bit-to-bytemask.  */
14470   if (which == AARCH64_CHECK_MOV)
14471     {
14472       unsigned int i;
14473       for (i = 0; i < 64; i += 8)
14474 	{
14475 	  unsigned char byte = (val64 >> i) & 0xff;
14476 	  if (byte != 0 && byte != 0xff)
14477 	    break;
14478 	}
14479       if (i == 64)
14480 	{
14481 	  if (info)
14482 	    *info = simd_immediate_info (DImode, val64);
14483 	  return true;
14484 	}
14485     }
14486   return false;
14487 }
14488 
14489 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
14490    instruction.  If INFO is nonnull, use it to describe valid immediates.  */
14491 
14492 static bool
14493 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
14494 			     simd_immediate_info *info)
14495 {
14496   scalar_int_mode mode = DImode;
14497   unsigned int val32 = val64 & 0xffffffff;
14498   if (val32 == (val64 >> 32))
14499     {
14500       mode = SImode;
14501       unsigned int val16 = val32 & 0xffff;
14502       if (val16 == (val32 >> 16))
14503 	{
14504 	  mode = HImode;
14505 	  unsigned int val8 = val16 & 0xff;
14506 	  if (val8 == (val16 >> 8))
14507 	    mode = QImode;
14508 	}
14509     }
14510   HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
14511   if (IN_RANGE (val, -0x80, 0x7f))
14512     {
14513       /* DUP with no shift.  */
14514       if (info)
14515 	*info = simd_immediate_info (mode, val);
14516       return true;
14517     }
14518   if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
14519     {
14520       /* DUP with LSL #8.  */
14521       if (info)
14522 	*info = simd_immediate_info (mode, val);
14523       return true;
14524     }
14525   if (aarch64_bitmask_imm (val64, mode))
14526     {
14527       /* DUPM.  */
14528       if (info)
14529 	*info = simd_immediate_info (mode, val);
14530       return true;
14531     }
14532   return false;
14533 }
14534 
14535 /* Return true if OP is a valid SIMD immediate for the operation
14536    described by WHICH.  If INFO is nonnull, use it to describe valid
14537    immediates.  */
14538 bool
14539 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
14540 			      enum simd_immediate_check which)
14541 {
14542   machine_mode mode = GET_MODE (op);
14543   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14544   if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
14545     return false;
14546 
14547   scalar_mode elt_mode = GET_MODE_INNER (mode);
14548   rtx base, step;
14549   unsigned int n_elts;
14550   if (GET_CODE (op) == CONST_VECTOR
14551       && CONST_VECTOR_DUPLICATE_P (op))
14552     n_elts = CONST_VECTOR_NPATTERNS (op);
14553   else if ((vec_flags & VEC_SVE_DATA)
14554 	   && const_vec_series_p (op, &base, &step))
14555     {
14556       gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
14557       if (!aarch64_sve_index_immediate_p (base)
14558 	  || !aarch64_sve_index_immediate_p (step))
14559 	return false;
14560 
14561       if (info)
14562 	*info = simd_immediate_info (elt_mode, base, step);
14563       return true;
14564     }
14565   else if (GET_CODE (op) == CONST_VECTOR
14566 	   && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
14567     /* N_ELTS set above.  */;
14568   else
14569     return false;
14570 
14571   /* Handle PFALSE and PTRUE.  */
14572   if (vec_flags & VEC_SVE_PRED)
14573     return (op == CONST0_RTX (mode)
14574 	    || op == CONSTM1_RTX (mode));
14575 
14576   scalar_float_mode elt_float_mode;
14577   if (n_elts == 1
14578       && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
14579     {
14580       rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
14581       if (aarch64_float_const_zero_rtx_p (elt)
14582 	  || aarch64_float_const_representable_p (elt))
14583 	{
14584 	  if (info)
14585 	    *info = simd_immediate_info (elt_float_mode, elt);
14586 	  return true;
14587 	}
14588     }
14589 
14590   unsigned int elt_size = GET_MODE_SIZE (elt_mode);
14591   if (elt_size > 8)
14592     return false;
14593 
14594   scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
14595 
14596   /* Expand the vector constant out into a byte vector, with the least
14597      significant byte of the register first.  */
14598   auto_vec<unsigned char, 16> bytes;
14599   bytes.reserve (n_elts * elt_size);
14600   for (unsigned int i = 0; i < n_elts; i++)
14601     {
14602       /* The vector is provided in gcc endian-neutral fashion.
14603 	 For aarch64_be Advanced SIMD, it must be laid out in the vector
14604 	 register in reverse order.  */
14605       bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
14606       rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
14607 
14608       if (elt_mode != elt_int_mode)
14609 	elt = gen_lowpart (elt_int_mode, elt);
14610 
14611       if (!CONST_INT_P (elt))
14612 	return false;
14613 
14614       unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
14615       for (unsigned int byte = 0; byte < elt_size; byte++)
14616 	{
14617 	  bytes.quick_push (elt_val & 0xff);
14618 	  elt_val >>= BITS_PER_UNIT;
14619 	}
14620     }
14621 
14622   /* The immediate must repeat every eight bytes.  */
14623   unsigned int nbytes = bytes.length ();
14624   for (unsigned i = 8; i < nbytes; ++i)
14625     if (bytes[i] != bytes[i - 8])
14626       return false;
14627 
14628   /* Get the repeating 8-byte value as an integer.  No endian correction
14629      is needed here because bytes is already in lsb-first order.  */
14630   unsigned HOST_WIDE_INT val64 = 0;
14631   for (unsigned int i = 0; i < 8; i++)
14632     val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
14633 	      << (i * BITS_PER_UNIT));
14634 
14635   if (vec_flags & VEC_SVE_DATA)
14636     return aarch64_sve_valid_immediate (val64, info);
14637   else
14638     return aarch64_advsimd_valid_immediate (val64, info, which);
14639 }
14640 
14641 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
14642    has a step in the range of INDEX.  Return the index expression if so,
14643    otherwise return null.  */
14644 rtx
14645 aarch64_check_zero_based_sve_index_immediate (rtx x)
14646 {
14647   rtx base, step;
14648   if (const_vec_series_p (x, &base, &step)
14649       && base == const0_rtx
14650       && aarch64_sve_index_immediate_p (step))
14651     return step;
14652   return NULL_RTX;
14653 }
14654 
14655 /* Check of immediate shift constants are within range.  */
14656 bool
14657 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
14658 {
14659   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
14660   if (left)
14661     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
14662   else
14663     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
14664 }
14665 
14666 /* Return the bitmask CONST_INT to select the bits required by a zero extract
14667    operation of width WIDTH at bit position POS.  */
14668 
14669 rtx
14670 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
14671 {
14672   gcc_assert (CONST_INT_P (width));
14673   gcc_assert (CONST_INT_P (pos));
14674 
14675   unsigned HOST_WIDE_INT mask
14676     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
14677   return GEN_INT (mask << UINTVAL (pos));
14678 }
14679 
14680 bool
14681 aarch64_mov_operand_p (rtx x, machine_mode mode)
14682 {
14683   if (GET_CODE (x) == HIGH
14684       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
14685     return true;
14686 
14687   if (CONST_INT_P (x))
14688     return true;
14689 
14690   if (VECTOR_MODE_P (GET_MODE (x)))
14691     return aarch64_simd_valid_immediate (x, NULL);
14692 
14693   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
14694     return true;
14695 
14696   if (aarch64_sve_cnt_immediate_p (x))
14697     return true;
14698 
14699   return aarch64_classify_symbolic_expression (x)
14700     == SYMBOL_TINY_ABSOLUTE;
14701 }
14702 
14703 /* Return a const_int vector of VAL.  */
14704 rtx
14705 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
14706 {
14707   rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
14708   return gen_const_vec_duplicate (mode, c);
14709 }
14710 
14711 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
14712 
14713 bool
14714 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
14715 {
14716   machine_mode vmode;
14717 
14718   vmode = aarch64_simd_container_mode (mode, 64);
14719   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
14720   return aarch64_simd_valid_immediate (op_v, NULL);
14721 }
14722 
14723 /* Construct and return a PARALLEL RTX vector with elements numbering the
14724    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
14725    the vector - from the perspective of the architecture.  This does not
14726    line up with GCC's perspective on lane numbers, so we end up with
14727    different masks depending on our target endian-ness.  The diagram
14728    below may help.  We must draw the distinction when building masks
14729    which select one half of the vector.  An instruction selecting
14730    architectural low-lanes for a big-endian target, must be described using
14731    a mask selecting GCC high-lanes.
14732 
14733                  Big-Endian             Little-Endian
14734 
14735 GCC             0   1   2   3           3   2   1   0
14736               | x | x | x | x |       | x | x | x | x |
14737 Architecture    3   2   1   0           3   2   1   0
14738 
14739 Low Mask:         { 2, 3 }                { 0, 1 }
14740 High Mask:        { 0, 1 }                { 2, 3 }
14741 
14742    MODE Is the mode of the vector and NUNITS is the number of units in it.  */
14743 
14744 rtx
14745 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
14746 {
14747   rtvec v = rtvec_alloc (nunits / 2);
14748   int high_base = nunits / 2;
14749   int low_base = 0;
14750   int base;
14751   rtx t1;
14752   int i;
14753 
14754   if (BYTES_BIG_ENDIAN)
14755     base = high ? low_base : high_base;
14756   else
14757     base = high ? high_base : low_base;
14758 
14759   for (i = 0; i < nunits / 2; i++)
14760     RTVEC_ELT (v, i) = GEN_INT (base + i);
14761 
14762   t1 = gen_rtx_PARALLEL (mode, v);
14763   return t1;
14764 }
14765 
14766 /* Check OP for validity as a PARALLEL RTX vector with elements
14767    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
14768    from the perspective of the architecture.  See the diagram above
14769    aarch64_simd_vect_par_cnst_half for more details.  */
14770 
14771 bool
14772 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
14773 				       bool high)
14774 {
14775   int nelts;
14776   if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
14777     return false;
14778 
14779   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
14780   HOST_WIDE_INT count_op = XVECLEN (op, 0);
14781   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
14782   int i = 0;
14783 
14784   if (count_op != count_ideal)
14785     return false;
14786 
14787   for (i = 0; i < count_ideal; i++)
14788     {
14789       rtx elt_op = XVECEXP (op, 0, i);
14790       rtx elt_ideal = XVECEXP (ideal, 0, i);
14791 
14792       if (!CONST_INT_P (elt_op)
14793 	  || INTVAL (elt_ideal) != INTVAL (elt_op))
14794 	return false;
14795     }
14796   return true;
14797 }
14798 
14799 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
14800    HIGH (exclusive).  */
14801 void
14802 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
14803 			  const_tree exp)
14804 {
14805   HOST_WIDE_INT lane;
14806   gcc_assert (CONST_INT_P (operand));
14807   lane = INTVAL (operand);
14808 
14809   if (lane < low || lane >= high)
14810   {
14811     if (exp)
14812       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
14813     else
14814       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
14815   }
14816 }
14817 
14818 /* Peform endian correction on lane number N, which indexes a vector
14819    of mode MODE, and return the result as an SImode rtx.  */
14820 
14821 rtx
14822 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
14823 {
14824   return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
14825 }
14826 
14827 /* Return TRUE if OP is a valid vector addressing mode.  */
14828 
14829 bool
14830 aarch64_simd_mem_operand_p (rtx op)
14831 {
14832   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
14833 			|| REG_P (XEXP (op, 0)));
14834 }
14835 
14836 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction.  */
14837 
14838 bool
14839 aarch64_sve_ld1r_operand_p (rtx op)
14840 {
14841   struct aarch64_address_info addr;
14842   scalar_mode mode;
14843 
14844   return (MEM_P (op)
14845 	  && is_a <scalar_mode> (GET_MODE (op), &mode)
14846 	  && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
14847 	  && addr.type == ADDRESS_REG_IMM
14848 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
14849 }
14850 
14851 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
14852    The conditions for STR are the same.  */
14853 bool
14854 aarch64_sve_ldr_operand_p (rtx op)
14855 {
14856   struct aarch64_address_info addr;
14857 
14858   return (MEM_P (op)
14859 	  && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
14860 				       false, ADDR_QUERY_ANY)
14861 	  && addr.type == ADDRESS_REG_IMM);
14862 }
14863 
14864 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
14865    We need to be able to access the individual pieces, so the range
14866    is different from LD[234] and ST[234].  */
14867 bool
14868 aarch64_sve_struct_memory_operand_p (rtx op)
14869 {
14870   if (!MEM_P (op))
14871     return false;
14872 
14873   machine_mode mode = GET_MODE (op);
14874   struct aarch64_address_info addr;
14875   if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
14876 				 ADDR_QUERY_ANY)
14877       || addr.type != ADDRESS_REG_IMM)
14878     return false;
14879 
14880   poly_int64 first = addr.const_offset;
14881   poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
14882   return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
14883 	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
14884 }
14885 
14886 /* Emit a register copy from operand to operand, taking care not to
14887    early-clobber source registers in the process.
14888 
14889    COUNT is the number of components into which the copy needs to be
14890    decomposed.  */
14891 void
14892 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
14893 				unsigned int count)
14894 {
14895   unsigned int i;
14896   int rdest = REGNO (operands[0]);
14897   int rsrc = REGNO (operands[1]);
14898 
14899   if (!reg_overlap_mentioned_p (operands[0], operands[1])
14900       || rdest < rsrc)
14901     for (i = 0; i < count; i++)
14902       emit_move_insn (gen_rtx_REG (mode, rdest + i),
14903 		      gen_rtx_REG (mode, rsrc + i));
14904   else
14905     for (i = 0; i < count; i++)
14906       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
14907 		      gen_rtx_REG (mode, rsrc + count - i - 1));
14908 }
14909 
14910 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
14911    one of VSTRUCT modes: OI, CI, or XI.  */
14912 int
14913 aarch64_simd_attr_length_rglist (machine_mode mode)
14914 {
14915   /* This is only used (and only meaningful) for Advanced SIMD, not SVE.  */
14916   return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
14917 }
14918 
14919 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
14920    alignment of a vector to 128 bits.  SVE predicates have an alignment of
14921    16 bits.  */
14922 static HOST_WIDE_INT
14923 aarch64_simd_vector_alignment (const_tree type)
14924 {
14925   if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
14926     /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
14927        be set for non-predicate vectors of booleans.  Modes are the most
14928        direct way we have of identifying real SVE predicate types.  */
14929     return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
14930   return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
14931 }
14932 
14933 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
14934 static poly_uint64
14935 aarch64_vectorize_preferred_vector_alignment (const_tree type)
14936 {
14937   if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
14938     {
14939       /* If the length of the vector is fixed, try to align to that length,
14940 	 otherwise don't try to align at all.  */
14941       HOST_WIDE_INT result;
14942       if (!BITS_PER_SVE_VECTOR.is_constant (&result))
14943 	result = TYPE_ALIGN (TREE_TYPE (type));
14944       return result;
14945     }
14946   return TYPE_ALIGN (type);
14947 }
14948 
14949 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
14950 static bool
14951 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
14952 {
14953   if (is_packed)
14954     return false;
14955 
14956   /* For fixed-length vectors, check that the vectorizer will aim for
14957      full-vector alignment.  This isn't true for generic GCC vectors
14958      that are wider than the ABI maximum of 128 bits.  */
14959   poly_uint64 preferred_alignment =
14960     aarch64_vectorize_preferred_vector_alignment (type);
14961   if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
14962       && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
14963 		   preferred_alignment))
14964     return false;
14965 
14966   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
14967   return true;
14968 }
14969 
14970 /* Return true if the vector misalignment factor is supported by the
14971    target.  */
14972 static bool
14973 aarch64_builtin_support_vector_misalignment (machine_mode mode,
14974 					     const_tree type, int misalignment,
14975 					     bool is_packed)
14976 {
14977   if (TARGET_SIMD && STRICT_ALIGNMENT)
14978     {
14979       /* Return if movmisalign pattern is not supported for this mode.  */
14980       if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
14981         return false;
14982 
14983       /* Misalignment factor is unknown at compile time.  */
14984       if (misalignment == -1)
14985 	return false;
14986     }
14987   return default_builtin_support_vector_misalignment (mode, type, misalignment,
14988 						      is_packed);
14989 }
14990 
14991 /* If VALS is a vector constant that can be loaded into a register
14992    using DUP, generate instructions to do so and return an RTX to
14993    assign to the register.  Otherwise return NULL_RTX.  */
14994 static rtx
14995 aarch64_simd_dup_constant (rtx vals)
14996 {
14997   machine_mode mode = GET_MODE (vals);
14998   machine_mode inner_mode = GET_MODE_INNER (mode);
14999   rtx x;
15000 
15001   if (!const_vec_duplicate_p (vals, &x))
15002     return NULL_RTX;
15003 
15004   /* We can load this constant by using DUP and a constant in a
15005      single ARM register.  This will be cheaper than a vector
15006      load.  */
15007   x = copy_to_mode_reg (inner_mode, x);
15008   return gen_vec_duplicate (mode, x);
15009 }
15010 
15011 
15012 /* Generate code to load VALS, which is a PARALLEL containing only
15013    constants (for vec_init) or CONST_VECTOR, efficiently into a
15014    register.  Returns an RTX to copy into the register, or NULL_RTX
15015    for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
15016 static rtx
15017 aarch64_simd_make_constant (rtx vals)
15018 {
15019   machine_mode mode = GET_MODE (vals);
15020   rtx const_dup;
15021   rtx const_vec = NULL_RTX;
15022   int n_const = 0;
15023   int i;
15024 
15025   if (GET_CODE (vals) == CONST_VECTOR)
15026     const_vec = vals;
15027   else if (GET_CODE (vals) == PARALLEL)
15028     {
15029       /* A CONST_VECTOR must contain only CONST_INTs and
15030 	 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
15031 	 Only store valid constants in a CONST_VECTOR.  */
15032       int n_elts = XVECLEN (vals, 0);
15033       for (i = 0; i < n_elts; ++i)
15034 	{
15035 	  rtx x = XVECEXP (vals, 0, i);
15036 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15037 	    n_const++;
15038 	}
15039       if (n_const == n_elts)
15040 	const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
15041     }
15042   else
15043     gcc_unreachable ();
15044 
15045   if (const_vec != NULL_RTX
15046       && aarch64_simd_valid_immediate (const_vec, NULL))
15047     /* Load using MOVI/MVNI.  */
15048     return const_vec;
15049   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
15050     /* Loaded using DUP.  */
15051     return const_dup;
15052   else if (const_vec != NULL_RTX)
15053     /* Load from constant pool. We cannot take advantage of single-cycle
15054        LD1 because we need a PC-relative addressing mode.  */
15055     return const_vec;
15056   else
15057     /* A PARALLEL containing something not valid inside CONST_VECTOR.
15058        We cannot construct an initializer.  */
15059     return NULL_RTX;
15060 }
15061 
15062 /* Expand a vector initialisation sequence, such that TARGET is
15063    initialised to contain VALS.  */
15064 
15065 void
15066 aarch64_expand_vector_init (rtx target, rtx vals)
15067 {
15068   machine_mode mode = GET_MODE (target);
15069   scalar_mode inner_mode = GET_MODE_INNER (mode);
15070   /* The number of vector elements.  */
15071   int n_elts = XVECLEN (vals, 0);
15072   /* The number of vector elements which are not constant.  */
15073   int n_var = 0;
15074   rtx any_const = NULL_RTX;
15075   /* The first element of vals.  */
15076   rtx v0 = XVECEXP (vals, 0, 0);
15077   bool all_same = true;
15078 
15079   /* Count the number of variable elements to initialise.  */
15080   for (int i = 0; i < n_elts; ++i)
15081     {
15082       rtx x = XVECEXP (vals, 0, i);
15083       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
15084 	++n_var;
15085       else
15086 	any_const = x;
15087 
15088       all_same &= rtx_equal_p (x, v0);
15089     }
15090 
15091   /* No variable elements, hand off to aarch64_simd_make_constant which knows
15092      how best to handle this.  */
15093   if (n_var == 0)
15094     {
15095       rtx constant = aarch64_simd_make_constant (vals);
15096       if (constant != NULL_RTX)
15097 	{
15098 	  emit_move_insn (target, constant);
15099 	  return;
15100 	}
15101     }
15102 
15103   /* Splat a single non-constant element if we can.  */
15104   if (all_same)
15105     {
15106       rtx x = copy_to_mode_reg (inner_mode, v0);
15107       aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15108       return;
15109     }
15110 
15111   enum insn_code icode = optab_handler (vec_set_optab, mode);
15112   gcc_assert (icode != CODE_FOR_nothing);
15113 
15114   /* If there are only variable elements, try to optimize
15115      the insertion using dup for the most common element
15116      followed by insertions.  */
15117 
15118   /* The algorithm will fill matches[*][0] with the earliest matching element,
15119      and matches[X][1] with the count of duplicate elements (if X is the
15120      earliest element which has duplicates).  */
15121 
15122   if (n_var == n_elts && n_elts <= 16)
15123     {
15124       int matches[16][2] = {0};
15125       for (int i = 0; i < n_elts; i++)
15126 	{
15127 	  for (int j = 0; j <= i; j++)
15128 	    {
15129 	      if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
15130 		{
15131 		  matches[i][0] = j;
15132 		  matches[j][1]++;
15133 		  break;
15134 		}
15135 	    }
15136 	}
15137       int maxelement = 0;
15138       int maxv = 0;
15139       for (int i = 0; i < n_elts; i++)
15140 	if (matches[i][1] > maxv)
15141 	  {
15142 	    maxelement = i;
15143 	    maxv = matches[i][1];
15144 	  }
15145 
15146       /* Create a duplicate of the most common element, unless all elements
15147 	 are equally useless to us, in which case just immediately set the
15148 	 vector register using the first element.  */
15149 
15150       if (maxv == 1)
15151 	{
15152 	  /* For vectors of two 64-bit elements, we can do even better.  */
15153 	  if (n_elts == 2
15154 	      && (inner_mode == E_DImode
15155 		  || inner_mode == E_DFmode))
15156 
15157 	    {
15158 	      rtx x0 = XVECEXP (vals, 0, 0);
15159 	      rtx x1 = XVECEXP (vals, 0, 1);
15160 	      /* Combine can pick up this case, but handling it directly
15161 		 here leaves clearer RTL.
15162 
15163 		 This is load_pair_lanes<mode>, and also gives us a clean-up
15164 		 for store_pair_lanes<mode>.  */
15165 	      if (memory_operand (x0, inner_mode)
15166 		  && memory_operand (x1, inner_mode)
15167 		  && !STRICT_ALIGNMENT
15168 		  && rtx_equal_p (XEXP (x1, 0),
15169 				  plus_constant (Pmode,
15170 						 XEXP (x0, 0),
15171 						 GET_MODE_SIZE (inner_mode))))
15172 		{
15173 		  rtx t;
15174 		  if (inner_mode == DFmode)
15175 		    t = gen_load_pair_lanesdf (target, x0, x1);
15176 		  else
15177 		    t = gen_load_pair_lanesdi (target, x0, x1);
15178 		  emit_insn (t);
15179 		  return;
15180 		}
15181 	    }
15182 	  /* The subreg-move sequence below will move into lane zero of the
15183 	     vector register.  For big-endian we want that position to hold
15184 	     the last element of VALS.  */
15185 	  maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
15186 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15187 	  aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
15188 	}
15189       else
15190 	{
15191 	  rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
15192 	  aarch64_emit_move (target, gen_vec_duplicate (mode, x));
15193 	}
15194 
15195       /* Insert the rest.  */
15196       for (int i = 0; i < n_elts; i++)
15197 	{
15198 	  rtx x = XVECEXP (vals, 0, i);
15199 	  if (matches[i][0] == maxelement)
15200 	    continue;
15201 	  x = copy_to_mode_reg (inner_mode, x);
15202 	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15203 	}
15204       return;
15205     }
15206 
15207   /* Initialise a vector which is part-variable.  We want to first try
15208      to build those lanes which are constant in the most efficient way we
15209      can.  */
15210   if (n_var != n_elts)
15211     {
15212       rtx copy = copy_rtx (vals);
15213 
15214       /* Load constant part of vector.  We really don't care what goes into the
15215 	 parts we will overwrite, but we're more likely to be able to load the
15216 	 constant efficiently if it has fewer, larger, repeating parts
15217 	 (see aarch64_simd_valid_immediate).  */
15218       for (int i = 0; i < n_elts; i++)
15219 	{
15220 	  rtx x = XVECEXP (vals, 0, i);
15221 	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15222 	    continue;
15223 	  rtx subst = any_const;
15224 	  for (int bit = n_elts / 2; bit > 0; bit /= 2)
15225 	    {
15226 	      /* Look in the copied vector, as more elements are const.  */
15227 	      rtx test = XVECEXP (copy, 0, i ^ bit);
15228 	      if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
15229 		{
15230 		  subst = test;
15231 		  break;
15232 		}
15233 	    }
15234 	  XVECEXP (copy, 0, i) = subst;
15235 	}
15236       aarch64_expand_vector_init (target, copy);
15237     }
15238 
15239   /* Insert the variable lanes directly.  */
15240   for (int i = 0; i < n_elts; i++)
15241     {
15242       rtx x = XVECEXP (vals, 0, i);
15243       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
15244 	continue;
15245       x = copy_to_mode_reg (inner_mode, x);
15246       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
15247     }
15248 }
15249 
15250 static unsigned HOST_WIDE_INT
15251 aarch64_shift_truncation_mask (machine_mode mode)
15252 {
15253   if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
15254     return 0;
15255   return GET_MODE_UNIT_BITSIZE (mode) - 1;
15256 }
15257 
15258 /* Select a format to encode pointers in exception handling data.  */
15259 int
15260 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
15261 {
15262    int type;
15263    switch (aarch64_cmodel)
15264      {
15265      case AARCH64_CMODEL_TINY:
15266      case AARCH64_CMODEL_TINY_PIC:
15267      case AARCH64_CMODEL_SMALL:
15268      case AARCH64_CMODEL_SMALL_PIC:
15269      case AARCH64_CMODEL_SMALL_SPIC:
15270        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
15271 	  for everything.  */
15272        type = DW_EH_PE_sdata4;
15273        break;
15274      default:
15275        /* No assumptions here.  8-byte relocs required.  */
15276        type = DW_EH_PE_sdata8;
15277        break;
15278      }
15279    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
15280 }
15281 
15282 /* Output .variant_pcs for aarch64_vector_pcs function symbols.  */
15283 
15284 static void
15285 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
15286 {
15287   if (aarch64_simd_decl_p (decl))
15288     {
15289       fprintf (stream, "\t.variant_pcs\t");
15290       assemble_name (stream, name);
15291       fprintf (stream, "\n");
15292     }
15293 }
15294 
15295 /* The last .arch and .tune assembly strings that we printed.  */
15296 static std::string aarch64_last_printed_arch_string;
15297 static std::string aarch64_last_printed_tune_string;
15298 
15299 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
15300    by the function fndecl.  */
15301 
15302 void
15303 aarch64_declare_function_name (FILE *stream, const char* name,
15304 				tree fndecl)
15305 {
15306   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15307 
15308   struct cl_target_option *targ_options;
15309   if (target_parts)
15310     targ_options = TREE_TARGET_OPTION (target_parts);
15311   else
15312     targ_options = TREE_TARGET_OPTION (target_option_current_node);
15313   gcc_assert (targ_options);
15314 
15315   const struct processor *this_arch
15316     = aarch64_get_arch (targ_options->x_explicit_arch);
15317 
15318   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
15319   std::string extension
15320     = aarch64_get_extension_string_for_isa_flags (isa_flags,
15321 						  this_arch->flags);
15322   /* Only update the assembler .arch string if it is distinct from the last
15323      such string we printed.  */
15324   std::string to_print = this_arch->name + extension;
15325   if (to_print != aarch64_last_printed_arch_string)
15326     {
15327       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
15328       aarch64_last_printed_arch_string = to_print;
15329     }
15330 
15331   /* Print the cpu name we're tuning for in the comments, might be
15332      useful to readers of the generated asm.  Do it only when it changes
15333      from function to function and verbose assembly is requested.  */
15334   const struct processor *this_tune
15335     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
15336 
15337   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
15338     {
15339       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
15340 		   this_tune->name);
15341       aarch64_last_printed_tune_string = this_tune->name;
15342     }
15343 
15344   aarch64_asm_output_variant_pcs (stream, fndecl, name);
15345 
15346   /* Don't forget the type directive for ELF.  */
15347   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
15348   ASM_OUTPUT_LABEL (stream, name);
15349 
15350   cfun->machine->label_is_assembled = true;
15351 }
15352 
15353 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY.  Check if the patch area is after
15354    the function label and emit a BTI if necessary.  */
15355 
15356 void
15357 aarch64_print_patchable_function_entry (FILE *file,
15358 					unsigned HOST_WIDE_INT patch_area_size,
15359 					bool record_p)
15360 {
15361   if (cfun->machine->label_is_assembled
15362       && aarch64_bti_enabled ()
15363       && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
15364     {
15365       /* Remove the BTI that follows the patch area and insert a new BTI
15366 	 before the patch area right after the function label.  */
15367       rtx_insn *insn = next_real_nondebug_insn (get_insns ());
15368       if (insn
15369 	  && INSN_P (insn)
15370 	  && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
15371 	  && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
15372 	delete_insn (insn);
15373       asm_fprintf (file, "\thint\t34 // bti c\n");
15374     }
15375 
15376   default_print_patchable_function_entry (file, patch_area_size, record_p);
15377 }
15378 
15379 /* Implement ASM_OUTPUT_DEF_FROM_DECLS.  Output .variant_pcs for aliases.  */
15380 
15381 void
15382 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
15383 {
15384   const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
15385   const char *value = IDENTIFIER_POINTER (target);
15386   aarch64_asm_output_variant_pcs (stream, decl, name);
15387   ASM_OUTPUT_DEF (stream, name, value);
15388 }
15389 
15390 /* Implement ASM_OUTPUT_EXTERNAL.  Output .variant_pcs for undefined
15391    function symbol references.  */
15392 
15393 void
15394 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
15395 {
15396   default_elf_asm_output_external (stream, decl, name);
15397   aarch64_asm_output_variant_pcs (stream, decl, name);
15398 }
15399 
15400 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
15401 
15402 static void
15403 aarch64_start_file (void)
15404 {
15405   struct cl_target_option *default_options
15406     = TREE_TARGET_OPTION (target_option_default_node);
15407 
15408   const struct processor *default_arch
15409     = aarch64_get_arch (default_options->x_explicit_arch);
15410   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
15411   std::string extension
15412     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
15413 						  default_arch->flags);
15414 
15415    aarch64_last_printed_arch_string = default_arch->name + extension;
15416    aarch64_last_printed_tune_string = "";
15417    asm_fprintf (asm_out_file, "\t.arch %s\n",
15418 		aarch64_last_printed_arch_string.c_str ());
15419 
15420    default_file_start ();
15421 }
15422 
15423 /* Emit load exclusive.  */
15424 
15425 static void
15426 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
15427 			     rtx mem, rtx model_rtx)
15428 {
15429   emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
15430 }
15431 
15432 /* Emit store exclusive.  */
15433 
15434 static void
15435 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
15436 			      rtx rval, rtx mem, rtx model_rtx)
15437 {
15438   emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
15439 }
15440 
15441 /* Mark the previous jump instruction as unlikely.  */
15442 
15443 static void
15444 aarch64_emit_unlikely_jump (rtx insn)
15445 {
15446   rtx_insn *jump = emit_jump_insn (insn);
15447   add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
15448 }
15449 
15450 /* Expand a compare and swap pattern.  */
15451 
15452 void
15453 aarch64_expand_compare_and_swap (rtx operands[])
15454 {
15455   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
15456   machine_mode mode, r_mode;
15457 
15458   bval = operands[0];
15459   rval = operands[1];
15460   mem = operands[2];
15461   oldval = operands[3];
15462   newval = operands[4];
15463   is_weak = operands[5];
15464   mod_s = operands[6];
15465   mod_f = operands[7];
15466   mode = GET_MODE (mem);
15467 
15468   /* Normally the succ memory model must be stronger than fail, but in the
15469      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
15470      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
15471   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
15472       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
15473     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
15474 
15475   r_mode = mode;
15476   if (mode == QImode || mode == HImode)
15477     {
15478       r_mode = SImode;
15479       rval = gen_reg_rtx (r_mode);
15480     }
15481 
15482   if (TARGET_LSE)
15483     {
15484       /* The CAS insn requires oldval and rval overlap, but we need to
15485 	 have a copy of oldval saved across the operation to tell if
15486 	 the operation is successful.  */
15487       if (reg_overlap_mentioned_p (rval, oldval))
15488         rval = copy_to_mode_reg (r_mode, oldval);
15489       else
15490 	emit_move_insn (rval, gen_lowpart (r_mode, oldval));
15491 
15492       emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
15493 						   newval, mod_s));
15494       cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15495     }
15496   else
15497     {
15498       /* The oldval predicate varies by mode.  Test it and force to reg.  */
15499       insn_code code = code_for_aarch64_compare_and_swap (mode);
15500       if (!insn_data[code].operand[2].predicate (oldval, mode))
15501 	oldval = force_reg (mode, oldval);
15502 
15503       emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
15504 				 is_weak, mod_s, mod_f));
15505       cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
15506     }
15507 
15508   if (r_mode != mode)
15509     rval = gen_lowpart (mode, rval);
15510   emit_move_insn (operands[1], rval);
15511 
15512   x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
15513   emit_insn (gen_rtx_SET (bval, x));
15514 }
15515 
15516 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
15517    sequence implementing an atomic operation.  */
15518 
15519 static void
15520 aarch64_emit_post_barrier (enum memmodel model)
15521 {
15522   const enum memmodel base_model = memmodel_base (model);
15523 
15524   if (is_mm_sync (model)
15525       && (base_model == MEMMODEL_ACQUIRE
15526 	  || base_model == MEMMODEL_ACQ_REL
15527 	  || base_model == MEMMODEL_SEQ_CST))
15528     {
15529       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
15530     }
15531 }
15532 
15533 /* Split a compare and swap pattern.  */
15534 
15535 void
15536 aarch64_split_compare_and_swap (rtx operands[])
15537 {
15538   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
15539   gcc_assert (epilogue_completed);
15540 
15541   rtx rval, mem, oldval, newval, scratch;
15542   machine_mode mode;
15543   bool is_weak;
15544   rtx_code_label *label1, *label2;
15545   rtx x, cond;
15546   enum memmodel model;
15547   rtx model_rtx;
15548 
15549   rval = operands[0];
15550   mem = operands[1];
15551   oldval = operands[2];
15552   newval = operands[3];
15553   is_weak = (operands[4] != const0_rtx);
15554   model_rtx = operands[5];
15555   scratch = operands[7];
15556   mode = GET_MODE (mem);
15557   model = memmodel_from_int (INTVAL (model_rtx));
15558 
15559   /* When OLDVAL is zero and we want the strong version we can emit a tighter
15560     loop:
15561     .label1:
15562 	LD[A]XR	rval, [mem]
15563 	CBNZ	rval, .label2
15564 	ST[L]XR	scratch, newval, [mem]
15565 	CBNZ	scratch, .label1
15566     .label2:
15567 	CMP	rval, 0.  */
15568   bool strong_zero_p = !is_weak && oldval == const0_rtx;
15569 
15570   label1 = NULL;
15571   if (!is_weak)
15572     {
15573       label1 = gen_label_rtx ();
15574       emit_label (label1);
15575     }
15576   label2 = gen_label_rtx ();
15577 
15578   /* The initial load can be relaxed for a __sync operation since a final
15579      barrier will be emitted to stop code hoisting.  */
15580   if (is_mm_sync (model))
15581     aarch64_emit_load_exclusive (mode, rval, mem,
15582 				 GEN_INT (MEMMODEL_RELAXED));
15583   else
15584     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
15585 
15586   if (strong_zero_p)
15587     {
15588       if (aarch64_track_speculation)
15589 	{
15590 	  /* Emit an explicit compare instruction, so that we can correctly
15591 	     track the condition codes.  */
15592 	  rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
15593 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15594 	}
15595       else
15596 	x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
15597 
15598       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15599 				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15600       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15601     }
15602   else
15603     {
15604       cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
15605       x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15606       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15607 				gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
15608       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15609     }
15610 
15611   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
15612 
15613   if (!is_weak)
15614     {
15615       if (aarch64_track_speculation)
15616 	{
15617 	  /* Emit an explicit compare instruction, so that we can correctly
15618 	     track the condition codes.  */
15619 	  rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
15620 	  x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15621 	}
15622       else
15623 	x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
15624 
15625       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15626 				gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
15627       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15628     }
15629   else
15630     {
15631       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15632       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
15633       emit_insn (gen_rtx_SET (cond, x));
15634     }
15635 
15636   emit_label (label2);
15637   /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
15638      to set the condition flags.  If this is not used it will be removed by
15639      later passes.  */
15640   if (strong_zero_p)
15641     {
15642       cond = gen_rtx_REG (CCmode, CC_REGNUM);
15643       x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
15644       emit_insn (gen_rtx_SET (cond, x));
15645     }
15646   /* Emit any final barrier needed for a __sync operation.  */
15647   if (is_mm_sync (model))
15648     aarch64_emit_post_barrier (model);
15649 }
15650 
15651 /* Split an atomic operation.  */
15652 
15653 void
15654 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
15655 			 rtx value, rtx model_rtx, rtx cond)
15656 {
15657   /* Split after prolog/epilog to avoid interactions with shrinkwrapping.  */
15658   gcc_assert (epilogue_completed);
15659 
15660   machine_mode mode = GET_MODE (mem);
15661   machine_mode wmode = (mode == DImode ? DImode : SImode);
15662   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
15663   const bool is_sync = is_mm_sync (model);
15664   rtx_code_label *label;
15665   rtx x;
15666 
15667   /* Split the atomic operation into a sequence.  */
15668   label = gen_label_rtx ();
15669   emit_label (label);
15670 
15671   if (new_out)
15672     new_out = gen_lowpart (wmode, new_out);
15673   if (old_out)
15674     old_out = gen_lowpart (wmode, old_out);
15675   else
15676     old_out = new_out;
15677   value = simplify_gen_subreg (wmode, value, mode, 0);
15678 
15679   /* The initial load can be relaxed for a __sync operation since a final
15680      barrier will be emitted to stop code hoisting.  */
15681  if (is_sync)
15682     aarch64_emit_load_exclusive (mode, old_out, mem,
15683 				 GEN_INT (MEMMODEL_RELAXED));
15684   else
15685     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
15686 
15687   switch (code)
15688     {
15689     case SET:
15690       new_out = value;
15691       break;
15692 
15693     case NOT:
15694       x = gen_rtx_AND (wmode, old_out, value);
15695       emit_insn (gen_rtx_SET (new_out, x));
15696       x = gen_rtx_NOT (wmode, new_out);
15697       emit_insn (gen_rtx_SET (new_out, x));
15698       break;
15699 
15700     case MINUS:
15701       if (CONST_INT_P (value))
15702 	{
15703 	  value = GEN_INT (-INTVAL (value));
15704 	  code = PLUS;
15705 	}
15706       /* Fall through.  */
15707 
15708     default:
15709       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
15710       emit_insn (gen_rtx_SET (new_out, x));
15711       break;
15712     }
15713 
15714   aarch64_emit_store_exclusive (mode, cond, mem,
15715 				gen_lowpart (mode, new_out), model_rtx);
15716 
15717   if (aarch64_track_speculation)
15718     {
15719       /* Emit an explicit compare instruction, so that we can correctly
15720 	 track the condition codes.  */
15721       rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
15722       x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
15723     }
15724   else
15725     x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
15726 
15727   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
15728 			    gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
15729   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
15730 
15731   /* Emit any final barrier needed for a __sync operation.  */
15732   if (is_sync)
15733     aarch64_emit_post_barrier (model);
15734 }
15735 
15736 static void
15737 aarch64_init_libfuncs (void)
15738 {
15739    /* Half-precision float operations.  The compiler handles all operations
15740      with NULL libfuncs by converting to SFmode.  */
15741 
15742   /* Conversions.  */
15743   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
15744   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
15745 
15746   /* Arithmetic.  */
15747   set_optab_libfunc (add_optab, HFmode, NULL);
15748   set_optab_libfunc (sdiv_optab, HFmode, NULL);
15749   set_optab_libfunc (smul_optab, HFmode, NULL);
15750   set_optab_libfunc (neg_optab, HFmode, NULL);
15751   set_optab_libfunc (sub_optab, HFmode, NULL);
15752 
15753   /* Comparisons.  */
15754   set_optab_libfunc (eq_optab, HFmode, NULL);
15755   set_optab_libfunc (ne_optab, HFmode, NULL);
15756   set_optab_libfunc (lt_optab, HFmode, NULL);
15757   set_optab_libfunc (le_optab, HFmode, NULL);
15758   set_optab_libfunc (ge_optab, HFmode, NULL);
15759   set_optab_libfunc (gt_optab, HFmode, NULL);
15760   set_optab_libfunc (unord_optab, HFmode, NULL);
15761 }
15762 
15763 /* Target hook for c_mode_for_suffix.  */
15764 static machine_mode
15765 aarch64_c_mode_for_suffix (char suffix)
15766 {
15767   if (suffix == 'q')
15768     return TFmode;
15769 
15770   return VOIDmode;
15771 }
15772 
15773 /* We can only represent floating point constants which will fit in
15774    "quarter-precision" values.  These values are characterised by
15775    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
15776    by:
15777 
15778    (-1)^s * (n/16) * 2^r
15779 
15780    Where:
15781      's' is the sign bit.
15782      'n' is an integer in the range 16 <= n <= 31.
15783      'r' is an integer in the range -3 <= r <= 4.  */
15784 
15785 /* Return true iff X can be represented by a quarter-precision
15786    floating point immediate operand X.  Note, we cannot represent 0.0.  */
15787 bool
15788 aarch64_float_const_representable_p (rtx x)
15789 {
15790   /* This represents our current view of how many bits
15791      make up the mantissa.  */
15792   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
15793   int exponent;
15794   unsigned HOST_WIDE_INT mantissa, mask;
15795   REAL_VALUE_TYPE r, m;
15796   bool fail;
15797 
15798   if (!CONST_DOUBLE_P (x))
15799     return false;
15800 
15801   if (GET_MODE (x) == VOIDmode
15802       || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
15803     return false;
15804 
15805   r = *CONST_DOUBLE_REAL_VALUE (x);
15806 
15807   /* We cannot represent infinities, NaNs or +/-zero.  We won't
15808      know if we have +zero until we analyse the mantissa, but we
15809      can reject the other invalid values.  */
15810   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
15811       || REAL_VALUE_MINUS_ZERO (r))
15812     return false;
15813 
15814   /* Extract exponent.  */
15815   r = real_value_abs (&r);
15816   exponent = REAL_EXP (&r);
15817 
15818   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
15819      highest (sign) bit, with a fixed binary point at bit point_pos.
15820      m1 holds the low part of the mantissa, m2 the high part.
15821      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
15822      bits for the mantissa, this can fail (low bits will be lost).  */
15823   real_ldexp (&m, &r, point_pos - exponent);
15824   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
15825 
15826   /* If the low part of the mantissa has bits set we cannot represent
15827      the value.  */
15828   if (w.ulow () != 0)
15829     return false;
15830   /* We have rejected the lower HOST_WIDE_INT, so update our
15831      understanding of how many bits lie in the mantissa and
15832      look only at the high HOST_WIDE_INT.  */
15833   mantissa = w.elt (1);
15834   point_pos -= HOST_BITS_PER_WIDE_INT;
15835 
15836   /* We can only represent values with a mantissa of the form 1.xxxx.  */
15837   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
15838   if ((mantissa & mask) != 0)
15839     return false;
15840 
15841   /* Having filtered unrepresentable values, we may now remove all
15842      but the highest 5 bits.  */
15843   mantissa >>= point_pos - 5;
15844 
15845   /* We cannot represent the value 0.0, so reject it.  This is handled
15846      elsewhere.  */
15847   if (mantissa == 0)
15848     return false;
15849 
15850   /* Then, as bit 4 is always set, we can mask it off, leaving
15851      the mantissa in the range [0, 15].  */
15852   mantissa &= ~(1 << 4);
15853   gcc_assert (mantissa <= 15);
15854 
15855   /* GCC internally does not use IEEE754-like encoding (where normalized
15856      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
15857      Our mantissa values are shifted 4 places to the left relative to
15858      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
15859      by 5 places to correct for GCC's representation.  */
15860   exponent = 5 - exponent;
15861 
15862   return (exponent >= 0 && exponent <= 7);
15863 }
15864 
15865 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
15866    immediate with a CONST_VECTOR of MODE and WIDTH.  WHICH selects whether to
15867    output MOVI/MVNI, ORR or BIC immediate.  */
15868 char*
15869 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
15870 				   enum simd_immediate_check which)
15871 {
15872   bool is_valid;
15873   static char templ[40];
15874   const char *mnemonic;
15875   const char *shift_op;
15876   unsigned int lane_count = 0;
15877   char element_char;
15878 
15879   struct simd_immediate_info info;
15880 
15881   /* This will return true to show const_vector is legal for use as either
15882      a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
15883      It will also update INFO to show how the immediate should be generated.
15884      WHICH selects whether to check for MOVI/MVNI, ORR or BIC.  */
15885   is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
15886   gcc_assert (is_valid);
15887 
15888   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15889   lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
15890 
15891   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15892     {
15893       gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15894       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15895 	 move immediate path.  */
15896       if (aarch64_float_const_zero_rtx_p (info.value))
15897         info.value = GEN_INT (0);
15898       else
15899 	{
15900 	  const unsigned int buf_size = 20;
15901 	  char float_buf[buf_size] = {'\0'};
15902 	  real_to_decimal_for_mode (float_buf,
15903 				    CONST_DOUBLE_REAL_VALUE (info.value),
15904 				    buf_size, buf_size, 1, info.elt_mode);
15905 
15906 	  if (lane_count == 1)
15907 	    snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15908 	  else
15909 	    snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15910 		      lane_count, element_char, float_buf);
15911 	  return templ;
15912 	}
15913     }
15914 
15915   gcc_assert (CONST_INT_P (info.value));
15916 
15917   if (which == AARCH64_CHECK_MOV)
15918     {
15919       mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15920       shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15921       if (lane_count == 1)
15922 	snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15923 		  mnemonic, UINTVAL (info.value));
15924       else if (info.shift)
15925 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15926 		  HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15927 		  element_char, UINTVAL (info.value), shift_op, info.shift);
15928       else
15929 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15930 		  HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15931 		  element_char, UINTVAL (info.value));
15932     }
15933   else
15934     {
15935       /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR.  */
15936       mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15937       if (info.shift)
15938 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15939 		  HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15940 		  element_char, UINTVAL (info.value), "lsl", info.shift);
15941       else
15942 	snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15943 		  HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15944 		  element_char, UINTVAL (info.value));
15945     }
15946   return templ;
15947 }
15948 
15949 char*
15950 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15951 {
15952 
15953   /* If a floating point number was passed and we desire to use it in an
15954      integer mode do the conversion to integer.  */
15955   if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15956     {
15957       unsigned HOST_WIDE_INT ival;
15958       if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15959 	  gcc_unreachable ();
15960       immediate = gen_int_mode (ival, mode);
15961     }
15962 
15963   machine_mode vmode;
15964   /* use a 64 bit mode for everything except for DI/DF mode, where we use
15965      a 128 bit vector mode.  */
15966   int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15967 
15968   vmode = aarch64_simd_container_mode (mode, width);
15969   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15970   return aarch64_output_simd_mov_immediate (v_op, width);
15971 }
15972 
15973 /* Return the output string to use for moving immediate CONST_VECTOR
15974    into an SVE register.  */
15975 
15976 char *
15977 aarch64_output_sve_mov_immediate (rtx const_vector)
15978 {
15979   static char templ[40];
15980   struct simd_immediate_info info;
15981   char element_char;
15982 
15983   bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15984   gcc_assert (is_valid);
15985 
15986   element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15987 
15988   if (info.step)
15989     {
15990       snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15991 		HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15992 		element_char, INTVAL (info.value), INTVAL (info.step));
15993       return templ;
15994     }
15995 
15996   if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15997     {
15998       if (aarch64_float_const_zero_rtx_p (info.value))
15999 	info.value = GEN_INT (0);
16000       else
16001 	{
16002 	  const int buf_size = 20;
16003 	  char float_buf[buf_size] = {};
16004 	  real_to_decimal_for_mode (float_buf,
16005 				    CONST_DOUBLE_REAL_VALUE (info.value),
16006 				    buf_size, buf_size, 1, info.elt_mode);
16007 
16008 	  snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
16009 		    element_char, float_buf);
16010 	  return templ;
16011 	}
16012     }
16013 
16014   snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
16015 	    element_char, INTVAL (info.value));
16016   return templ;
16017 }
16018 
16019 /* Return the asm format for a PTRUE instruction whose destination has
16020    mode MODE.  SUFFIX is the element size suffix.  */
16021 
16022 char *
16023 aarch64_output_ptrue (machine_mode mode, char suffix)
16024 {
16025   unsigned int nunits;
16026   static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
16027   if (GET_MODE_NUNITS (mode).is_constant (&nunits))
16028     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
16029   else
16030     snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
16031   return buf;
16032 }
16033 
16034 /* Split operands into moves from op[1] + op[2] into op[0].  */
16035 
16036 void
16037 aarch64_split_combinev16qi (rtx operands[3])
16038 {
16039   unsigned int dest = REGNO (operands[0]);
16040   unsigned int src1 = REGNO (operands[1]);
16041   unsigned int src2 = REGNO (operands[2]);
16042   machine_mode halfmode = GET_MODE (operands[1]);
16043   unsigned int halfregs = REG_NREGS (operands[1]);
16044   rtx destlo, desthi;
16045 
16046   gcc_assert (halfmode == V16QImode);
16047 
16048   if (src1 == dest && src2 == dest + halfregs)
16049     {
16050       /* No-op move.  Can't split to nothing; emit something.  */
16051       emit_note (NOTE_INSN_DELETED);
16052       return;
16053     }
16054 
16055   /* Preserve register attributes for variable tracking.  */
16056   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
16057   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
16058 			       GET_MODE_SIZE (halfmode));
16059 
16060   /* Special case of reversed high/low parts.  */
16061   if (reg_overlap_mentioned_p (operands[2], destlo)
16062       && reg_overlap_mentioned_p (operands[1], desthi))
16063     {
16064       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16065       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
16066       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
16067     }
16068   else if (!reg_overlap_mentioned_p (operands[2], destlo))
16069     {
16070       /* Try to avoid unnecessary moves if part of the result
16071 	 is in the right place already.  */
16072       if (src1 != dest)
16073 	emit_move_insn (destlo, operands[1]);
16074       if (src2 != dest + halfregs)
16075 	emit_move_insn (desthi, operands[2]);
16076     }
16077   else
16078     {
16079       if (src2 != dest + halfregs)
16080 	emit_move_insn (desthi, operands[2]);
16081       if (src1 != dest)
16082 	emit_move_insn (destlo, operands[1]);
16083     }
16084 }
16085 
16086 /* vec_perm support.  */
16087 
16088 struct expand_vec_perm_d
16089 {
16090   rtx target, op0, op1;
16091   vec_perm_indices perm;
16092   machine_mode vmode;
16093   unsigned int vec_flags;
16094   bool one_vector_p;
16095   bool testing_p;
16096 };
16097 
16098 /* Generate a variable permutation.  */
16099 
16100 static void
16101 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
16102 {
16103   machine_mode vmode = GET_MODE (target);
16104   bool one_vector_p = rtx_equal_p (op0, op1);
16105 
16106   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
16107   gcc_checking_assert (GET_MODE (op0) == vmode);
16108   gcc_checking_assert (GET_MODE (op1) == vmode);
16109   gcc_checking_assert (GET_MODE (sel) == vmode);
16110   gcc_checking_assert (TARGET_SIMD);
16111 
16112   if (one_vector_p)
16113     {
16114       if (vmode == V8QImode)
16115 	{
16116 	  /* Expand the argument to a V16QI mode by duplicating it.  */
16117 	  rtx pair = gen_reg_rtx (V16QImode);
16118 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
16119 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16120 	}
16121       else
16122 	{
16123 	  emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
16124 	}
16125     }
16126   else
16127     {
16128       rtx pair;
16129 
16130       if (vmode == V8QImode)
16131 	{
16132 	  pair = gen_reg_rtx (V16QImode);
16133 	  emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
16134 	  emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
16135 	}
16136       else
16137 	{
16138 	  pair = gen_reg_rtx (OImode);
16139 	  emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
16140 	  emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
16141 	}
16142     }
16143 }
16144 
16145 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
16146    NELT is the number of elements in the vector.  */
16147 
16148 void
16149 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
16150 			 unsigned int nelt)
16151 {
16152   machine_mode vmode = GET_MODE (target);
16153   bool one_vector_p = rtx_equal_p (op0, op1);
16154   rtx mask;
16155 
16156   /* The TBL instruction does not use a modulo index, so we must take care
16157      of that ourselves.  */
16158   mask = aarch64_simd_gen_const_vector_dup (vmode,
16159       one_vector_p ? nelt - 1 : 2 * nelt - 1);
16160   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
16161 
16162   /* For big-endian, we also need to reverse the index within the vector
16163      (but not which vector).  */
16164   if (BYTES_BIG_ENDIAN)
16165     {
16166       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
16167       if (!one_vector_p)
16168         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
16169       sel = expand_simple_binop (vmode, XOR, sel, mask,
16170 				 NULL, 0, OPTAB_LIB_WIDEN);
16171     }
16172   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
16173 }
16174 
16175 /* Generate (set TARGET (unspec [OP0 OP1] CODE)).  */
16176 
16177 static void
16178 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
16179 {
16180   emit_insn (gen_rtx_SET (target,
16181 			  gen_rtx_UNSPEC (GET_MODE (target),
16182 					  gen_rtvec (2, op0, op1), code)));
16183 }
16184 
16185 /* Expand an SVE vec_perm with the given operands.  */
16186 
16187 void
16188 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
16189 {
16190   machine_mode data_mode = GET_MODE (target);
16191   machine_mode sel_mode = GET_MODE (sel);
16192   /* Enforced by the pattern condition.  */
16193   int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
16194 
16195   /* Note: vec_perm indices are supposed to wrap when they go beyond the
16196      size of the two value vectors, i.e. the upper bits of the indices
16197      are effectively ignored.  SVE TBL instead produces 0 for any
16198      out-of-range indices, so we need to modulo all the vec_perm indices
16199      to ensure they are all in range.  */
16200   rtx sel_reg = force_reg (sel_mode, sel);
16201 
16202   /* Check if the sel only references the first values vector.  */
16203   if (GET_CODE (sel) == CONST_VECTOR
16204       && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
16205     {
16206       emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
16207       return;
16208     }
16209 
16210   /* Check if the two values vectors are the same.  */
16211   if (rtx_equal_p (op0, op1))
16212     {
16213       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
16214       rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16215 					 NULL, 0, OPTAB_DIRECT);
16216       emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
16217       return;
16218     }
16219 
16220   /* Run TBL on for each value vector and combine the results.  */
16221 
16222   rtx res0 = gen_reg_rtx (data_mode);
16223   rtx res1 = gen_reg_rtx (data_mode);
16224   rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
16225   if (GET_CODE (sel) != CONST_VECTOR
16226       || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
16227     {
16228       rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
16229 						       2 * nunits - 1);
16230       sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
16231 				     NULL, 0, OPTAB_DIRECT);
16232     }
16233   emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
16234   rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
16235 				     NULL, 0, OPTAB_DIRECT);
16236   emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
16237   if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
16238     emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
16239   else
16240     emit_unspec2 (target, UNSPEC_IORF, res0, res1);
16241 }
16242 
16243 /* Recognize patterns suitable for the TRN instructions.  */
16244 static bool
16245 aarch64_evpc_trn (struct expand_vec_perm_d *d)
16246 {
16247   HOST_WIDE_INT odd;
16248   poly_uint64 nelt = d->perm.length ();
16249   rtx out, in0, in1, x;
16250   machine_mode vmode = d->vmode;
16251 
16252   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16253     return false;
16254 
16255   /* Note that these are little-endian tests.
16256      We correct for big-endian later.  */
16257   if (!d->perm[0].is_constant (&odd)
16258       || (odd != 0 && odd != 1)
16259       || !d->perm.series_p (0, 2, odd, 2)
16260       || !d->perm.series_p (1, 2, nelt + odd, 2))
16261     return false;
16262 
16263   /* Success!  */
16264   if (d->testing_p)
16265     return true;
16266 
16267   in0 = d->op0;
16268   in1 = d->op1;
16269   /* We don't need a big-endian lane correction for SVE; see the comment
16270      at the head of aarch64-sve.md for details.  */
16271   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16272     {
16273       x = in0, in0 = in1, in1 = x;
16274       odd = !odd;
16275     }
16276   out = d->target;
16277 
16278   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16279 				      odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
16280   return true;
16281 }
16282 
16283 /* Recognize patterns suitable for the UZP instructions.  */
16284 static bool
16285 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
16286 {
16287   HOST_WIDE_INT odd;
16288   rtx out, in0, in1, x;
16289   machine_mode vmode = d->vmode;
16290 
16291   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16292     return false;
16293 
16294   /* Note that these are little-endian tests.
16295      We correct for big-endian later.  */
16296   if (!d->perm[0].is_constant (&odd)
16297       || (odd != 0 && odd != 1)
16298       || !d->perm.series_p (0, 1, odd, 2))
16299     return false;
16300 
16301   /* Success!  */
16302   if (d->testing_p)
16303     return true;
16304 
16305   in0 = d->op0;
16306   in1 = d->op1;
16307   /* We don't need a big-endian lane correction for SVE; see the comment
16308      at the head of aarch64-sve.md for details.  */
16309   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16310     {
16311       x = in0, in0 = in1, in1 = x;
16312       odd = !odd;
16313     }
16314   out = d->target;
16315 
16316   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16317 				      odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
16318   return true;
16319 }
16320 
16321 /* Recognize patterns suitable for the ZIP instructions.  */
16322 static bool
16323 aarch64_evpc_zip (struct expand_vec_perm_d *d)
16324 {
16325   unsigned int high;
16326   poly_uint64 nelt = d->perm.length ();
16327   rtx out, in0, in1, x;
16328   machine_mode vmode = d->vmode;
16329 
16330   if (GET_MODE_UNIT_SIZE (vmode) > 8)
16331     return false;
16332 
16333   /* Note that these are little-endian tests.
16334      We correct for big-endian later.  */
16335   poly_uint64 first = d->perm[0];
16336   if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
16337       || !d->perm.series_p (0, 2, first, 1)
16338       || !d->perm.series_p (1, 2, first + nelt, 1))
16339     return false;
16340   high = maybe_ne (first, 0U);
16341 
16342   /* Success!  */
16343   if (d->testing_p)
16344     return true;
16345 
16346   in0 = d->op0;
16347   in1 = d->op1;
16348   /* We don't need a big-endian lane correction for SVE; see the comment
16349      at the head of aarch64-sve.md for details.  */
16350   if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
16351     {
16352       x = in0, in0 = in1, in1 = x;
16353       high = !high;
16354     }
16355   out = d->target;
16356 
16357   emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
16358 				      high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
16359   return true;
16360 }
16361 
16362 /* Recognize patterns for the EXT insn.  */
16363 
16364 static bool
16365 aarch64_evpc_ext (struct expand_vec_perm_d *d)
16366 {
16367   HOST_WIDE_INT location;
16368   rtx offset;
16369 
16370   /* The first element always refers to the first vector.
16371      Check if the extracted indices are increasing by one.  */
16372   if (d->vec_flags == VEC_SVE_PRED
16373       || !d->perm[0].is_constant (&location)
16374       || !d->perm.series_p (0, 1, location, 1))
16375     return false;
16376 
16377   /* Success! */
16378   if (d->testing_p)
16379     return true;
16380 
16381   /* The case where (location == 0) is a no-op for both big- and little-endian,
16382      and is removed by the mid-end at optimization levels -O1 and higher.
16383 
16384      We don't need a big-endian lane correction for SVE; see the comment
16385      at the head of aarch64-sve.md for details.  */
16386   if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
16387     {
16388       /* After setup, we want the high elements of the first vector (stored
16389          at the LSB end of the register), and the low elements of the second
16390          vector (stored at the MSB end of the register). So swap.  */
16391       std::swap (d->op0, d->op1);
16392       /* location != 0 (above), so safe to assume (nelt - location) < nelt.
16393 	 to_constant () is safe since this is restricted to Advanced SIMD
16394 	 vectors.  */
16395       location = d->perm.length ().to_constant () - location;
16396     }
16397 
16398   offset = GEN_INT (location);
16399   emit_set_insn (d->target,
16400 		 gen_rtx_UNSPEC (d->vmode,
16401 				 gen_rtvec (3, d->op0, d->op1, offset),
16402 				 UNSPEC_EXT));
16403   return true;
16404 }
16405 
16406 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
16407    within each 64-bit, 32-bit or 16-bit granule.  */
16408 
16409 static bool
16410 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
16411 {
16412   HOST_WIDE_INT diff;
16413   unsigned int i, size, unspec;
16414   machine_mode pred_mode;
16415 
16416   if (d->vec_flags == VEC_SVE_PRED
16417       || !d->one_vector_p
16418       || !d->perm[0].is_constant (&diff))
16419     return false;
16420 
16421   size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
16422   if (size == 8)
16423     {
16424       unspec = UNSPEC_REV64;
16425       pred_mode = VNx2BImode;
16426     }
16427   else if (size == 4)
16428     {
16429       unspec = UNSPEC_REV32;
16430       pred_mode = VNx4BImode;
16431     }
16432   else if (size == 2)
16433     {
16434       unspec = UNSPEC_REV16;
16435       pred_mode = VNx8BImode;
16436     }
16437   else
16438     return false;
16439 
16440   unsigned int step = diff + 1;
16441   for (i = 0; i < step; ++i)
16442     if (!d->perm.series_p (i, step, diff - i, step))
16443       return false;
16444 
16445   /* Success! */
16446   if (d->testing_p)
16447     return true;
16448 
16449   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
16450   if (d->vec_flags == VEC_SVE_DATA)
16451     {
16452       rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16453       src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
16454 			    UNSPEC_MERGE_PTRUE);
16455     }
16456   emit_set_insn (d->target, src);
16457   return true;
16458 }
16459 
16460 /* Recognize patterns for the REV insn, which reverses elements within
16461    a full vector.  */
16462 
16463 static bool
16464 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
16465 {
16466   poly_uint64 nelt = d->perm.length ();
16467 
16468   if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
16469     return false;
16470 
16471   if (!d->perm.series_p (0, 1, nelt - 1, -1))
16472     return false;
16473 
16474   /* Success! */
16475   if (d->testing_p)
16476     return true;
16477 
16478   rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
16479   emit_set_insn (d->target, src);
16480   return true;
16481 }
16482 
16483 static bool
16484 aarch64_evpc_dup (struct expand_vec_perm_d *d)
16485 {
16486   rtx out = d->target;
16487   rtx in0;
16488   HOST_WIDE_INT elt;
16489   machine_mode vmode = d->vmode;
16490   rtx lane;
16491 
16492   if (d->vec_flags == VEC_SVE_PRED
16493       || d->perm.encoding ().encoded_nelts () != 1
16494       || !d->perm[0].is_constant (&elt))
16495     return false;
16496 
16497   if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
16498     return false;
16499 
16500   /* Success! */
16501   if (d->testing_p)
16502     return true;
16503 
16504   /* The generic preparation in aarch64_expand_vec_perm_const_1
16505      swaps the operand order and the permute indices if it finds
16506      d->perm[0] to be in the second operand.  Thus, we can always
16507      use d->op0 and need not do any extra arithmetic to get the
16508      correct lane number.  */
16509   in0 = d->op0;
16510   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
16511 
16512   rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
16513   rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
16514   emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
16515   return true;
16516 }
16517 
16518 static bool
16519 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
16520 {
16521   rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
16522   machine_mode vmode = d->vmode;
16523 
16524   /* Make sure that the indices are constant.  */
16525   unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
16526   for (unsigned int i = 0; i < encoded_nelts; ++i)
16527     if (!d->perm[i].is_constant ())
16528       return false;
16529 
16530   if (d->testing_p)
16531     return true;
16532 
16533   /* Generic code will try constant permutation twice.  Once with the
16534      original mode and again with the elements lowered to QImode.
16535      So wait and don't do the selector expansion ourselves.  */
16536   if (vmode != V8QImode && vmode != V16QImode)
16537     return false;
16538 
16539   /* to_constant is safe since this routine is specific to Advanced SIMD
16540      vectors.  */
16541   unsigned int nelt = d->perm.length ().to_constant ();
16542   for (unsigned int i = 0; i < nelt; ++i)
16543     /* If big-endian and two vectors we end up with a weird mixed-endian
16544        mode on NEON.  Reverse the index within each word but not the word
16545        itself.  to_constant is safe because we checked is_constant above.  */
16546     rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
16547 			? d->perm[i].to_constant () ^ (nelt - 1)
16548 			: d->perm[i].to_constant ());
16549 
16550   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
16551   sel = force_reg (vmode, sel);
16552 
16553   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
16554   return true;
16555 }
16556 
16557 /* Try to implement D using an SVE TBL instruction.  */
16558 
16559 static bool
16560 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
16561 {
16562   unsigned HOST_WIDE_INT nelt;
16563 
16564   /* Permuting two variable-length vectors could overflow the
16565      index range.  */
16566   if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
16567     return false;
16568 
16569   if (d->testing_p)
16570     return true;
16571 
16572   machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
16573   rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
16574   if (d->one_vector_p)
16575     emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
16576   else
16577     aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
16578   return true;
16579 }
16580 
16581 static bool
16582 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
16583 {
16584   /* The pattern matching functions above are written to look for a small
16585      number to begin the sequence (0, 1, N/2).  If we begin with an index
16586      from the second operand, we can swap the operands.  */
16587   poly_int64 nelt = d->perm.length ();
16588   if (known_ge (d->perm[0], nelt))
16589     {
16590       d->perm.rotate_inputs (1);
16591       std::swap (d->op0, d->op1);
16592     }
16593 
16594   if ((d->vec_flags == VEC_ADVSIMD
16595        || d->vec_flags == VEC_SVE_DATA
16596        || d->vec_flags == VEC_SVE_PRED)
16597       && known_gt (nelt, 1))
16598     {
16599       if (aarch64_evpc_rev_local (d))
16600 	return true;
16601       else if (aarch64_evpc_rev_global (d))
16602 	return true;
16603       else if (aarch64_evpc_ext (d))
16604 	return true;
16605       else if (aarch64_evpc_dup (d))
16606 	return true;
16607       else if (aarch64_evpc_zip (d))
16608 	return true;
16609       else if (aarch64_evpc_uzp (d))
16610 	return true;
16611       else if (aarch64_evpc_trn (d))
16612 	return true;
16613       if (d->vec_flags == VEC_SVE_DATA)
16614 	return aarch64_evpc_sve_tbl (d);
16615       else if (d->vec_flags == VEC_ADVSIMD)
16616 	return aarch64_evpc_tbl (d);
16617     }
16618   return false;
16619 }
16620 
16621 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.  */
16622 
16623 static bool
16624 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
16625 				  rtx op1, const vec_perm_indices &sel)
16626 {
16627   struct expand_vec_perm_d d;
16628 
16629   /* Check whether the mask can be applied to a single vector.  */
16630   if (sel.ninputs () == 1
16631       || (op0 && rtx_equal_p (op0, op1)))
16632     d.one_vector_p = true;
16633   else if (sel.all_from_input_p (0))
16634     {
16635       d.one_vector_p = true;
16636       op1 = op0;
16637     }
16638   else if (sel.all_from_input_p (1))
16639     {
16640       d.one_vector_p = true;
16641       op0 = op1;
16642     }
16643   else
16644     d.one_vector_p = false;
16645 
16646   d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
16647 		     sel.nelts_per_input ());
16648   d.vmode = vmode;
16649   d.vec_flags = aarch64_classify_vector_mode (d.vmode);
16650   d.target = target;
16651   d.op0 = op0;
16652   d.op1 = op1;
16653   d.testing_p = !target;
16654 
16655   if (!d.testing_p)
16656     return aarch64_expand_vec_perm_const_1 (&d);
16657 
16658   rtx_insn *last = get_last_insn ();
16659   bool ret = aarch64_expand_vec_perm_const_1 (&d);
16660   gcc_assert (last == get_last_insn ());
16661 
16662   return ret;
16663 }
16664 
16665 /* Generate a byte permute mask for a register of mode MODE,
16666    which has NUNITS units.  */
16667 
16668 rtx
16669 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
16670 {
16671   /* We have to reverse each vector because we dont have
16672      a permuted load that can reverse-load according to ABI rules.  */
16673   rtx mask;
16674   rtvec v = rtvec_alloc (16);
16675   unsigned int i, j;
16676   unsigned int usize = GET_MODE_UNIT_SIZE (mode);
16677 
16678   gcc_assert (BYTES_BIG_ENDIAN);
16679   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
16680 
16681   for (i = 0; i < nunits; i++)
16682     for (j = 0; j < usize; j++)
16683       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
16684   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
16685   return force_reg (V16QImode, mask);
16686 }
16687 
16688 /* Return true if X is a valid second operand for the SVE instruction
16689    that implements integer comparison OP_CODE.  */
16690 
16691 static bool
16692 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
16693 {
16694   if (register_operand (x, VOIDmode))
16695     return true;
16696 
16697   switch (op_code)
16698     {
16699     case LTU:
16700     case LEU:
16701     case GEU:
16702     case GTU:
16703       return aarch64_sve_cmp_immediate_p (x, false);
16704     case LT:
16705     case LE:
16706     case GE:
16707     case GT:
16708     case NE:
16709     case EQ:
16710       return aarch64_sve_cmp_immediate_p (x, true);
16711     default:
16712       gcc_unreachable ();
16713     }
16714 }
16715 
16716 /* Use predicated SVE instructions to implement the equivalent of:
16717 
16718      (set TARGET OP)
16719 
16720    given that PTRUE is an all-true predicate of the appropriate mode.  */
16721 
16722 static void
16723 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
16724 {
16725   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16726 			       gen_rtvec (2, ptrue, op),
16727 			       UNSPEC_MERGE_PTRUE);
16728   rtx_insn *insn = emit_set_insn (target, unspec);
16729   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16730 }
16731 
16732 /* Likewise, but also clobber the condition codes.  */
16733 
16734 static void
16735 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
16736 {
16737   rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
16738 			       gen_rtvec (2, ptrue, op),
16739 			       UNSPEC_MERGE_PTRUE);
16740   rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
16741   set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
16742 }
16743 
16744 /* Return the UNSPEC_COND_* code for comparison CODE.  */
16745 
16746 static unsigned int
16747 aarch64_unspec_cond_code (rtx_code code)
16748 {
16749   switch (code)
16750     {
16751     case NE:
16752       return UNSPEC_COND_NE;
16753     case EQ:
16754       return UNSPEC_COND_EQ;
16755     case LT:
16756       return UNSPEC_COND_LT;
16757     case GT:
16758       return UNSPEC_COND_GT;
16759     case LE:
16760       return UNSPEC_COND_LE;
16761     case GE:
16762       return UNSPEC_COND_GE;
16763     default:
16764       gcc_unreachable ();
16765     }
16766 }
16767 
16768 /* Emit:
16769 
16770       (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
16771 
16772    where <X> is the operation associated with comparison CODE.  This form
16773    of instruction is used when (and (CODE OP0 OP1) PRED) would have different
16774    semantics, such as when PRED might not be all-true and when comparing
16775    inactive lanes could have side effects.  */
16776 
16777 static void
16778 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
16779 				  rtx pred, rtx op0, rtx op1)
16780 {
16781   rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
16782 			       gen_rtvec (3, pred, op0, op1),
16783 			       aarch64_unspec_cond_code (code));
16784   emit_set_insn (target, unspec);
16785 }
16786 
16787 /* Expand an SVE integer comparison using the SVE equivalent of:
16788 
16789      (set TARGET (CODE OP0 OP1)).  */
16790 
16791 void
16792 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
16793 {
16794   machine_mode pred_mode = GET_MODE (target);
16795   machine_mode data_mode = GET_MODE (op0);
16796 
16797   if (!aarch64_sve_cmp_operand_p (code, op1))
16798     op1 = force_reg (data_mode, op1);
16799 
16800   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16801   rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16802   aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
16803 }
16804 
16805 /* Emit the SVE equivalent of:
16806 
16807       (set TMP1 (CODE1 OP0 OP1))
16808       (set TMP2 (CODE2 OP0 OP1))
16809       (set TARGET (ior:PRED_MODE TMP1 TMP2))
16810 
16811    PTRUE is an all-true predicate with the same mode as TARGET.  */
16812 
16813 static void
16814 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
16815 			   rtx ptrue, rtx op0, rtx op1)
16816 {
16817   machine_mode pred_mode = GET_MODE (ptrue);
16818   rtx tmp1 = gen_reg_rtx (pred_mode);
16819   aarch64_emit_sve_ptrue_op (tmp1, ptrue,
16820 			     gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
16821   rtx tmp2 = gen_reg_rtx (pred_mode);
16822   aarch64_emit_sve_ptrue_op (tmp2, ptrue,
16823 			     gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
16824   aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
16825 }
16826 
16827 /* Emit the SVE equivalent of:
16828 
16829       (set TMP (CODE OP0 OP1))
16830       (set TARGET (not TMP))
16831 
16832    PTRUE is an all-true predicate with the same mode as TARGET.  */
16833 
16834 static void
16835 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
16836 				rtx op0, rtx op1)
16837 {
16838   machine_mode pred_mode = GET_MODE (ptrue);
16839   rtx tmp = gen_reg_rtx (pred_mode);
16840   aarch64_emit_sve_ptrue_op (tmp, ptrue,
16841 			     gen_rtx_fmt_ee (code, pred_mode, op0, op1));
16842   aarch64_emit_unop (target, one_cmpl_optab, tmp);
16843 }
16844 
16845 /* Expand an SVE floating-point comparison using the SVE equivalent of:
16846 
16847      (set TARGET (CODE OP0 OP1))
16848 
16849    If CAN_INVERT_P is true, the caller can also handle inverted results;
16850    return true if the result is in fact inverted.  */
16851 
16852 bool
16853 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
16854 				  rtx op0, rtx op1, bool can_invert_p)
16855 {
16856   machine_mode pred_mode = GET_MODE (target);
16857   machine_mode data_mode = GET_MODE (op0);
16858 
16859   rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
16860   switch (code)
16861     {
16862     case UNORDERED:
16863       /* UNORDERED has no immediate form.  */
16864       op1 = force_reg (data_mode, op1);
16865       /* fall through */
16866     case LT:
16867     case LE:
16868     case GT:
16869     case GE:
16870     case EQ:
16871     case NE:
16872       {
16873 	/* There is native support for the comparison.  */
16874 	rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16875 	aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16876 	return false;
16877       }
16878 
16879     case LTGT:
16880       /* This is a trapping operation (LT or GT).  */
16881       aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
16882       return false;
16883 
16884     case UNEQ:
16885       if (!flag_trapping_math)
16886 	{
16887 	  /* This would trap for signaling NaNs.  */
16888 	  op1 = force_reg (data_mode, op1);
16889 	  aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
16890 	  return false;
16891 	}
16892       /* fall through */
16893     case UNLT:
16894     case UNLE:
16895     case UNGT:
16896     case UNGE:
16897       if (flag_trapping_math)
16898 	{
16899 	  /* Work out which elements are ordered.  */
16900 	  rtx ordered = gen_reg_rtx (pred_mode);
16901 	  op1 = force_reg (data_mode, op1);
16902 	  aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16903 
16904 	  /* Test the opposite condition for the ordered elements,
16905 	     then invert the result.  */
16906 	  if (code == UNEQ)
16907 	    code = NE;
16908 	  else
16909 	    code = reverse_condition_maybe_unordered (code);
16910 	  if (can_invert_p)
16911 	    {
16912 	      aarch64_emit_sve_predicated_cond (target, code,
16913 						ordered, op0, op1);
16914 	      return true;
16915 	    }
16916 	  rtx tmp = gen_reg_rtx (pred_mode);
16917 	  aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16918 	  aarch64_emit_unop (target, one_cmpl_optab, tmp);
16919 	  return false;
16920 	}
16921       break;
16922 
16923     case ORDERED:
16924       /* ORDERED has no immediate form.  */
16925       op1 = force_reg (data_mode, op1);
16926       break;
16927 
16928     default:
16929       gcc_unreachable ();
16930     }
16931 
16932   /* There is native support for the inverse comparison.  */
16933   code = reverse_condition_maybe_unordered (code);
16934   if (can_invert_p)
16935     {
16936       rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16937       aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16938       return true;
16939     }
16940   aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16941   return false;
16942 }
16943 
16944 /* Expand an SVE vcond pattern with operands OPS.  DATA_MODE is the mode
16945    of the data being selected and CMP_MODE is the mode of the values being
16946    compared.  */
16947 
16948 void
16949 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16950 			  rtx *ops)
16951 {
16952   machine_mode pred_mode
16953     = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16954 			     GET_MODE_SIZE (cmp_mode)).require ();
16955   rtx pred = gen_reg_rtx (pred_mode);
16956   if (FLOAT_MODE_P (cmp_mode))
16957     {
16958       if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16959 					    ops[4], ops[5], true))
16960 	std::swap (ops[1], ops[2]);
16961     }
16962   else
16963     aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16964 
16965   rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16966   emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16967 }
16968 
16969 /* Implement TARGET_MODES_TIEABLE_P.  In principle we should always return
16970    true.  However due to issues with register allocation it is preferable
16971    to avoid tieing integer scalar and FP scalar modes.  Executing integer
16972    operations in general registers is better than treating them as scalar
16973    vector operations.  This reduces latency and avoids redundant int<->FP
16974    moves.  So tie modes if they are either the same class, or vector modes
16975    with other vector modes, vector structs or any scalar mode.  */
16976 
16977 static bool
16978 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16979 {
16980   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16981     return true;
16982 
16983   /* We specifically want to allow elements of "structure" modes to
16984      be tieable to the structure.  This more general condition allows
16985      other rarer situations too.  The reason we don't extend this to
16986      predicate modes is that there are no predicate structure modes
16987      nor any specific instructions for extracting part of a predicate
16988      register.  */
16989   if (aarch64_vector_data_mode_p (mode1)
16990       && aarch64_vector_data_mode_p (mode2))
16991     return true;
16992 
16993   /* Also allow any scalar modes with vectors.  */
16994   if (aarch64_vector_mode_supported_p (mode1)
16995       || aarch64_vector_mode_supported_p (mode2))
16996     return true;
16997 
16998   return false;
16999 }
17000 
17001 /* Return a new RTX holding the result of moving POINTER forward by
17002    AMOUNT bytes.  */
17003 
17004 static rtx
17005 aarch64_move_pointer (rtx pointer, poly_int64 amount)
17006 {
17007   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
17008 
17009   return adjust_automodify_address (pointer, GET_MODE (pointer),
17010 				    next, amount);
17011 }
17012 
17013 /* Return a new RTX holding the result of moving POINTER forward by the
17014    size of the mode it points to.  */
17015 
17016 static rtx
17017 aarch64_progress_pointer (rtx pointer)
17018 {
17019   return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
17020 }
17021 
17022 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
17023    MODE bytes.  */
17024 
17025 static void
17026 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
17027 					      machine_mode mode)
17028 {
17029   rtx reg = gen_reg_rtx (mode);
17030 
17031   /* "Cast" the pointers to the correct mode.  */
17032   *src = adjust_address (*src, mode, 0);
17033   *dst = adjust_address (*dst, mode, 0);
17034   /* Emit the memcpy.  */
17035   emit_move_insn (reg, *src);
17036   emit_move_insn (*dst, reg);
17037   /* Move the pointers forward.  */
17038   *src = aarch64_progress_pointer (*src);
17039   *dst = aarch64_progress_pointer (*dst);
17040 }
17041 
17042 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
17043    we succeed, otherwise return false.  */
17044 
17045 bool
17046 aarch64_expand_movmem (rtx *operands)
17047 {
17048   int n, mode_bits;
17049   rtx dst = operands[0];
17050   rtx src = operands[1];
17051   rtx base;
17052   machine_mode cur_mode = BLKmode, next_mode;
17053   bool speed_p = !optimize_function_for_size_p (cfun);
17054 
17055   /* When optimizing for size, give a better estimate of the length of a
17056      memcpy call, but use the default otherwise.  Moves larger than 8 bytes
17057      will always require an even number of instructions to do now.  And each
17058      operation requires both a load+store, so devide the max number by 2.  */
17059   int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
17060 
17061   /* We can't do anything smart if the amount to copy is not constant.  */
17062   if (!CONST_INT_P (operands[2]))
17063     return false;
17064 
17065   n = INTVAL (operands[2]);
17066 
17067   /* Try to keep the number of instructions low.  For all cases we will do at
17068      most two moves for the residual amount, since we'll always overlap the
17069      remainder.  */
17070   if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
17071     return false;
17072 
17073   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
17074   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
17075 
17076   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
17077   src = adjust_automodify_address (src, VOIDmode, base, 0);
17078 
17079   /* Convert n to bits to make the rest of the code simpler.  */
17080   n = n * BITS_PER_UNIT;
17081 
17082   /* Maximum amount to copy in one go.  The AArch64 back-end has integer modes
17083      larger than TImode, but we should not use them for loads/stores here.  */
17084   const int copy_limit = GET_MODE_BITSIZE (TImode);
17085 
17086   while (n > 0)
17087     {
17088       /* Find the largest mode in which to do the copy in without over reading
17089 	 or writing.  */
17090       opt_scalar_int_mode mode_iter;
17091       FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
17092 	if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
17093 	  cur_mode = mode_iter.require ();
17094 
17095       gcc_assert (cur_mode != BLKmode);
17096 
17097       mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
17098       aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
17099 
17100       n -= mode_bits;
17101 
17102       /* Do certain trailing copies as overlapping if it's going to be
17103 	 cheaper.  i.e. less instructions to do so.  For instance doing a 15
17104 	 byte copy it's more efficient to do two overlapping 8 byte copies than
17105 	 8 + 6 + 1.  */
17106       if (n > 0 && n <= 8 * BITS_PER_UNIT)
17107 	{
17108 	  next_mode = smallest_mode_for_size (n, MODE_INT);
17109 	  int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
17110 	  src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
17111 	  dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
17112 	  n = n_bits;
17113 	}
17114     }
17115 
17116   return true;
17117 }
17118 
17119 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
17120    SImode stores.  Handle the case when the constant has identical
17121    bottom and top halves.  This is beneficial when the two stores can be
17122    merged into an STP and we avoid synthesising potentially expensive
17123    immediates twice.  Return true if such a split is possible.  */
17124 
17125 bool
17126 aarch64_split_dimode_const_store (rtx dst, rtx src)
17127 {
17128   rtx lo = gen_lowpart (SImode, src);
17129   rtx hi = gen_highpart_mode (SImode, DImode, src);
17130 
17131   bool size_p = optimize_function_for_size_p (cfun);
17132 
17133   if (!rtx_equal_p (lo, hi))
17134     return false;
17135 
17136   unsigned int orig_cost
17137     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
17138   unsigned int lo_cost
17139     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
17140 
17141   /* We want to transform:
17142      MOV	x1, 49370
17143      MOVK	x1, 0x140, lsl 16
17144      MOVK	x1, 0xc0da, lsl 32
17145      MOVK	x1, 0x140, lsl 48
17146      STR	x1, [x0]
17147    into:
17148      MOV	w1, 49370
17149      MOVK	w1, 0x140, lsl 16
17150      STP	w1, w1, [x0]
17151    So we want to perform this only when we save two instructions
17152    or more.  When optimizing for size, however, accept any code size
17153    savings we can.  */
17154   if (size_p && orig_cost <= lo_cost)
17155     return false;
17156 
17157   if (!size_p
17158       && (orig_cost <= lo_cost + 1))
17159     return false;
17160 
17161   rtx mem_lo = adjust_address (dst, SImode, 0);
17162   if (!aarch64_mem_pair_operand (mem_lo, SImode))
17163     return false;
17164 
17165   rtx tmp_reg = gen_reg_rtx (SImode);
17166   aarch64_expand_mov_immediate (tmp_reg, lo);
17167   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
17168   /* Don't emit an explicit store pair as this may not be always profitable.
17169      Let the sched-fusion logic decide whether to merge them.  */
17170   emit_move_insn (mem_lo, tmp_reg);
17171   emit_move_insn (mem_hi, tmp_reg);
17172 
17173   return true;
17174 }
17175 
17176 /* Generate RTL for a conditional branch with rtx comparison CODE in
17177    mode CC_MODE.  The destination of the unlikely conditional branch
17178    is LABEL_REF.  */
17179 
17180 void
17181 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
17182 			      rtx label_ref)
17183 {
17184   rtx x;
17185   x = gen_rtx_fmt_ee (code, VOIDmode,
17186 		      gen_rtx_REG (cc_mode, CC_REGNUM),
17187 		      const0_rtx);
17188 
17189   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
17190 			    gen_rtx_LABEL_REF (VOIDmode, label_ref),
17191 			    pc_rtx);
17192   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
17193 }
17194 
17195 /* Generate DImode scratch registers for 128-bit (TImode) addition.
17196 
17197    OP1 represents the TImode destination operand 1
17198    OP2 represents the TImode destination operand 2
17199    LOW_DEST represents the low half (DImode) of TImode operand 0
17200    LOW_IN1 represents the low half (DImode) of TImode operand 1
17201    LOW_IN2 represents the low half (DImode) of TImode operand 2
17202    HIGH_DEST represents the high half (DImode) of TImode operand 0
17203    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17204    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
17205 
17206 void
17207 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17208 			    rtx *low_in1, rtx *low_in2,
17209 			    rtx *high_dest, rtx *high_in1,
17210 			    rtx *high_in2)
17211 {
17212   *low_dest = gen_reg_rtx (DImode);
17213   *low_in1 = gen_lowpart (DImode, op1);
17214   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17215 				  subreg_lowpart_offset (DImode, TImode));
17216   *high_dest = gen_reg_rtx (DImode);
17217   *high_in1 = gen_highpart (DImode, op1);
17218   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17219 				   subreg_highpart_offset (DImode, TImode));
17220 }
17221 
17222 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
17223 
17224    This function differs from 'arch64_addti_scratch_regs' in that
17225    OP1 can be an immediate constant (zero). We must call
17226    subreg_highpart_offset with DImode and TImode arguments, otherwise
17227    VOIDmode will be used for the const_int which generates an internal
17228    error from subreg_size_highpart_offset which does not expect a size of zero.
17229 
17230    OP1 represents the TImode destination operand 1
17231    OP2 represents the TImode destination operand 2
17232    LOW_DEST represents the low half (DImode) of TImode operand 0
17233    LOW_IN1 represents the low half (DImode) of TImode operand 1
17234    LOW_IN2 represents the low half (DImode) of TImode operand 2
17235    HIGH_DEST represents the high half (DImode) of TImode operand 0
17236    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17237    HIGH_IN2 represents the high half (DImode) of TImode operand 2.  */
17238 
17239 
17240 void
17241 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
17242 			     rtx *low_in1, rtx *low_in2,
17243 			     rtx *high_dest, rtx *high_in1,
17244 			     rtx *high_in2)
17245 {
17246   *low_dest = gen_reg_rtx (DImode);
17247   *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
17248 				  subreg_lowpart_offset (DImode, TImode));
17249 
17250   *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
17251 				  subreg_lowpart_offset (DImode, TImode));
17252   *high_dest = gen_reg_rtx (DImode);
17253 
17254   *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
17255 				   subreg_highpart_offset (DImode, TImode));
17256   *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
17257 				   subreg_highpart_offset (DImode, TImode));
17258 }
17259 
17260 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
17261 
17262    OP0 represents the TImode destination operand 0
17263    LOW_DEST represents the low half (DImode) of TImode operand 0
17264    LOW_IN1 represents the low half (DImode) of TImode operand 1
17265    LOW_IN2 represents the low half (DImode) of TImode operand 2
17266    HIGH_DEST represents the high half (DImode) of TImode operand 0
17267    HIGH_IN1 represents the high half (DImode) of TImode operand 1
17268    HIGH_IN2 represents the high half (DImode) of TImode operand 2
17269    UNSIGNED_P is true if the operation is being performed on unsigned
17270    values.  */
17271 void
17272 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
17273 		       rtx low_in2, rtx high_dest, rtx high_in1,
17274 		       rtx high_in2, bool unsigned_p)
17275 {
17276   if (low_in2 == const0_rtx)
17277     {
17278       low_dest = low_in1;
17279       high_in2 = force_reg (DImode, high_in2);
17280       if (unsigned_p)
17281 	emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
17282       else
17283 	emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
17284     }
17285   else
17286     {
17287       if (aarch64_plus_immediate (low_in2, DImode))
17288 	emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
17289 					    GEN_INT (-INTVAL (low_in2))));
17290       else
17291 	{
17292 	  low_in2 = force_reg (DImode, low_in2);
17293 	  emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
17294 	}
17295       high_in2 = force_reg (DImode, high_in2);
17296 
17297       if (unsigned_p)
17298 	emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
17299       else
17300 	emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
17301     }
17302 
17303   emit_move_insn (gen_lowpart (DImode, op0), low_dest);
17304   emit_move_insn (gen_highpart (DImode, op0), high_dest);
17305 
17306 }
17307 
17308 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
17309 
17310 static unsigned HOST_WIDE_INT
17311 aarch64_asan_shadow_offset (void)
17312 {
17313   return (HOST_WIDE_INT_1 << 36);
17314 }
17315 
17316 static rtx
17317 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
17318 			int code, tree treeop0, tree treeop1)
17319 {
17320   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17321   rtx op0, op1;
17322   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17323   insn_code icode;
17324   struct expand_operand ops[4];
17325 
17326   start_sequence ();
17327   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17328 
17329   op_mode = GET_MODE (op0);
17330   if (op_mode == VOIDmode)
17331     op_mode = GET_MODE (op1);
17332 
17333   switch (op_mode)
17334     {
17335     case E_QImode:
17336     case E_HImode:
17337     case E_SImode:
17338       cmp_mode = SImode;
17339       icode = CODE_FOR_cmpsi;
17340       break;
17341 
17342     case E_DImode:
17343       cmp_mode = DImode;
17344       icode = CODE_FOR_cmpdi;
17345       break;
17346 
17347     case E_SFmode:
17348       cmp_mode = SFmode;
17349       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17350       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
17351       break;
17352 
17353     case E_DFmode:
17354       cmp_mode = DFmode;
17355       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
17356       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
17357       break;
17358 
17359     default:
17360       end_sequence ();
17361       return NULL_RTX;
17362     }
17363 
17364   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
17365   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
17366   if (!op0 || !op1)
17367     {
17368       end_sequence ();
17369       return NULL_RTX;
17370     }
17371   *prep_seq = get_insns ();
17372   end_sequence ();
17373 
17374   create_fixed_operand (&ops[0], op0);
17375   create_fixed_operand (&ops[1], op1);
17376 
17377   start_sequence ();
17378   if (!maybe_expand_insn (icode, 2, ops))
17379     {
17380       end_sequence ();
17381       return NULL_RTX;
17382     }
17383   *gen_seq = get_insns ();
17384   end_sequence ();
17385 
17386   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
17387 			 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
17388 }
17389 
17390 static rtx
17391 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
17392 		       int cmp_code, tree treeop0, tree treeop1, int bit_code)
17393 {
17394   rtx op0, op1, target;
17395   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
17396   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
17397   insn_code icode;
17398   struct expand_operand ops[6];
17399   int aarch64_cond;
17400 
17401   push_to_sequence (*prep_seq);
17402   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
17403 
17404   op_mode = GET_MODE (op0);
17405   if (op_mode == VOIDmode)
17406     op_mode = GET_MODE (op1);
17407 
17408   switch (op_mode)
17409     {
17410     case E_QImode:
17411     case E_HImode:
17412     case E_SImode:
17413       cmp_mode = SImode;
17414       icode = CODE_FOR_ccmpsi;
17415       break;
17416 
17417     case E_DImode:
17418       cmp_mode = DImode;
17419       icode = CODE_FOR_ccmpdi;
17420       break;
17421 
17422     case E_SFmode:
17423       cmp_mode = SFmode;
17424       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17425       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
17426       break;
17427 
17428     case E_DFmode:
17429       cmp_mode = DFmode;
17430       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
17431       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
17432       break;
17433 
17434     default:
17435       end_sequence ();
17436       return NULL_RTX;
17437     }
17438 
17439   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
17440   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
17441   if (!op0 || !op1)
17442     {
17443       end_sequence ();
17444       return NULL_RTX;
17445     }
17446   *prep_seq = get_insns ();
17447   end_sequence ();
17448 
17449   target = gen_rtx_REG (cc_mode, CC_REGNUM);
17450   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
17451 
17452   if (bit_code != AND)
17453     {
17454       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
17455 						GET_MODE (XEXP (prev, 0))),
17456 			     VOIDmode, XEXP (prev, 0), const0_rtx);
17457       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
17458     }
17459 
17460   create_fixed_operand (&ops[0], XEXP (prev, 0));
17461   create_fixed_operand (&ops[1], target);
17462   create_fixed_operand (&ops[2], op0);
17463   create_fixed_operand (&ops[3], op1);
17464   create_fixed_operand (&ops[4], prev);
17465   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
17466 
17467   push_to_sequence (*gen_seq);
17468   if (!maybe_expand_insn (icode, 6, ops))
17469     {
17470       end_sequence ();
17471       return NULL_RTX;
17472     }
17473 
17474   *gen_seq = get_insns ();
17475   end_sequence ();
17476 
17477   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
17478 }
17479 
17480 #undef TARGET_GEN_CCMP_FIRST
17481 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
17482 
17483 #undef TARGET_GEN_CCMP_NEXT
17484 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
17485 
17486 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
17487    instruction fusion of some sort.  */
17488 
17489 static bool
17490 aarch64_macro_fusion_p (void)
17491 {
17492   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
17493 }
17494 
17495 
17496 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
17497    should be kept together during scheduling.  */
17498 
17499 static bool
17500 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
17501 {
17502   rtx set_dest;
17503   rtx prev_set = single_set (prev);
17504   rtx curr_set = single_set (curr);
17505   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
17506   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
17507 
17508   if (!aarch64_macro_fusion_p ())
17509     return false;
17510 
17511   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
17512     {
17513       /* We are trying to match:
17514          prev (mov)  == (set (reg r0) (const_int imm16))
17515          curr (movk) == (set (zero_extract (reg r0)
17516                                            (const_int 16)
17517                                            (const_int 16))
17518                              (const_int imm16_1))  */
17519 
17520       set_dest = SET_DEST (curr_set);
17521 
17522       if (GET_CODE (set_dest) == ZERO_EXTRACT
17523           && CONST_INT_P (SET_SRC (curr_set))
17524           && CONST_INT_P (SET_SRC (prev_set))
17525           && CONST_INT_P (XEXP (set_dest, 2))
17526           && INTVAL (XEXP (set_dest, 2)) == 16
17527           && REG_P (XEXP (set_dest, 0))
17528           && REG_P (SET_DEST (prev_set))
17529           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
17530         {
17531           return true;
17532         }
17533     }
17534 
17535   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
17536     {
17537 
17538       /*  We're trying to match:
17539           prev (adrp) == (set (reg r1)
17540                               (high (symbol_ref ("SYM"))))
17541           curr (add) == (set (reg r0)
17542                              (lo_sum (reg r1)
17543                                      (symbol_ref ("SYM"))))
17544           Note that r0 need not necessarily be the same as r1, especially
17545           during pre-regalloc scheduling.  */
17546 
17547       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17548           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17549         {
17550           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
17551               && REG_P (XEXP (SET_SRC (curr_set), 0))
17552               && REGNO (XEXP (SET_SRC (curr_set), 0))
17553                  == REGNO (SET_DEST (prev_set))
17554               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
17555                               XEXP (SET_SRC (curr_set), 1)))
17556             return true;
17557         }
17558     }
17559 
17560   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
17561     {
17562 
17563       /* We're trying to match:
17564          prev (movk) == (set (zero_extract (reg r0)
17565                                            (const_int 16)
17566                                            (const_int 32))
17567                              (const_int imm16_1))
17568          curr (movk) == (set (zero_extract (reg r0)
17569                                            (const_int 16)
17570                                            (const_int 48))
17571                              (const_int imm16_2))  */
17572 
17573       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
17574           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
17575           && REG_P (XEXP (SET_DEST (prev_set), 0))
17576           && REG_P (XEXP (SET_DEST (curr_set), 0))
17577           && REGNO (XEXP (SET_DEST (prev_set), 0))
17578              == REGNO (XEXP (SET_DEST (curr_set), 0))
17579           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
17580           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
17581           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
17582           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
17583           && CONST_INT_P (SET_SRC (prev_set))
17584           && CONST_INT_P (SET_SRC (curr_set)))
17585         return true;
17586 
17587     }
17588   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
17589     {
17590       /* We're trying to match:
17591           prev (adrp) == (set (reg r0)
17592                               (high (symbol_ref ("SYM"))))
17593           curr (ldr) == (set (reg r1)
17594                              (mem (lo_sum (reg r0)
17595                                              (symbol_ref ("SYM")))))
17596                  or
17597           curr (ldr) == (set (reg r1)
17598                              (zero_extend (mem
17599                                            (lo_sum (reg r0)
17600                                                    (symbol_ref ("SYM"))))))  */
17601       if (satisfies_constraint_Ush (SET_SRC (prev_set))
17602           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
17603         {
17604           rtx curr_src = SET_SRC (curr_set);
17605 
17606           if (GET_CODE (curr_src) == ZERO_EXTEND)
17607             curr_src = XEXP (curr_src, 0);
17608 
17609           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
17610               && REG_P (XEXP (XEXP (curr_src, 0), 0))
17611               && REGNO (XEXP (XEXP (curr_src, 0), 0))
17612                  == REGNO (SET_DEST (prev_set))
17613               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
17614                               XEXP (SET_SRC (prev_set), 0)))
17615               return true;
17616         }
17617     }
17618 
17619   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
17620        && aarch_crypto_can_dual_issue (prev, curr))
17621     return true;
17622 
17623   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
17624       && any_condjump_p (curr))
17625     {
17626       unsigned int condreg1, condreg2;
17627       rtx cc_reg_1;
17628       aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
17629       cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
17630 
17631       if (reg_referenced_p (cc_reg_1, PATTERN (curr))
17632 	  && prev
17633 	  && modified_in_p (cc_reg_1, prev))
17634 	{
17635 	  enum attr_type prev_type = get_attr_type (prev);
17636 
17637 	  /* FIXME: this misses some which is considered simple arthematic
17638 	     instructions for ThunderX.  Simple shifts are missed here.  */
17639 	  if (prev_type == TYPE_ALUS_SREG
17640 	      || prev_type == TYPE_ALUS_IMM
17641 	      || prev_type == TYPE_LOGICS_REG
17642 	      || prev_type == TYPE_LOGICS_IMM)
17643 	    return true;
17644 	}
17645     }
17646 
17647   if (prev_set
17648       && curr_set
17649       && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
17650       && any_condjump_p (curr))
17651     {
17652       /* We're trying to match:
17653 	  prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
17654 	  curr (cbz) ==  (set (pc) (if_then_else (eq/ne) (r0)
17655 							 (const_int 0))
17656 						 (label_ref ("SYM"))
17657 						 (pc))  */
17658       if (SET_DEST (curr_set) == (pc_rtx)
17659 	  && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
17660 	  && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
17661 	  && REG_P (SET_DEST (prev_set))
17662 	  && REGNO (SET_DEST (prev_set))
17663 	     == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
17664 	{
17665 	  /* Fuse ALU operations followed by conditional branch instruction.  */
17666 	  switch (get_attr_type (prev))
17667 	    {
17668 	    case TYPE_ALU_IMM:
17669 	    case TYPE_ALU_SREG:
17670 	    case TYPE_ADC_REG:
17671 	    case TYPE_ADC_IMM:
17672 	    case TYPE_ADCS_REG:
17673 	    case TYPE_ADCS_IMM:
17674 	    case TYPE_LOGIC_REG:
17675 	    case TYPE_LOGIC_IMM:
17676 	    case TYPE_CSEL:
17677 	    case TYPE_ADR:
17678 	    case TYPE_MOV_IMM:
17679 	    case TYPE_SHIFT_REG:
17680 	    case TYPE_SHIFT_IMM:
17681 	    case TYPE_BFM:
17682 	    case TYPE_RBIT:
17683 	    case TYPE_REV:
17684 	    case TYPE_EXTEND:
17685 	      return true;
17686 
17687 	    default:;
17688 	    }
17689 	}
17690     }
17691 
17692   return false;
17693 }
17694 
17695 /* Return true iff the instruction fusion described by OP is enabled.  */
17696 
17697 bool
17698 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
17699 {
17700   return (aarch64_tune_params.fusible_ops & op) != 0;
17701 }
17702 
17703 /* If MEM is in the form of [base+offset], extract the two parts
17704    of address and set to BASE and OFFSET, otherwise return false
17705    after clearing BASE and OFFSET.  */
17706 
17707 bool
17708 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
17709 {
17710   rtx addr;
17711 
17712   gcc_assert (MEM_P (mem));
17713 
17714   addr = XEXP (mem, 0);
17715 
17716   if (REG_P (addr))
17717     {
17718       *base = addr;
17719       *offset = const0_rtx;
17720       return true;
17721     }
17722 
17723   if (GET_CODE (addr) == PLUS
17724       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
17725     {
17726       *base = XEXP (addr, 0);
17727       *offset = XEXP (addr, 1);
17728       return true;
17729     }
17730 
17731   *base = NULL_RTX;
17732   *offset = NULL_RTX;
17733 
17734   return false;
17735 }
17736 
17737 /* Types for scheduling fusion.  */
17738 enum sched_fusion_type
17739 {
17740   SCHED_FUSION_NONE = 0,
17741   SCHED_FUSION_LD_SIGN_EXTEND,
17742   SCHED_FUSION_LD_ZERO_EXTEND,
17743   SCHED_FUSION_LD,
17744   SCHED_FUSION_ST,
17745   SCHED_FUSION_NUM
17746 };
17747 
17748 /* If INSN is a load or store of address in the form of [base+offset],
17749    extract the two parts and set to BASE and OFFSET.  Return scheduling
17750    fusion type this INSN is.  */
17751 
17752 static enum sched_fusion_type
17753 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
17754 {
17755   rtx x, dest, src;
17756   enum sched_fusion_type fusion = SCHED_FUSION_LD;
17757 
17758   gcc_assert (INSN_P (insn));
17759   x = PATTERN (insn);
17760   if (GET_CODE (x) != SET)
17761     return SCHED_FUSION_NONE;
17762 
17763   src = SET_SRC (x);
17764   dest = SET_DEST (x);
17765 
17766   machine_mode dest_mode = GET_MODE (dest);
17767 
17768   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
17769     return SCHED_FUSION_NONE;
17770 
17771   if (GET_CODE (src) == SIGN_EXTEND)
17772     {
17773       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
17774       src = XEXP (src, 0);
17775       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17776 	return SCHED_FUSION_NONE;
17777     }
17778   else if (GET_CODE (src) == ZERO_EXTEND)
17779     {
17780       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
17781       src = XEXP (src, 0);
17782       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
17783 	return SCHED_FUSION_NONE;
17784     }
17785 
17786   if (GET_CODE (src) == MEM && REG_P (dest))
17787     extract_base_offset_in_addr (src, base, offset);
17788   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
17789     {
17790       fusion = SCHED_FUSION_ST;
17791       extract_base_offset_in_addr (dest, base, offset);
17792     }
17793   else
17794     return SCHED_FUSION_NONE;
17795 
17796   if (*base == NULL_RTX || *offset == NULL_RTX)
17797     fusion = SCHED_FUSION_NONE;
17798 
17799   return fusion;
17800 }
17801 
17802 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
17803 
17804    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
17805    and PRI are only calculated for these instructions.  For other instruction,
17806    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
17807    type instruction fusion can be added by returning different priorities.
17808 
17809    It's important that irrelevant instructions get the largest FUSION_PRI.  */
17810 
17811 static void
17812 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
17813 			       int *fusion_pri, int *pri)
17814 {
17815   int tmp, off_val;
17816   rtx base, offset;
17817   enum sched_fusion_type fusion;
17818 
17819   gcc_assert (INSN_P (insn));
17820 
17821   tmp = max_pri - 1;
17822   fusion = fusion_load_store (insn, &base, &offset);
17823   if (fusion == SCHED_FUSION_NONE)
17824     {
17825       *pri = tmp;
17826       *fusion_pri = tmp;
17827       return;
17828     }
17829 
17830   /* Set FUSION_PRI according to fusion type and base register.  */
17831   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
17832 
17833   /* Calculate PRI.  */
17834   tmp /= 2;
17835 
17836   /* INSN with smaller offset goes first.  */
17837   off_val = (int)(INTVAL (offset));
17838   if (off_val >= 0)
17839     tmp -= (off_val & 0xfffff);
17840   else
17841     tmp += ((- off_val) & 0xfffff);
17842 
17843   *pri = tmp;
17844   return;
17845 }
17846 
17847 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
17848    Adjust priority of sha1h instructions so they are scheduled before
17849    other SHA1 instructions.  */
17850 
17851 static int
17852 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
17853 {
17854   rtx x = PATTERN (insn);
17855 
17856   if (GET_CODE (x) == SET)
17857     {
17858       x = SET_SRC (x);
17859 
17860       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
17861 	return priority + 10;
17862     }
17863 
17864   return priority;
17865 }
17866 
17867 /* Given OPERANDS of consecutive load/store, check if we can merge
17868    them into ldp/stp.  LOAD is true if they are load instructions.
17869    MODE is the mode of memory operands.  */
17870 
17871 bool
17872 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
17873 				machine_mode mode)
17874 {
17875   HOST_WIDE_INT offval_1, offval_2, msize;
17876   enum reg_class rclass_1, rclass_2;
17877   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
17878 
17879   if (load)
17880     {
17881       mem_1 = operands[1];
17882       mem_2 = operands[3];
17883       reg_1 = operands[0];
17884       reg_2 = operands[2];
17885       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
17886       if (REGNO (reg_1) == REGNO (reg_2))
17887 	return false;
17888     }
17889   else
17890     {
17891       mem_1 = operands[0];
17892       mem_2 = operands[2];
17893       reg_1 = operands[1];
17894       reg_2 = operands[3];
17895     }
17896 
17897   /* The mems cannot be volatile.  */
17898   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
17899     return false;
17900 
17901   /* If we have SImode and slow unaligned ldp,
17902      check the alignment to be at least 8 byte. */
17903   if (mode == SImode
17904       && (aarch64_tune_params.extra_tuning_flags
17905           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17906       && !optimize_size
17907       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17908     return false;
17909 
17910   /* Check if the addresses are in the form of [base+offset].  */
17911   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17912   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17913     return false;
17914   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17915   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17916     return false;
17917 
17918   /* Check if the bases are same.  */
17919   if (!rtx_equal_p (base_1, base_2))
17920     return false;
17921 
17922   /* The operands must be of the same size.  */
17923   gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
17924 			 GET_MODE_SIZE (GET_MODE (mem_2))));
17925 
17926   offval_1 = INTVAL (offset_1);
17927   offval_2 = INTVAL (offset_2);
17928   /* We should only be trying this for fixed-sized modes.  There is no
17929      SVE LDP/STP instruction.  */
17930   msize = GET_MODE_SIZE (mode).to_constant ();
17931   /* Check if the offsets are consecutive.  */
17932   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
17933     return false;
17934 
17935   /* Check if the addresses are clobbered by load.  */
17936   if (load)
17937     {
17938       if (reg_mentioned_p (reg_1, mem_1))
17939 	return false;
17940 
17941       /* In increasing order, the last load can clobber the address.  */
17942       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
17943 	return false;
17944     }
17945 
17946   /* One of the memory accesses must be a mempair operand.
17947      If it is not the first one, they need to be swapped by the
17948      peephole.  */
17949   if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
17950        && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
17951     return false;
17952 
17953   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17954     rclass_1 = FP_REGS;
17955   else
17956     rclass_1 = GENERAL_REGS;
17957 
17958   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17959     rclass_2 = FP_REGS;
17960   else
17961     rclass_2 = GENERAL_REGS;
17962 
17963   /* Check if the registers are of same class.  */
17964   if (rclass_1 != rclass_2)
17965     return false;
17966 
17967   return true;
17968 }
17969 
17970 /* Given OPERANDS of consecutive load/store that can be merged,
17971    swap them if they are not in ascending order.  */
17972 void
17973 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17974 {
17975   rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17976   HOST_WIDE_INT offval_1, offval_2;
17977 
17978   if (load)
17979     {
17980       mem_1 = operands[1];
17981       mem_2 = operands[3];
17982     }
17983   else
17984     {
17985       mem_1 = operands[0];
17986       mem_2 = operands[2];
17987     }
17988 
17989   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17990   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17991 
17992   offval_1 = INTVAL (offset_1);
17993   offval_2 = INTVAL (offset_2);
17994 
17995   if (offval_1 > offval_2)
17996     {
17997       /* Irrespective of whether this is a load or a store,
17998 	 we do the same swap.  */
17999       std::swap (operands[0], operands[2]);
18000       std::swap (operands[1], operands[3]);
18001     }
18002 }
18003 
18004 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
18005    comparison between the two.  */
18006 int
18007 aarch64_host_wide_int_compare (const void *x, const void *y)
18008 {
18009   return wi::cmps (* ((const HOST_WIDE_INT *) x),
18010 		   * ((const HOST_WIDE_INT *) y));
18011 }
18012 
18013 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
18014    other pointing to a REG rtx containing an offset, compare the offsets
18015    of the two pairs.
18016 
18017    Return:
18018 
18019 	1 iff offset (X) > offset (Y)
18020 	0 iff offset (X) == offset (Y)
18021 	-1 iff offset (X) < offset (Y)  */
18022 int
18023 aarch64_ldrstr_offset_compare (const void *x, const void *y)
18024 {
18025   const rtx * operands_1 = (const rtx *) x;
18026   const rtx * operands_2 = (const rtx *) y;
18027   rtx mem_1, mem_2, base, offset_1, offset_2;
18028 
18029   if (MEM_P (operands_1[0]))
18030     mem_1 = operands_1[0];
18031   else
18032     mem_1 = operands_1[1];
18033 
18034   if (MEM_P (operands_2[0]))
18035     mem_2 = operands_2[0];
18036   else
18037     mem_2 = operands_2[1];
18038 
18039   /* Extract the offsets.  */
18040   extract_base_offset_in_addr (mem_1, &base, &offset_1);
18041   extract_base_offset_in_addr (mem_2, &base, &offset_2);
18042 
18043   gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
18044 
18045   return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
18046 }
18047 
18048 /* Given OPERANDS of consecutive load/store, check if we can merge
18049    them into ldp/stp by adjusting the offset.  LOAD is true if they
18050    are load instructions.  MODE is the mode of memory operands.
18051 
18052    Given below consecutive stores:
18053 
18054      str  w1, [xb, 0x100]
18055      str  w1, [xb, 0x104]
18056      str  w1, [xb, 0x108]
18057      str  w1, [xb, 0x10c]
18058 
18059    Though the offsets are out of the range supported by stp, we can
18060    still pair them after adjusting the offset, like:
18061 
18062      add  scratch, xb, 0x100
18063      stp  w1, w1, [scratch]
18064      stp  w1, w1, [scratch, 0x8]
18065 
18066    The peephole patterns detecting this opportunity should guarantee
18067    the scratch register is avaliable.  */
18068 
18069 bool
18070 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
18071 				       scalar_mode mode)
18072 {
18073   const int num_insns = 4;
18074   enum reg_class rclass;
18075   HOST_WIDE_INT offvals[num_insns], msize;
18076   rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
18077 
18078   if (load)
18079     {
18080       for (int i = 0; i < num_insns; i++)
18081 	{
18082 	  reg[i] = operands[2 * i];
18083 	  mem[i] = operands[2 * i + 1];
18084 
18085 	  gcc_assert (REG_P (reg[i]));
18086 	}
18087 
18088       /* Do not attempt to merge the loads if the loads clobber each other.  */
18089       for (int i = 0; i < 8; i += 2)
18090 	for (int j = i + 2; j < 8; j += 2)
18091 	  if (reg_overlap_mentioned_p (operands[i], operands[j]))
18092 	    return false;
18093     }
18094   else
18095     for (int i = 0; i < num_insns; i++)
18096       {
18097 	mem[i] = operands[2 * i];
18098 	reg[i] = operands[2 * i + 1];
18099       }
18100 
18101   /* Skip if memory operand is by itself valid for ldp/stp.  */
18102   if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
18103     return false;
18104 
18105   for (int i = 0; i < num_insns; i++)
18106     {
18107       /* The mems cannot be volatile.  */
18108       if (MEM_VOLATILE_P (mem[i]))
18109 	return false;
18110 
18111       /* Check if the addresses are in the form of [base+offset].  */
18112       extract_base_offset_in_addr (mem[i], base + i, offset + i);
18113       if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
18114 	return false;
18115     }
18116 
18117   /* Check if the registers are of same class.  */
18118   rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
18119     ? FP_REGS : GENERAL_REGS;
18120 
18121   for (int i = 1; i < num_insns; i++)
18122     if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
18123       {
18124 	if (rclass != FP_REGS)
18125 	  return false;
18126       }
18127     else
18128       {
18129 	if (rclass != GENERAL_REGS)
18130 	  return false;
18131       }
18132 
18133   /* Only the last register in the order in which they occur
18134      may be clobbered by the load.  */
18135   if (rclass == GENERAL_REGS && load)
18136     for (int i = 0; i < num_insns - 1; i++)
18137       if (reg_mentioned_p (reg[i], mem[i]))
18138 	return false;
18139 
18140   /* Check if the bases are same.  */
18141   for (int i = 0; i < num_insns - 1; i++)
18142     if (!rtx_equal_p (base[i], base[i + 1]))
18143       return false;
18144 
18145   for (int i = 0; i < num_insns; i++)
18146     offvals[i] = INTVAL (offset[i]);
18147 
18148   msize = GET_MODE_SIZE (mode);
18149 
18150   /* Check if the offsets can be put in the right order to do a ldp/stp.  */
18151   qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
18152 	 aarch64_host_wide_int_compare);
18153 
18154   if (!(offvals[1] == offvals[0] + msize
18155 	&& offvals[3] == offvals[2] + msize))
18156     return false;
18157 
18158   /* Check that offsets are within range of each other.  The ldp/stp
18159      instructions have 7 bit immediate offsets, so use 0x80.  */
18160   if (offvals[2] - offvals[0] >= msize * 0x80)
18161     return false;
18162 
18163   /* The offsets must be aligned with respect to each other.  */
18164   if (offvals[0] % msize != offvals[2] % msize)
18165     return false;
18166 
18167   /* If we have SImode and slow unaligned ldp,
18168      check the alignment to be at least 8 byte. */
18169   if (mode == SImode
18170       && (aarch64_tune_params.extra_tuning_flags
18171 	  & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
18172       && !optimize_size
18173       && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
18174     return false;
18175 
18176   return true;
18177 }
18178 
18179 /* Given OPERANDS of consecutive load/store, this function pairs them
18180    into LDP/STP after adjusting the offset.  It depends on the fact
18181    that the operands can be sorted so the offsets are correct for STP.
18182    MODE is the mode of memory operands.  CODE is the rtl operator
18183    which should be applied to all memory operands, it's SIGN_EXTEND,
18184    ZERO_EXTEND or UNKNOWN.  */
18185 
18186 bool
18187 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
18188 			     scalar_mode mode, RTX_CODE code)
18189 {
18190   rtx base, offset_1, offset_3, t1, t2;
18191   rtx mem_1, mem_2, mem_3, mem_4;
18192   rtx temp_operands[8];
18193   HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
18194 		stp_off_upper_limit, stp_off_lower_limit, msize;
18195 
18196   /* We make changes on a copy as we may still bail out.  */
18197   for (int i = 0; i < 8; i ++)
18198     temp_operands[i] = operands[i];
18199 
18200   /* Sort the operands.  */
18201   qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
18202 
18203   /* Copy the memory operands so that if we have to bail for some
18204      reason the original addresses are unchanged.  */
18205   if (load)
18206     {
18207       mem_1 = copy_rtx (temp_operands[1]);
18208       mem_2 = copy_rtx (temp_operands[3]);
18209       mem_3 = copy_rtx (temp_operands[5]);
18210       mem_4 = copy_rtx (temp_operands[7]);
18211     }
18212   else
18213     {
18214       mem_1 = copy_rtx (temp_operands[0]);
18215       mem_2 = copy_rtx (temp_operands[2]);
18216       mem_3 = copy_rtx (temp_operands[4]);
18217       mem_4 = copy_rtx (temp_operands[6]);
18218       gcc_assert (code == UNKNOWN);
18219     }
18220 
18221   extract_base_offset_in_addr (mem_1, &base, &offset_1);
18222   extract_base_offset_in_addr (mem_3, &base, &offset_3);
18223   gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
18224 	      && offset_3 != NULL_RTX);
18225 
18226   /* Adjust offset so it can fit in LDP/STP instruction.  */
18227   msize = GET_MODE_SIZE (mode);
18228   stp_off_upper_limit = msize * (0x40 - 1);
18229   stp_off_lower_limit = - msize * 0x40;
18230 
18231   off_val_1 = INTVAL (offset_1);
18232   off_val_3 = INTVAL (offset_3);
18233 
18234   /* The base offset is optimally half way between the two STP/LDP offsets.  */
18235   if (msize <= 4)
18236     base_off = (off_val_1 + off_val_3) / 2;
18237   else
18238     /* However, due to issues with negative LDP/STP offset generation for
18239        larger modes, for DF, DI and vector modes. we must not use negative
18240        addresses smaller than 9 signed unadjusted bits can store.  This
18241        provides the most range in this case.  */
18242     base_off = off_val_1;
18243 
18244   /* Adjust the base so that it is aligned with the addresses but still
18245      optimal.  */
18246   if (base_off % msize != off_val_1 % msize)
18247     /* Fix the offset, bearing in mind we want to make it bigger not
18248        smaller.  */
18249     base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18250   else if (msize <= 4)
18251     /* The negative range of LDP/STP is one larger than the positive range.  */
18252     base_off += msize;
18253 
18254   /* Check if base offset is too big or too small.  We can attempt to resolve
18255      this issue by setting it to the maximum value and seeing if the offsets
18256      still fit.  */
18257   if (base_off >= 0x1000)
18258     {
18259       base_off = 0x1000 - 1;
18260       /* We must still make sure that the base offset is aligned with respect
18261 	 to the address.  But it may may not be made any bigger.  */
18262       base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18263     }
18264 
18265   /* Likewise for the case where the base is too small.  */
18266   if (base_off <= -0x1000)
18267     {
18268       base_off = -0x1000 + 1;
18269       base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
18270     }
18271 
18272   /* Offset of the first STP/LDP.  */
18273   new_off_1 = off_val_1 - base_off;
18274 
18275   /* Offset of the second STP/LDP.  */
18276   new_off_3 = off_val_3 - base_off;
18277 
18278   /* The offsets must be within the range of the LDP/STP instructions.  */
18279   if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
18280       || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
18281     return false;
18282 
18283   replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
18284 						  new_off_1), true);
18285   replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
18286 						  new_off_1 + msize), true);
18287   replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
18288 						  new_off_3), true);
18289   replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
18290 						  new_off_3 + msize), true);
18291 
18292   if (!aarch64_mem_pair_operand (mem_1, mode)
18293       || !aarch64_mem_pair_operand (mem_3, mode))
18294     return false;
18295 
18296   if (code == ZERO_EXTEND)
18297     {
18298       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
18299       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
18300       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
18301       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
18302     }
18303   else if (code == SIGN_EXTEND)
18304     {
18305       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
18306       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
18307       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
18308       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
18309     }
18310 
18311   if (load)
18312     {
18313       operands[0] = temp_operands[0];
18314       operands[1] = mem_1;
18315       operands[2] = temp_operands[2];
18316       operands[3] = mem_2;
18317       operands[4] = temp_operands[4];
18318       operands[5] = mem_3;
18319       operands[6] = temp_operands[6];
18320       operands[7] = mem_4;
18321     }
18322   else
18323     {
18324       operands[0] = mem_1;
18325       operands[1] = temp_operands[1];
18326       operands[2] = mem_2;
18327       operands[3] = temp_operands[3];
18328       operands[4] = mem_3;
18329       operands[5] = temp_operands[5];
18330       operands[6] = mem_4;
18331       operands[7] = temp_operands[7];
18332     }
18333 
18334   /* Emit adjusting instruction.  */
18335   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
18336   /* Emit ldp/stp instructions.  */
18337   t1 = gen_rtx_SET (operands[0], operands[1]);
18338   t2 = gen_rtx_SET (operands[2], operands[3]);
18339   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18340   t1 = gen_rtx_SET (operands[4], operands[5]);
18341   t2 = gen_rtx_SET (operands[6], operands[7]);
18342   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
18343   return true;
18344 }
18345 
18346 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE.  Assume for now that
18347    it isn't worth branching around empty masked ops (including masked
18348    stores).  */
18349 
18350 static bool
18351 aarch64_empty_mask_is_expensive (unsigned)
18352 {
18353   return false;
18354 }
18355 
18356 /* Return 1 if pseudo register should be created and used to hold
18357    GOT address for PIC code.  */
18358 
18359 bool
18360 aarch64_use_pseudo_pic_reg (void)
18361 {
18362   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
18363 }
18364 
18365 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
18366 
18367 static int
18368 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
18369 {
18370   switch (XINT (x, 1))
18371     {
18372     case UNSPEC_GOTSMALLPIC:
18373     case UNSPEC_GOTSMALLPIC28K:
18374     case UNSPEC_GOTTINYPIC:
18375       return 0;
18376     default:
18377       break;
18378     }
18379 
18380   return default_unspec_may_trap_p (x, flags);
18381 }
18382 
18383 
18384 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
18385    return the log2 of that value.  Otherwise return -1.  */
18386 
18387 int
18388 aarch64_fpconst_pow_of_2 (rtx x)
18389 {
18390   const REAL_VALUE_TYPE *r;
18391 
18392   if (!CONST_DOUBLE_P (x))
18393     return -1;
18394 
18395   r = CONST_DOUBLE_REAL_VALUE (x);
18396 
18397   if (REAL_VALUE_NEGATIVE (*r)
18398       || REAL_VALUE_ISNAN (*r)
18399       || REAL_VALUE_ISINF (*r)
18400       || !real_isinteger (r, DFmode))
18401     return -1;
18402 
18403   return exact_log2 (real_to_integer (r));
18404 }
18405 
18406 /* If X is a vector of equal CONST_DOUBLE values and that value is
18407    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
18408 
18409 int
18410 aarch64_vec_fpconst_pow_of_2 (rtx x)
18411 {
18412   int nelts;
18413   if (GET_CODE (x) != CONST_VECTOR
18414       || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
18415     return -1;
18416 
18417   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
18418     return -1;
18419 
18420   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
18421   if (firstval <= 0)
18422     return -1;
18423 
18424   for (int i = 1; i < nelts; i++)
18425     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
18426       return -1;
18427 
18428   return firstval;
18429 }
18430 
18431 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
18432    to float.
18433 
18434    __fp16 always promotes through this hook.
18435    _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
18436    through the generic excess precision logic rather than here.  */
18437 
18438 static tree
18439 aarch64_promoted_type (const_tree t)
18440 {
18441   if (SCALAR_FLOAT_TYPE_P (t)
18442       && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
18443     return float_type_node;
18444 
18445   return NULL_TREE;
18446 }
18447 
18448 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
18449 
18450 static bool
18451 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
18452 			   optimization_type opt_type)
18453 {
18454   switch (op)
18455     {
18456     case rsqrt_optab:
18457       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
18458 
18459     default:
18460       return true;
18461     }
18462 }
18463 
18464 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook.  */
18465 
18466 static unsigned int
18467 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
18468 					int *offset)
18469 {
18470   /* Polynomial invariant 1 == (VG / 2) - 1.  */
18471   gcc_assert (i == 1);
18472   *factor = 2;
18473   *offset = 1;
18474   return AARCH64_DWARF_VG;
18475 }
18476 
18477 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
18478    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18479 
18480 static bool
18481 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
18482 {
18483   return (mode == HFmode
18484 	  ? true
18485 	  : default_libgcc_floating_mode_supported_p (mode));
18486 }
18487 
18488 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
18489    if MODE is HFmode, and punt to the generic implementation otherwise.  */
18490 
18491 static bool
18492 aarch64_scalar_mode_supported_p (scalar_mode mode)
18493 {
18494   return (mode == HFmode
18495 	  ? true
18496 	  : default_scalar_mode_supported_p (mode));
18497 }
18498 
18499 /* Set the value of FLT_EVAL_METHOD.
18500    ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
18501 
18502     0: evaluate all operations and constants, whose semantic type has at
18503        most the range and precision of type float, to the range and
18504        precision of float; evaluate all other operations and constants to
18505        the range and precision of the semantic type;
18506 
18507     N, where _FloatN is a supported interchange floating type
18508        evaluate all operations and constants, whose semantic type has at
18509        most the range and precision of _FloatN type, to the range and
18510        precision of the _FloatN type; evaluate all other operations and
18511        constants to the range and precision of the semantic type;
18512 
18513    If we have the ARMv8.2-A extensions then we support _Float16 in native
18514    precision, so we should set this to 16.  Otherwise, we support the type,
18515    but want to evaluate expressions in float precision, so set this to
18516    0.  */
18517 
18518 static enum flt_eval_method
18519 aarch64_excess_precision (enum excess_precision_type type)
18520 {
18521   switch (type)
18522     {
18523       case EXCESS_PRECISION_TYPE_FAST:
18524       case EXCESS_PRECISION_TYPE_STANDARD:
18525 	/* We can calculate either in 16-bit range and precision or
18526 	   32-bit range and precision.  Make that decision based on whether
18527 	   we have native support for the ARMv8.2-A 16-bit floating-point
18528 	   instructions or not.  */
18529 	return (TARGET_FP_F16INST
18530 		? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
18531 		: FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
18532       case EXCESS_PRECISION_TYPE_IMPLICIT:
18533 	return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
18534       default:
18535 	gcc_unreachable ();
18536     }
18537   return FLT_EVAL_METHOD_UNPREDICTABLE;
18538 }
18539 
18540 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN.  Return true if INSN can be
18541    scheduled for speculative execution.  Reject the long-running division
18542    and square-root instructions.  */
18543 
18544 static bool
18545 aarch64_sched_can_speculate_insn (rtx_insn *insn)
18546 {
18547   switch (get_attr_type (insn))
18548     {
18549       case TYPE_SDIV:
18550       case TYPE_UDIV:
18551       case TYPE_FDIVS:
18552       case TYPE_FDIVD:
18553       case TYPE_FSQRTS:
18554       case TYPE_FSQRTD:
18555       case TYPE_NEON_FP_SQRT_S:
18556       case TYPE_NEON_FP_SQRT_D:
18557       case TYPE_NEON_FP_SQRT_S_Q:
18558       case TYPE_NEON_FP_SQRT_D_Q:
18559       case TYPE_NEON_FP_DIV_S:
18560       case TYPE_NEON_FP_DIV_D:
18561       case TYPE_NEON_FP_DIV_S_Q:
18562       case TYPE_NEON_FP_DIV_D_Q:
18563 	return false;
18564       default:
18565 	return true;
18566     }
18567 }
18568 
18569 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */
18570 
18571 static int
18572 aarch64_compute_pressure_classes (reg_class *classes)
18573 {
18574   int i = 0;
18575   classes[i++] = GENERAL_REGS;
18576   classes[i++] = FP_REGS;
18577   /* PR_REGS isn't a useful pressure class because many predicate pseudo
18578      registers need to go in PR_LO_REGS at some point during their
18579      lifetime.  Splitting it into two halves has the effect of making
18580      all predicates count against PR_LO_REGS, so that we try whenever
18581      possible to restrict the number of live predicates to 8.  This
18582      greatly reduces the amount of spilling in certain loops.  */
18583   classes[i++] = PR_LO_REGS;
18584   classes[i++] = PR_HI_REGS;
18585   return i;
18586 }
18587 
18588 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
18589 
18590 static bool
18591 aarch64_can_change_mode_class (machine_mode from,
18592 			       machine_mode to, reg_class_t)
18593 {
18594   if (BYTES_BIG_ENDIAN)
18595     {
18596       bool from_sve_p = aarch64_sve_data_mode_p (from);
18597       bool to_sve_p = aarch64_sve_data_mode_p (to);
18598 
18599       /* Don't allow changes between SVE data modes and non-SVE modes.
18600 	 See the comment at the head of aarch64-sve.md for details.  */
18601       if (from_sve_p != to_sve_p)
18602 	return false;
18603 
18604       /* Don't allow changes in element size: lane 0 of the new vector
18605 	 would not then be lane 0 of the old vector.  See the comment
18606 	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18607 	 description.
18608 
18609 	 In the worst case, this forces a register to be spilled in
18610 	 one mode and reloaded in the other, which handles the
18611 	 endianness correctly.  */
18612       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18613 	return false;
18614     }
18615   return true;
18616 }
18617 
18618 /* Implement TARGET_EARLY_REMAT_MODES.  */
18619 
18620 static void
18621 aarch64_select_early_remat_modes (sbitmap modes)
18622 {
18623   /* SVE values are not normally live across a call, so it should be
18624      worth doing early rematerialization even in VL-specific mode.  */
18625   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18626     {
18627       machine_mode mode = (machine_mode) i;
18628       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18629       if (vec_flags & VEC_ANY_SVE)
18630 	bitmap_set_bit (modes, i);
18631     }
18632 }
18633 
18634 /* Override the default target speculation_safe_value.  */
18635 static rtx
18636 aarch64_speculation_safe_value (machine_mode mode,
18637 				rtx result, rtx val, rtx failval)
18638 {
18639   /* Maybe we should warn if falling back to hard barriers.  They are
18640      likely to be noticably more expensive than the alternative below.  */
18641   if (!aarch64_track_speculation)
18642     return default_speculation_safe_value (mode, result, val, failval);
18643 
18644   if (!REG_P (val))
18645     val = copy_to_mode_reg (mode, val);
18646 
18647   if (!aarch64_reg_or_zero (failval, mode))
18648     failval = copy_to_mode_reg (mode, failval);
18649 
18650   emit_insn (gen_despeculate_copy (mode, result, val, failval));
18651   return result;
18652 }
18653 
18654 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18655    Look into the tuning structure for an estimate.
18656    VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18657    Advanced SIMD 128 bits.  */
18658 
18659 static HOST_WIDE_INT
18660 aarch64_estimated_poly_value (poly_int64 val)
18661 {
18662   enum aarch64_sve_vector_bits_enum width_source
18663     = aarch64_tune_params.sve_width;
18664 
18665   /* If we still don't have an estimate, use the default.  */
18666   if (width_source == SVE_SCALABLE)
18667     return default_estimated_poly_value (val);
18668 
18669   HOST_WIDE_INT over_128 = width_source - 128;
18670   return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18671 }
18672 
18673 
18674 /* Return true for types that could be supported as SIMD return or
18675    argument types.  */
18676 
18677 static bool
18678 supported_simd_type (tree t)
18679 {
18680   if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
18681     {
18682       HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
18683       return s == 1 || s == 2 || s == 4 || s == 8;
18684     }
18685   return false;
18686 }
18687 
18688 /* Return true for types that currently are supported as SIMD return
18689    or argument types.  */
18690 
18691 static bool
18692 currently_supported_simd_type (tree t, tree b)
18693 {
18694   if (COMPLEX_FLOAT_TYPE_P (t))
18695     return false;
18696 
18697   if (TYPE_SIZE (t) != TYPE_SIZE (b))
18698     return false;
18699 
18700   return supported_simd_type (t);
18701 }
18702 
18703 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN.  */
18704 
18705 static int
18706 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
18707 					struct cgraph_simd_clone *clonei,
18708 					tree base_type, int num)
18709 {
18710   tree t, ret_type, arg_type;
18711   unsigned int elt_bits, vec_bits, count;
18712 
18713   if (!TARGET_SIMD)
18714     return 0;
18715 
18716   if (clonei->simdlen
18717       && (clonei->simdlen < 2
18718 	  || clonei->simdlen > 1024
18719 	  || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
18720     {
18721       warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18722 		  "unsupported simdlen %d", clonei->simdlen);
18723       return 0;
18724     }
18725 
18726   ret_type = TREE_TYPE (TREE_TYPE (node->decl));
18727   if (TREE_CODE (ret_type) != VOID_TYPE
18728       && !currently_supported_simd_type (ret_type, base_type))
18729     {
18730       if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
18731 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18732 		    "GCC does not currently support mixed size types "
18733 		    "for %<simd%> functions");
18734       else if (supported_simd_type (ret_type))
18735 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18736 		    "GCC does not currently support return type %qT "
18737 		    "for %<simd%> functions", ret_type);
18738       else
18739 	warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18740 		    "unsupported return type %qT for %<simd%> functions",
18741 		    ret_type);
18742       return 0;
18743     }
18744 
18745   for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
18746     {
18747       arg_type = TREE_TYPE (t);
18748 
18749       if (!currently_supported_simd_type (arg_type, base_type))
18750 	{
18751 	  if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
18752 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18753 			"GCC does not currently support mixed size types "
18754 			"for %<simd%> functions");
18755 	  else
18756 	    warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18757 			"GCC does not currently support argument type %qT "
18758 			"for %<simd%> functions", arg_type);
18759 	  return 0;
18760 	}
18761     }
18762 
18763   clonei->vecsize_mangle = 'n';
18764   clonei->mask_mode = VOIDmode;
18765   elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
18766   if (clonei->simdlen == 0)
18767     {
18768       count = 2;
18769       vec_bits = (num == 0 ? 64 : 128);
18770       clonei->simdlen = vec_bits / elt_bits;
18771     }
18772   else
18773     {
18774       count = 1;
18775       vec_bits = clonei->simdlen * elt_bits;
18776       if (vec_bits != 64 && vec_bits != 128)
18777 	{
18778 	  warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
18779 		      "GCC does not currently support simdlen %d for type %qT",
18780 		      clonei->simdlen, base_type);
18781 	  return 0;
18782 	}
18783     }
18784   clonei->vecsize_int = vec_bits;
18785   clonei->vecsize_float = vec_bits;
18786   return count;
18787 }
18788 
18789 /* Implement TARGET_SIMD_CLONE_ADJUST.  */
18790 
18791 static void
18792 aarch64_simd_clone_adjust (struct cgraph_node *node)
18793 {
18794   /* Add aarch64_vector_pcs target attribute to SIMD clones so they
18795      use the correct ABI.  */
18796 
18797   tree t = TREE_TYPE (node->decl);
18798   TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
18799 					TYPE_ATTRIBUTES (t));
18800 }
18801 
18802 /* Implement TARGET_SIMD_CLONE_USABLE.  */
18803 
18804 static int
18805 aarch64_simd_clone_usable (struct cgraph_node *node)
18806 {
18807   switch (node->simdclone->vecsize_mangle)
18808     {
18809     case 'n':
18810       if (!TARGET_SIMD)
18811 	return -1;
18812       return 0;
18813     default:
18814       gcc_unreachable ();
18815     }
18816 }
18817 
18818 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
18819 
18820 static int
18821 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
18822 {
18823   if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1))
18824       != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2)))
18825     return 0;
18826   return 1;
18827 }
18828 
18829 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
18830 
18831 static const char *
18832 aarch64_get_multilib_abi_name (void)
18833 {
18834   if (TARGET_BIG_END)
18835     return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
18836   return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
18837 }
18838 
18839 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
18840    global variable based guard use the default else
18841    return a null tree.  */
18842 static tree
18843 aarch64_stack_protect_guard (void)
18844 {
18845   if (aarch64_stack_protector_guard == SSP_GLOBAL)
18846     return default_stack_protect_guard ();
18847 
18848   return NULL_TREE;
18849 }
18850 
18851 /* Implement TARGET_ASM_FILE_END for AArch64.  This adds the AArch64 GNU NOTE
18852    section at the end if needed.  */
18853 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND	0xc0000000
18854 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI	(1U << 0)
18855 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC	(1U << 1)
18856 void
18857 aarch64_file_end_indicate_exec_stack ()
18858 {
18859   file_end_indicate_exec_stack ();
18860 
18861   unsigned feature_1_and = 0;
18862   if (aarch64_bti_enabled ())
18863     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
18864 
18865   if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
18866     feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
18867 
18868   if (feature_1_and)
18869     {
18870       /* Generate .note.gnu.property section.  */
18871       switch_to_section (get_section (".note.gnu.property",
18872 				      SECTION_NOTYPE, NULL));
18873 
18874       /* PT_NOTE header: namesz, descsz, type.
18875 	 namesz = 4 ("GNU\0")
18876 	 descsz = 16 (Size of the program property array)
18877 		  [(12 + padding) * Number of array elements]
18878 	 type   = 5 (NT_GNU_PROPERTY_TYPE_0).  */
18879       assemble_align (POINTER_SIZE);
18880       assemble_integer (GEN_INT (4), 4, 32, 1);
18881       assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
18882       assemble_integer (GEN_INT (5), 4, 32, 1);
18883 
18884       /* PT_NOTE name.  */
18885       assemble_string ("GNU", 4);
18886 
18887       /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
18888 	 type   = GNU_PROPERTY_AARCH64_FEATURE_1_AND
18889 	 datasz = 4
18890 	 data   = feature_1_and.  */
18891       assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
18892       assemble_integer (GEN_INT (4), 4, 32, 1);
18893       assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
18894 
18895       /* Pad the size of the note to the required alignment.  */
18896       assemble_align (POINTER_SIZE);
18897     }
18898 }
18899 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
18900 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
18901 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
18902 
18903 /* Target-specific selftests.  */
18904 
18905 #if CHECKING_P
18906 
18907 namespace selftest {
18908 
18909 /* Selftest for the RTL loader.
18910    Verify that the RTL loader copes with a dump from
18911    print_rtx_function.  This is essentially just a test that class
18912    function_reader can handle a real dump, but it also verifies
18913    that lookup_reg_by_dump_name correctly handles hard regs.
18914    The presence of hard reg names in the dump means that the test is
18915    target-specific, hence it is in this file.  */
18916 
18917 static void
18918 aarch64_test_loading_full_dump ()
18919 {
18920   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18921 
18922   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18923 
18924   rtx_insn *insn_1 = get_insn_by_uid (1);
18925   ASSERT_EQ (NOTE, GET_CODE (insn_1));
18926 
18927   rtx_insn *insn_15 = get_insn_by_uid (15);
18928   ASSERT_EQ (INSN, GET_CODE (insn_15));
18929   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18930 
18931   /* Verify crtl->return_rtx.  */
18932   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18933   ASSERT_EQ (0, REGNO (crtl->return_rtx));
18934   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18935 }
18936 
18937 /* Run all target-specific selftests.  */
18938 
18939 static void
18940 aarch64_run_selftests (void)
18941 {
18942   aarch64_test_loading_full_dump ();
18943 }
18944 
18945 } // namespace selftest
18946 
18947 #endif /* #if CHECKING_P */
18948 
18949 #undef TARGET_STACK_PROTECT_GUARD
18950 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18951 
18952 #undef TARGET_ADDRESS_COST
18953 #define TARGET_ADDRESS_COST aarch64_address_cost
18954 
18955 /* This hook will determines whether unnamed bitfields affect the alignment
18956    of the containing structure.  The hook returns true if the structure
18957    should inherit the alignment requirements of an unnamed bitfield's
18958    type.  */
18959 #undef TARGET_ALIGN_ANON_BITFIELD
18960 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
18961 
18962 #undef TARGET_ASM_ALIGNED_DI_OP
18963 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
18964 
18965 #undef TARGET_ASM_ALIGNED_HI_OP
18966 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
18967 
18968 #undef TARGET_ASM_ALIGNED_SI_OP
18969 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
18970 
18971 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
18972 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
18973   hook_bool_const_tree_hwi_hwi_const_tree_true
18974 
18975 #undef TARGET_ASM_FILE_START
18976 #define TARGET_ASM_FILE_START aarch64_start_file
18977 
18978 #undef TARGET_ASM_OUTPUT_MI_THUNK
18979 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
18980 
18981 #undef TARGET_ASM_SELECT_RTX_SECTION
18982 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
18983 
18984 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
18985 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
18986 
18987 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
18988 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
18989 
18990 #undef TARGET_BUILD_BUILTIN_VA_LIST
18991 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
18992 
18993 #undef TARGET_CALLEE_COPIES
18994 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
18995 
18996 #undef TARGET_CAN_ELIMINATE
18997 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
18998 
18999 #undef TARGET_CAN_INLINE_P
19000 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
19001 
19002 #undef TARGET_CANNOT_FORCE_CONST_MEM
19003 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
19004 
19005 #undef TARGET_CASE_VALUES_THRESHOLD
19006 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
19007 
19008 #undef TARGET_CONDITIONAL_REGISTER_USAGE
19009 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
19010 
19011 /* Only the least significant bit is used for initialization guard
19012    variables.  */
19013 #undef TARGET_CXX_GUARD_MASK_BIT
19014 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
19015 
19016 #undef TARGET_C_MODE_FOR_SUFFIX
19017 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
19018 
19019 #ifdef TARGET_BIG_ENDIAN_DEFAULT
19020 #undef  TARGET_DEFAULT_TARGET_FLAGS
19021 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
19022 #endif
19023 
19024 #undef TARGET_CLASS_MAX_NREGS
19025 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
19026 
19027 #undef TARGET_BUILTIN_DECL
19028 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
19029 
19030 #undef TARGET_BUILTIN_RECIPROCAL
19031 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
19032 
19033 #undef TARGET_C_EXCESS_PRECISION
19034 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
19035 
19036 #undef  TARGET_EXPAND_BUILTIN
19037 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
19038 
19039 #undef TARGET_EXPAND_BUILTIN_VA_START
19040 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
19041 
19042 #undef TARGET_FOLD_BUILTIN
19043 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
19044 
19045 #undef TARGET_FUNCTION_ARG
19046 #define TARGET_FUNCTION_ARG aarch64_function_arg
19047 
19048 #undef TARGET_FUNCTION_ARG_ADVANCE
19049 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
19050 
19051 #undef TARGET_FUNCTION_ARG_BOUNDARY
19052 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
19053 
19054 #undef TARGET_FUNCTION_ARG_PADDING
19055 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
19056 
19057 #undef TARGET_GET_RAW_RESULT_MODE
19058 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
19059 #undef TARGET_GET_RAW_ARG_MODE
19060 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
19061 
19062 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
19063 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
19064 
19065 #undef TARGET_FUNCTION_VALUE
19066 #define TARGET_FUNCTION_VALUE aarch64_function_value
19067 
19068 #undef TARGET_FUNCTION_VALUE_REGNO_P
19069 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
19070 
19071 #undef TARGET_GIMPLE_FOLD_BUILTIN
19072 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
19073 
19074 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
19075 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
19076 
19077 #undef  TARGET_INIT_BUILTINS
19078 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
19079 
19080 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
19081 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
19082   aarch64_ira_change_pseudo_allocno_class
19083 
19084 #undef TARGET_LEGITIMATE_ADDRESS_P
19085 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
19086 
19087 #undef TARGET_LEGITIMATE_CONSTANT_P
19088 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
19089 
19090 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
19091 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
19092   aarch64_legitimize_address_displacement
19093 
19094 #undef TARGET_LIBGCC_CMP_RETURN_MODE
19095 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
19096 
19097 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
19098 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
19099 aarch64_libgcc_floating_mode_supported_p
19100 
19101 #undef TARGET_MANGLE_TYPE
19102 #define TARGET_MANGLE_TYPE aarch64_mangle_type
19103 
19104 #undef TARGET_MEMORY_MOVE_COST
19105 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
19106 
19107 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
19108 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
19109 
19110 #undef TARGET_MUST_PASS_IN_STACK
19111 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
19112 
19113 /* This target hook should return true if accesses to volatile bitfields
19114    should use the narrowest mode possible.  It should return false if these
19115    accesses should use the bitfield container type.  */
19116 #undef TARGET_NARROW_VOLATILE_BITFIELD
19117 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
19118 
19119 #undef  TARGET_OPTION_OVERRIDE
19120 #define TARGET_OPTION_OVERRIDE aarch64_override_options
19121 
19122 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
19123 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
19124   aarch64_override_options_after_change
19125 
19126 #undef TARGET_OPTION_SAVE
19127 #define TARGET_OPTION_SAVE aarch64_option_save
19128 
19129 #undef TARGET_OPTION_RESTORE
19130 #define TARGET_OPTION_RESTORE aarch64_option_restore
19131 
19132 #undef TARGET_OPTION_PRINT
19133 #define TARGET_OPTION_PRINT aarch64_option_print
19134 
19135 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
19136 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
19137 
19138 #undef TARGET_SET_CURRENT_FUNCTION
19139 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
19140 
19141 #undef TARGET_PASS_BY_REFERENCE
19142 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
19143 
19144 #undef TARGET_PREFERRED_RELOAD_CLASS
19145 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
19146 
19147 #undef TARGET_SCHED_REASSOCIATION_WIDTH
19148 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
19149 
19150 #undef TARGET_PROMOTED_TYPE
19151 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
19152 
19153 #undef TARGET_SECONDARY_RELOAD
19154 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
19155 
19156 #undef TARGET_SHIFT_TRUNCATION_MASK
19157 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
19158 
19159 #undef TARGET_SETUP_INCOMING_VARARGS
19160 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
19161 
19162 #undef TARGET_STRUCT_VALUE_RTX
19163 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
19164 
19165 #undef TARGET_REGISTER_MOVE_COST
19166 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
19167 
19168 #undef TARGET_RETURN_IN_MEMORY
19169 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
19170 
19171 #undef TARGET_RETURN_IN_MSB
19172 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
19173 
19174 #undef TARGET_RTX_COSTS
19175 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
19176 
19177 #undef TARGET_SCALAR_MODE_SUPPORTED_P
19178 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
19179 
19180 #undef TARGET_SCHED_ISSUE_RATE
19181 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
19182 
19183 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
19184 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
19185   aarch64_sched_first_cycle_multipass_dfa_lookahead
19186 
19187 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
19188 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
19189   aarch64_first_cycle_multipass_dfa_lookahead_guard
19190 
19191 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
19192 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
19193   aarch64_get_separate_components
19194 
19195 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
19196 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
19197   aarch64_components_for_bb
19198 
19199 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
19200 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
19201   aarch64_disqualify_components
19202 
19203 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
19204 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
19205   aarch64_emit_prologue_components
19206 
19207 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
19208 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
19209   aarch64_emit_epilogue_components
19210 
19211 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
19212 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
19213   aarch64_set_handled_components
19214 
19215 #undef TARGET_TRAMPOLINE_INIT
19216 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
19217 
19218 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
19219 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
19220 
19221 #undef TARGET_VECTOR_MODE_SUPPORTED_P
19222 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
19223 
19224 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
19225 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
19226   aarch64_builtin_support_vector_misalignment
19227 
19228 #undef TARGET_ARRAY_MODE
19229 #define TARGET_ARRAY_MODE aarch64_array_mode
19230 
19231 #undef TARGET_ARRAY_MODE_SUPPORTED_P
19232 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
19233 
19234 #undef TARGET_VECTORIZE_ADD_STMT_COST
19235 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
19236 
19237 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
19238 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
19239   aarch64_builtin_vectorization_cost
19240 
19241 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
19242 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
19243 
19244 #undef TARGET_VECTORIZE_BUILTINS
19245 #define TARGET_VECTORIZE_BUILTINS
19246 
19247 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
19248 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
19249   aarch64_builtin_vectorized_function
19250 
19251 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
19252 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
19253   aarch64_autovectorize_vector_sizes
19254 
19255 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
19256 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
19257   aarch64_atomic_assign_expand_fenv
19258 
19259 /* Section anchor support.  */
19260 
19261 #undef TARGET_MIN_ANCHOR_OFFSET
19262 #define TARGET_MIN_ANCHOR_OFFSET -256
19263 
19264 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
19265    byte offset; we can do much more for larger data types, but have no way
19266    to determine the size of the access.  We assume accesses are aligned.  */
19267 #undef TARGET_MAX_ANCHOR_OFFSET
19268 #define TARGET_MAX_ANCHOR_OFFSET 4095
19269 
19270 #undef TARGET_VECTOR_ALIGNMENT
19271 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
19272 
19273 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
19274 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
19275   aarch64_vectorize_preferred_vector_alignment
19276 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
19277 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
19278   aarch64_simd_vector_alignment_reachable
19279 
19280 /* vec_perm support.  */
19281 
19282 #undef TARGET_VECTORIZE_VEC_PERM_CONST
19283 #define TARGET_VECTORIZE_VEC_PERM_CONST \
19284   aarch64_vectorize_vec_perm_const
19285 
19286 #undef TARGET_VECTORIZE_GET_MASK_MODE
19287 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
19288 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
19289 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
19290   aarch64_empty_mask_is_expensive
19291 #undef TARGET_PREFERRED_ELSE_VALUE
19292 #define TARGET_PREFERRED_ELSE_VALUE \
19293   aarch64_preferred_else_value
19294 
19295 #undef TARGET_INIT_LIBFUNCS
19296 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
19297 
19298 #undef TARGET_FIXED_CONDITION_CODE_REGS
19299 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
19300 
19301 #undef TARGET_FLAGS_REGNUM
19302 #define TARGET_FLAGS_REGNUM CC_REGNUM
19303 
19304 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
19305 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
19306 
19307 #undef TARGET_ASAN_SHADOW_OFFSET
19308 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
19309 
19310 #undef TARGET_LEGITIMIZE_ADDRESS
19311 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
19312 
19313 #undef TARGET_SCHED_CAN_SPECULATE_INSN
19314 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
19315 
19316 #undef TARGET_CAN_USE_DOLOOP_P
19317 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
19318 
19319 #undef TARGET_SCHED_ADJUST_PRIORITY
19320 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
19321 
19322 #undef TARGET_SCHED_MACRO_FUSION_P
19323 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
19324 
19325 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
19326 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
19327 
19328 #undef TARGET_SCHED_FUSION_PRIORITY
19329 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
19330 
19331 #undef TARGET_UNSPEC_MAY_TRAP_P
19332 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
19333 
19334 #undef TARGET_USE_PSEUDO_PIC_REG
19335 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
19336 
19337 #undef TARGET_PRINT_OPERAND
19338 #define TARGET_PRINT_OPERAND aarch64_print_operand
19339 
19340 #undef TARGET_PRINT_OPERAND_ADDRESS
19341 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
19342 
19343 #undef TARGET_OPTAB_SUPPORTED_P
19344 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
19345 
19346 #undef TARGET_OMIT_STRUCT_RETURN_REG
19347 #define TARGET_OMIT_STRUCT_RETURN_REG true
19348 
19349 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
19350 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
19351   aarch64_dwarf_poly_indeterminate_value
19352 
19353 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
19354 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
19355 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
19356 
19357 #undef TARGET_HARD_REGNO_NREGS
19358 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
19359 #undef TARGET_HARD_REGNO_MODE_OK
19360 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
19361 
19362 #undef TARGET_MODES_TIEABLE_P
19363 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
19364 
19365 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
19366 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
19367   aarch64_hard_regno_call_part_clobbered
19368 
19369 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS
19370 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \
19371   aarch64_remove_extra_call_preserved_regs
19372 
19373 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS
19374 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \
19375   aarch64_return_call_with_max_clobbers
19376 
19377 #undef TARGET_CONSTANT_ALIGNMENT
19378 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
19379 
19380 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
19381 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
19382   aarch64_stack_clash_protection_alloca_probe_range
19383 
19384 #undef TARGET_COMPUTE_PRESSURE_CLASSES
19385 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
19386 
19387 #undef TARGET_CAN_CHANGE_MODE_CLASS
19388 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
19389 
19390 #undef TARGET_SELECT_EARLY_REMAT_MODES
19391 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
19392 
19393 #undef TARGET_SPECULATION_SAFE_VALUE
19394 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
19395 
19396 #undef TARGET_ESTIMATED_POLY_VALUE
19397 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
19398 
19399 #undef TARGET_ATTRIBUTE_TABLE
19400 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
19401 
19402 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
19403 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
19404   aarch64_simd_clone_compute_vecsize_and_simdlen
19405 
19406 #undef TARGET_SIMD_CLONE_ADJUST
19407 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
19408 
19409 #undef TARGET_SIMD_CLONE_USABLE
19410 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
19411 
19412 #undef TARGET_COMP_TYPE_ATTRIBUTES
19413 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
19414 
19415 #undef TARGET_GET_MULTILIB_ABI_NAME
19416 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
19417 
19418 #if CHECKING_P
19419 #undef TARGET_RUN_TARGET_SELFTESTS
19420 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
19421 #endif /* #if CHECKING_P */
19422 
19423 struct gcc_target targetm = TARGET_INITIALIZER;
19424 
19425 #include "gt-aarch64.h"
19426