1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2
3 This file is part of GCC.
4
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 #define IN_TARGET_CODE 1
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-builtins.h"
93 #include "i386-features.h"
94
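/* Base names of the out-of-line save/restore (xlogue) stubs.  get_stub_name
   below combines one of these with an "avx" or "sse" prefix and a register
   count to build the final symbol, of the form __<avx|sse>_<base>_<nregs>.  */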
95 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
96 "savms64",
97 "resms64",
98 "resms64x",
99 "savms64f",
100 "resms64f",
101 "resms64fx"
102 };
103
104 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
105 /* The offset values below are where each register is stored for this layout,
106 relative to the incoming stack pointer. The value of each m_regs[].offset will
107 be relative to the incoming base pointer (rax or rsi) used by the stub.
108
109 s_instances: 0 1 2 3
110 Offset: realigned or aligned + 8
111 Register aligned aligned + 8 aligned w/HFP w/HFP */
112 XMM15_REG, /* 0x10 0x18 0x10 0x18 */
113 XMM14_REG, /* 0x20 0x28 0x20 0x28 */
114 XMM13_REG, /* 0x30 0x38 0x30 0x38 */
115 XMM12_REG, /* 0x40 0x48 0x40 0x48 */
116 XMM11_REG, /* 0x50 0x58 0x50 0x58 */
117 XMM10_REG, /* 0x60 0x68 0x60 0x68 */
118 XMM9_REG, /* 0x70 0x78 0x70 0x78 */
119 XMM8_REG, /* 0x80 0x88 0x80 0x88 */
120 XMM7_REG, /* 0x90 0x98 0x90 0x98 */
121 XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
122 SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
123 DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
124 BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
125 BP_REG, /* 0xc0 0xc8 N/A N/A */
126 R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
127 R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
128 R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
129 R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
130 };
131
132 /* Instantiate static const values. */
133 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
134 const unsigned xlogue_layout::MIN_REGS;
135 const unsigned xlogue_layout::MAX_REGS;
136 const unsigned xlogue_layout::MAX_EXTRA_REGS;
137 const unsigned xlogue_layout::VARIANT_COUNT;
138 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
139
140 /* Initialize xlogue_layout::s_stub_names to zero. */
141 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
142 [STUB_NAME_MAX_LEN];
143
144 /* Instantiates all xlogue_layout instances. */
145 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
146 xlogue_layout (0, false),
147 xlogue_layout (8, false),
148 xlogue_layout (0, true),
149 xlogue_layout (8, true)
150 };
151
152 /* Return an appropriate const instance of xlogue_layout based upon values
153 in cfun->machine and crtl. */
154 const class xlogue_layout &
155 xlogue_layout::get_instance ()
156 {
157 enum xlogue_stub_sets stub_set;
158 bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
159
160 if (stack_realign_fp)
161 stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
162 else if (frame_pointer_needed)
163 stub_set = aligned_plus_8
164 ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
165 : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
166 else
167 stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
168
169 return s_instances[stub_set];
170 }
171
172 /* Determine how many clobbered registers can be saved by the stub.
173 Returns the count of registers the stub will save and restore. */
174 unsigned
175 xlogue_layout::count_stub_managed_regs ()
176 {
177 bool hfp = frame_pointer_needed || stack_realign_fp;
178 unsigned i, count;
179 unsigned regno;
180
181 for (count = i = MIN_REGS; i < MAX_REGS; ++i)
182 {
183 regno = REG_ORDER[i];
184 if (regno == BP_REG && hfp)
185 continue;
186 if (!ix86_save_reg (regno, false, false))
187 break;
188 ++count;
189 }
190 return count;
191 }
192
193 /* Determine if register REGNO is a stub managed register given the
194 total COUNT of stub managed registers. */
195 bool
196 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
197 {
198 bool hfp = frame_pointer_needed || stack_realign_fp;
199 unsigned i;
200
201 for (i = 0; i < count; ++i)
202 {
203 gcc_assert (i < MAX_REGS);
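/* When a frame pointer is used BP_REG is not managed by the stub (see
   count_stub_managed_regs), so widen the scan by one slot to still cover
   COUNT stub-managed registers.  */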
204 if (REG_ORDER[i] == BP_REG && hfp)
205 ++count;
206 else if (REG_ORDER[i] == regno)
207 return true;
208 }
209 return false;
210 }
211
212 /* Constructor for xlogue_layout. */
213 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
214 : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
215 m_stack_align_off_in (stack_align_off_in)
216 {
217 HOST_WIDE_INT offset = stack_align_off_in;
218 unsigned i, j;
219
220 for (i = j = 0; i < MAX_REGS; ++i)
221 {
222 unsigned regno = REG_ORDER[i];
223
224 if (regno == BP_REG && hfp)
225 continue;
226 if (SSE_REGNO_P (regno))
227 {
228 offset += 16;
229 /* Verify that SSE regs are always aligned. */
230 gcc_assert (!((stack_align_off_in + offset) & 15));
231 }
232 else
233 offset += 8;
234
235 m_regs[j].regno = regno;
236 m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
237 }
238 gcc_assert (j == m_nregs);
239 }
240
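/* Return the name of stub STUB for the currently selected ISA (AVX or SSE)
   and N_EXTRA_REGS registers beyond MIN_REGS.  The name is formatted into
   s_stub_names on first use and cached there.  */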
241 const char *
242 xlogue_layout::get_stub_name (enum xlogue_stub stub,
243 unsigned n_extra_regs)
244 {
245 const int have_avx = TARGET_AVX;
246 char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
247
248 /* Lazy init */
249 if (!*name)
250 {
251 int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
252 (have_avx ? "avx" : "sse"),
253 STUB_BASE_NAMES[stub],
254 MIN_REGS + n_extra_regs);
255 gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
256 }
257
258 return name;
259 }
260
261 /* Return rtx of a symbol ref for the entry point (based upon
262 cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
263 rtx
264 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
265 {
266 const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
267 gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
268 gcc_assert (stub < XLOGUE_STUB_COUNT);
269 gcc_assert (crtl->stack_realign_finalized);
270
271 return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
272 }
273
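/* Counter used to assign a unique chain_id to each new chain; the id is
   used in dump output.  */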
274 unsigned scalar_chain::max_id = 0;
275
276 namespace {
277
278 /* Initialize new chain. */
279
280 scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
281 {
282 smode = smode_;
283 vmode = vmode_;
284
285 chain_id = ++max_id;
286
287 if (dump_file)
288 fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
289
290 bitmap_obstack_initialize (NULL);
291 insns = BITMAP_ALLOC (NULL);
292 defs = BITMAP_ALLOC (NULL);
293 defs_conv = BITMAP_ALLOC (NULL);
294 queue = NULL;
295 }
296
297 /* Free chain's data. */
298
299 scalar_chain::~scalar_chain ()
300 {
301 BITMAP_FREE (insns);
302 BITMAP_FREE (defs);
303 BITMAP_FREE (defs_conv);
304 bitmap_obstack_release (NULL);
305 }
306
307 /* Add instruction into the chain's queue. */
308
309 void
310 scalar_chain::add_to_queue (unsigned insn_uid)
311 {
312 if (bitmap_bit_p (insns, insn_uid)
313 || bitmap_bit_p (queue, insn_uid))
314 return;
315
316 if (dump_file)
317 fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
318 insn_uid, chain_id);
319 bitmap_set_bit (queue, insn_uid);
320 }
321
322 general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
323 enum machine_mode vmode_)
324 : scalar_chain (smode_, vmode_)
325 {
326 insns_conv = BITMAP_ALLOC (NULL);
327 n_sse_to_integer = 0;
328 n_integer_to_sse = 0;
329 }
330
331 general_scalar_chain::~general_scalar_chain ()
332 {
333 BITMAP_FREE (insns_conv);
334 }
335
336 /* For DImode conversion, mark register defined by DEF as requiring
337 conversion. */
338
339 void
340 general_scalar_chain::mark_dual_mode_def (df_ref def)
341 {
342 gcc_assert (DF_REF_REG_DEF_P (def));
343
344 /* Record the def/insn pair so we can later efficiently iterate over
345 the defs to convert on insns not in the chain. */
346 bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
347 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
348 {
349 if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
350 && !reg_new)
351 return;
352 n_integer_to_sse++;
353 }
354 else
355 {
356 if (!reg_new)
357 return;
358 n_sse_to_integer++;
359 }
360
361 if (dump_file)
362 fprintf (dump_file,
363 " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
364 DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
365 }
366
367 /* For TImode conversion, it is unused. */
368
369 void
370 timode_scalar_chain::mark_dual_mode_def (df_ref)
371 {
372 gcc_unreachable ();
373 }
374
375 /* Check REF's chain to add new insns into a queue
376 and find registers requiring conversion. */
377
378 void
379 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
380 {
381 df_link *chain;
382
383 gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
384 || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
385 add_to_queue (DF_REF_INSN_UID (ref));
386
387 for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
388 {
389 unsigned uid = DF_REF_INSN_UID (chain->ref);
390
391 if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
392 continue;
393
394 if (!DF_REF_REG_MEM_P (chain->ref))
395 {
396 if (bitmap_bit_p (insns, uid))
397 continue;
398
399 if (bitmap_bit_p (candidates, uid))
400 {
401 add_to_queue (uid);
402 continue;
403 }
404 }
405
406 if (DF_REF_REG_DEF_P (chain->ref))
407 {
408 if (dump_file)
409 fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
410 DF_REF_REGNO (chain->ref), uid);
411 mark_dual_mode_def (chain->ref);
412 }
413 else
414 {
415 if (dump_file)
416 fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
417 DF_REF_REGNO (chain->ref), uid);
418 mark_dual_mode_def (ref);
419 }
420 }
421 }
422
423 /* Add instruction into a chain. */
424
425 void
426 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
427 {
428 if (bitmap_bit_p (insns, insn_uid))
429 return;
430
431 if (dump_file)
432 fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
433
434 bitmap_set_bit (insns, insn_uid);
435
436 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
437 rtx def_set = single_set (insn);
438 if (def_set && REG_P (SET_DEST (def_set))
439 && !HARD_REGISTER_P (SET_DEST (def_set)))
440 bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
441
442 /* ??? The following is quadratic since analyze_register_chain
443 iterates over all refs to look for dual-mode regs. Instead this
444 should be done separately for all regs mentioned in the chain once. */
445 df_ref ref;
446 for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
447 if (!HARD_REGISTER_P (DF_REF_REG (ref)))
448 analyze_register_chain (candidates, ref);
449 for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
450 if (!DF_REF_REG_MEM_P (ref))
451 analyze_register_chain (candidates, ref);
452 }
453
454 /* Build new chain starting from insn INSN_UID recursively
455 adding all dependent uses and definitions. */
456
457 void
458 scalar_chain::build (bitmap candidates, unsigned insn_uid)
459 {
460 queue = BITMAP_ALLOC (NULL);
461 bitmap_set_bit (queue, insn_uid);
462
463 if (dump_file)
464 fprintf (dump_file, "Building chain #%d...\n", chain_id);
465
466 while (!bitmap_empty_p (queue))
467 {
468 insn_uid = bitmap_first_set_bit (queue);
469 bitmap_clear_bit (queue, insn_uid);
470 bitmap_clear_bit (candidates, insn_uid);
471 add_insn (candidates, insn_uid);
472 }
473
474 if (dump_file)
475 {
476 fprintf (dump_file, "Collected chain #%d...\n", chain_id);
477 fprintf (dump_file, " insns: ");
478 dump_bitmap (dump_file, insns);
479 if (!bitmap_empty_p (defs_conv))
480 {
481 bitmap_iterator bi;
482 unsigned id;
483 const char *comma = "";
484 fprintf (dump_file, " defs to convert: ");
485 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
486 {
487 fprintf (dump_file, "%sr%d", comma, id);
488 comma = ", ";
489 }
490 fprintf (dump_file, "\n");
491 }
492 }
493
494 BITMAP_FREE (queue);
495 }
496
497 /* Return the cost of building a vector constant
498 instead of using a scalar one. */
499
500 int
501 general_scalar_chain::vector_const_cost (rtx exp)
502 {
503 gcc_assert (CONST_INT_P (exp));
504
505 if (standard_sse_constant_p (exp, vmode))
506 return ix86_cost->sse_op;
507 /* We have separate costs for SImode and DImode, use SImode costs
508 for smaller modes. */
509 return ix86_cost->sse_load[smode == DImode ? 1 : 0];
510 }
511
512 /* Compute a gain for chain conversion. */
513
514 int
515 general_scalar_chain::compute_convert_gain ()
516 {
517 bitmap_iterator bi;
518 unsigned insn_uid;
519 int gain = 0;
520 int cost = 0;
521
522 if (dump_file)
523 fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
524
525 /* SSE costs distinguish between SImode and DImode loads/stores; for
526 integer costs we factor in the number of GPRs involved. When supporting
527 smaller modes than SImode the int load/store costs need to be
528 adjusted as well. */
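/* m is the number of integer registers (and hence scalar instructions)
   one operation touches: two for a DImode chain on 32-bit targets,
   otherwise one.  */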
529 unsigned sse_cost_idx = smode == DImode ? 1 : 0;
530 unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
531
532 EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
533 {
534 rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
535 rtx def_set = single_set (insn);
536 rtx src = SET_SRC (def_set);
537 rtx dst = SET_DEST (def_set);
538 int igain = 0;
539
540 if (REG_P (src) && REG_P (dst))
541 igain += 2 * m - ix86_cost->xmm_move;
542 else if (REG_P (src) && MEM_P (dst))
543 igain
544 += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
545 else if (MEM_P (src) && REG_P (dst))
546 igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
547 else
548 switch (GET_CODE (src))
549 {
550 case ASHIFT:
551 case ASHIFTRT:
552 case LSHIFTRT:
553 if (m == 2)
554 {
555 if (INTVAL (XEXP (src, 1)) >= 32)
556 igain += ix86_cost->add;
557 else
558 igain += ix86_cost->shift_const;
559 }
560
561 igain += ix86_cost->shift_const - ix86_cost->sse_op;
562
563 if (CONST_INT_P (XEXP (src, 0)))
564 igain -= vector_const_cost (XEXP (src, 0));
565 break;
566
567 case AND:
568 case IOR:
569 case XOR:
570 case PLUS:
571 case MINUS:
572 igain += m * ix86_cost->add - ix86_cost->sse_op;
573 /* Additional gain for andnot for targets without BMI. */
574 if (GET_CODE (XEXP (src, 0)) == NOT
575 && !TARGET_BMI)
576 igain += m * ix86_cost->add;
577
578 if (CONST_INT_P (XEXP (src, 0)))
579 igain -= vector_const_cost (XEXP (src, 0));
580 if (CONST_INT_P (XEXP (src, 1)))
581 igain -= vector_const_cost (XEXP (src, 1));
582 break;
583
584 case NEG:
585 case NOT:
586 igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
587
588 if (GET_CODE (XEXP (src, 0)) != ABS)
589 {
590 igain += m * ix86_cost->add;
591 break;
592 }
593 /* FALLTHRU */
594
595 case ABS:
596 case SMAX:
597 case SMIN:
598 case UMAX:
599 case UMIN:
600 /* We do not have any conditional move cost, estimate it as a
601 reg-reg move. Comparisons are costed as adds. */
602 igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
603 /* Integer SSE ops are all costed the same. */
604 igain -= ix86_cost->sse_op;
605 break;
606
607 case COMPARE:
608 /* Assume comparison cost is the same. */
609 break;
610
611 case CONST_INT:
612 if (REG_P (dst))
613 {
614 if (optimize_insn_for_size_p ())
615 {
616 /* xor (2 bytes) vs. xorps (3 bytes). */
617 if (src == const0_rtx)
618 igain -= COSTS_N_BYTES (1);
619 /* movdi_internal vs. movv2di_internal. */
620 /* => mov (5 bytes) vs. movaps (7 bytes). */
621 else if (x86_64_immediate_operand (src, SImode))
622 igain -= COSTS_N_BYTES (2);
623 else
624 /* ??? Larger immediate constants are placed in the
625 constant pool, where the size benefit/impact of
626 STV conversion is affected by whether and how
627 often each constant pool entry is shared/reused.
628 The value below is empirically derived from the
629 CSiBE benchmark (and the optimal value may drift
630 over time). */
631 igain += COSTS_N_BYTES (0);
632 }
633 else
634 {
635 /* DImode can be immediate for TARGET_64BIT
636 and SImode always. */
637 igain += m * COSTS_N_INSNS (1);
638 igain -= vector_const_cost (src);
639 }
640 }
641 else if (MEM_P (dst))
642 {
643 igain += (m * ix86_cost->int_store[2]
644 - ix86_cost->sse_store[sse_cost_idx]);
645 igain -= vector_const_cost (src);
646 }
647 break;
648
649 default:
650 gcc_unreachable ();
651 }
652
653 if (igain != 0 && dump_file)
654 {
655 fprintf (dump_file, " Instruction gain %d for ", igain);
656 dump_insn_slim (dump_file, insn);
657 }
658 gain += igain;
659 }
660
661 if (dump_file)
662 fprintf (dump_file, " Instruction conversion gain: %d\n", gain);
663
664 /* Cost the integer to sse and sse to integer moves. */
665 cost += n_sse_to_integer * ix86_cost->sse_to_integer;
666 /* ??? integer_to_sse but we only have that in the RA cost table.
667 Assume sse_to_integer/integer_to_sse are the same which they
668 are at the moment. */
669 cost += n_integer_to_sse * ix86_cost->sse_to_integer;
670
671 if (dump_file)
672 fprintf (dump_file, " Registers conversion cost: %d\n", cost);
673
674 gain -= cost;
675
676 if (dump_file)
677 fprintf (dump_file, " Total gain: %d\n", gain);
678
679 return gain;
680 }
681
682 /* Insert generated conversion instruction sequence INSNS
683 after instruction AFTER. A new BB may be required if the
684 instruction has an EH region attached. */
685
686 void
687 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
688 {
689 if (!control_flow_insn_p (after))
690 {
691 emit_insn_after (insns, after);
692 return;
693 }
694
695 basic_block bb = BLOCK_FOR_INSN (after);
696 edge e = find_fallthru_edge (bb->succs);
697 gcc_assert (e);
698
699 basic_block new_bb = split_edge (e);
700 emit_insn_after (insns, BB_HEAD (new_bb));
701 }
702
703 } // anon namespace
704
705 /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
706 zeroing the upper parts. */
707
708 static rtx
709 gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
710 {
711 switch (GET_MODE_NUNITS (vmode))
712 {
713 case 1:
714 /* We are not using this case currently. */
715 gcc_unreachable ();
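/* Two-element vectors (e.g. DImode in a V2DImode chain): concatenate
   the GPR with a zero of the inner mode.  */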
716 case 2:
717 return gen_rtx_VEC_CONCAT (vmode, gpr,
718 CONST0_RTX (GET_MODE_INNER (vmode)));
719 default:
720 return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
721 CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
722 }
723 }
724
725 /* Make vector copies for all register REGNO definitions
726 and replace its uses in the chain. */
727
728 void
729 general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
730 {
731 rtx vreg = *defs_map.get (reg);
732
733 start_sequence ();
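/* Without direct GPR->XMM moves, bounce the value through a stack
   temporary and load the vector register from there.  */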
734 if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
735 {
736 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
737 if (smode == DImode && !TARGET_64BIT)
738 {
739 emit_move_insn (adjust_address (tmp, SImode, 0),
740 gen_rtx_SUBREG (SImode, reg, 0));
741 emit_move_insn (adjust_address (tmp, SImode, 4),
742 gen_rtx_SUBREG (SImode, reg, 4));
743 }
744 else
745 emit_move_insn (copy_rtx (tmp), reg);
746 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
747 gen_gpr_to_xmm_move_src (vmode, tmp)));
748 }
749 else if (!TARGET_64BIT && smode == DImode)
750 {
751 if (TARGET_SSE4_1)
752 {
753 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
754 CONST0_RTX (V4SImode),
755 gen_rtx_SUBREG (SImode, reg, 0)));
756 emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
757 gen_rtx_SUBREG (V4SImode, vreg, 0),
758 gen_rtx_SUBREG (SImode, reg, 4),
759 GEN_INT (2)));
760 }
761 else
762 {
763 rtx tmp = gen_reg_rtx (DImode);
764 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
765 CONST0_RTX (V4SImode),
766 gen_rtx_SUBREG (SImode, reg, 0)));
767 emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
768 CONST0_RTX (V4SImode),
769 gen_rtx_SUBREG (SImode, reg, 4)));
770 emit_insn (gen_vec_interleave_lowv4si
771 (gen_rtx_SUBREG (V4SImode, vreg, 0),
772 gen_rtx_SUBREG (V4SImode, vreg, 0),
773 gen_rtx_SUBREG (V4SImode, tmp, 0)));
774 }
775 }
776 else
777 emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
778 gen_gpr_to_xmm_move_src (vmode, reg)));
779 rtx_insn *seq = get_insns ();
780 end_sequence ();
781 emit_conversion_insns (seq, insn);
782
783 if (dump_file)
784 fprintf (dump_file,
785 " Copied r%d to a vector register r%d for insn %d\n",
786 REGNO (reg), REGNO (vreg), INSN_UID (insn));
787 }
788
789 /* Copy the definition SRC of INSN inside the chain to DST for
790 scalar uses outside of the chain. */
791
792 void
793 general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
794 {
795 start_sequence ();
796 if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
797 {
798 rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
799 emit_move_insn (tmp, src);
800 if (!TARGET_64BIT && smode == DImode)
801 {
802 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
803 adjust_address (tmp, SImode, 0));
804 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
805 adjust_address (tmp, SImode, 4));
806 }
807 else
808 emit_move_insn (dst, copy_rtx (tmp));
809 }
810 else if (!TARGET_64BIT && smode == DImode)
811 {
812 if (TARGET_SSE4_1)
813 {
814 rtx tmp = gen_rtx_PARALLEL (VOIDmode,
815 gen_rtvec (1, const0_rtx));
816 emit_insn
817 (gen_rtx_SET
818 (gen_rtx_SUBREG (SImode, dst, 0),
819 gen_rtx_VEC_SELECT (SImode,
820 gen_rtx_SUBREG (V4SImode, src, 0),
821 tmp)));
822
823 tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
824 emit_insn
825 (gen_rtx_SET
826 (gen_rtx_SUBREG (SImode, dst, 4),
827 gen_rtx_VEC_SELECT (SImode,
828 gen_rtx_SUBREG (V4SImode, src, 0),
829 tmp)));
830 }
831 else
832 {
833 rtx vcopy = gen_reg_rtx (V2DImode);
834 emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
835 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
836 gen_rtx_SUBREG (SImode, vcopy, 0));
837 emit_move_insn (vcopy,
838 gen_rtx_LSHIFTRT (V2DImode,
839 vcopy, GEN_INT (32)));
840 emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
841 gen_rtx_SUBREG (SImode, vcopy, 0));
842 }
843 }
844 else
845 emit_move_insn (dst, src);
846
847 rtx_insn *seq = get_insns ();
848 end_sequence ();
849 emit_conversion_insns (seq, insn);
850
851 if (dump_file)
852 fprintf (dump_file,
853 " Copied r%d to a scalar register r%d for insn %d\n",
854 REGNO (src), REGNO (dst), INSN_UID (insn));
855 }
856
857 /* Convert operand OP in INSN. We should handle
858 memory operands and uninitialized registers.
859 All other register uses are converted during
860 register conversion. */
861
862 void
863 general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
864 {
865 *op = copy_rtx_if_shared (*op);
866
867 if (GET_CODE (*op) == NOT)
868 {
869 convert_op (&XEXP (*op, 0), insn);
870 PUT_MODE (*op, vmode);
871 }
872 else if (MEM_P (*op))
873 {
874 rtx_insn* eh_insn, *movabs = NULL;
875 rtx tmp = gen_reg_rtx (GET_MODE (*op));
876
877 /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
878 if (!memory_operand (*op, GET_MODE (*op)))
879 {
880 rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
881 movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
882
883 *op = tmp2;
884 }
885
886 eh_insn
887 = emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
888 gen_gpr_to_xmm_move_src (vmode, *op)),
889 insn);
890
891 if (cfun->can_throw_non_call_exceptions)
892 {
893 /* Handle REG_EH_REGION note. */
894 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
895 if (note)
896 {
897 if (movabs)
898 eh_insn = movabs;
899 control_flow_insns.safe_push (eh_insn);
900 add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
901 }
902 }
903
904 *op = gen_rtx_SUBREG (vmode, tmp, 0);
905
906 if (dump_file)
907 fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
908 INSN_UID (insn), REGNO (tmp));
909 }
910 else if (REG_P (*op))
911 {
912 *op = gen_rtx_SUBREG (vmode, *op, 0);
913 }
914 else if (CONST_INT_P (*op))
915 {
916 rtx vec_cst;
917 rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
918
919 /* Prefer all ones vector in case of -1. */
920 if (constm1_operand (*op, GET_MODE (*op)))
921 vec_cst = CONSTM1_RTX (vmode);
922 else
923 {
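/* Otherwise build the vector constant { *op, 0, ..., 0 }; if it is not
   a standard SSE constant it is forced to the constant pool below.  */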
924 unsigned n = GET_MODE_NUNITS (vmode);
925 rtx *v = XALLOCAVEC (rtx, n);
926 v[0] = *op;
927 for (unsigned i = 1; i < n; ++i)
928 v[i] = const0_rtx;
929 vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
930 }
931
932 if (!standard_sse_constant_p (vec_cst, vmode))
933 {
934 start_sequence ();
935 vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
936 rtx_insn *seq = get_insns ();
937 end_sequence ();
938 emit_insn_before (seq, insn);
939 }
940
941 emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
942 *op = tmp;
943 }
944 else
945 {
946 gcc_assert (SUBREG_P (*op));
947 gcc_assert (GET_MODE (*op) == vmode);
948 }
949 }
950
951 /* Convert INSN to vector mode. */
952
953 void
954 general_scalar_chain::convert_insn (rtx_insn *insn)
955 {
956 /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
957 for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
958 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
959 {
960 df_link *use;
961 for (use = DF_REF_CHAIN (ref); use; use = use->next)
962 if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
963 && (DF_REF_REG_MEM_P (use->ref)
964 || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
965 break;
966 if (use)
967 convert_reg (insn, DF_REF_REG (ref),
968 *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
969 else if (MAY_HAVE_DEBUG_BIND_INSNS)
970 {
971 /* If we generated a scalar copy we can leave debug-insns
972 as-is, if not, we have to adjust them. */
973 auto_vec<rtx_insn *, 5> to_reset_debug_insns;
974 for (use = DF_REF_CHAIN (ref); use; use = use->next)
975 if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
976 {
977 rtx_insn *debug_insn = DF_REF_INSN (use->ref);
978 /* If there's a reaching definition outside of the
979 chain we have to reset. */
980 df_link *def;
981 for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
982 if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
983 break;
984 if (def)
985 to_reset_debug_insns.safe_push (debug_insn);
986 else
987 {
988 *DF_REF_REAL_LOC (use->ref)
989 = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
990 df_insn_rescan (debug_insn);
991 }
992 }
993 /* Have to do the reset outside of the DF_CHAIN walk to not
994 disrupt it. */
995 while (!to_reset_debug_insns.is_empty ())
996 {
997 rtx_insn *debug_insn = to_reset_debug_insns.pop ();
998 INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
999 df_insn_rescan_debug_internal (debug_insn);
1000 }
1001 }
1002 }
1003
1004 /* Replace uses in this insn with the defs we use in the chain. */
1005 for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1006 if (!DF_REF_REG_MEM_P (ref))
1007 if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1008 {
1009 /* Also update a corresponding REG_DEAD note. */
1010 rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1011 if (note)
1012 XEXP (note, 0) = *vreg;
1013 *DF_REF_REAL_LOC (ref) = *vreg;
1014 }
1015
1016 rtx def_set = single_set (insn);
1017 rtx src = SET_SRC (def_set);
1018 rtx dst = SET_DEST (def_set);
1019 rtx subreg;
1020
1021 if (MEM_P (dst) && !REG_P (src))
1022 {
1023 /* There are no scalar integer instructions and therefore
1024 temporary register usage is required. */
1025 rtx tmp = gen_reg_rtx (smode);
1026 emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1027 dst = gen_rtx_SUBREG (vmode, tmp, 0);
1028 }
1029 else if (REG_P (dst))
1030 {
1031 /* Replace the definition with a SUBREG to the definition we
1032 use inside the chain. */
1033 rtx *vdef = defs_map.get (dst);
1034 if (vdef)
1035 dst = *vdef;
1036 dst = gen_rtx_SUBREG (vmode, dst, 0);
1037 /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1038 is a non-REG_P. So kill those off. */
1039 rtx note = find_reg_equal_equiv_note (insn);
1040 if (note)
1041 remove_note (insn, note);
1042 }
1043
1044 switch (GET_CODE (src))
1045 {
1046 case PLUS:
1047 case MINUS:
1048 case IOR:
1049 case XOR:
1050 case AND:
1051 case SMAX:
1052 case SMIN:
1053 case UMAX:
1054 case UMIN:
1055 convert_op (&XEXP (src, 1), insn);
1056 /* FALLTHRU */
1057
1058 case ABS:
1059 case ASHIFT:
1060 case ASHIFTRT:
1061 case LSHIFTRT:
1062 convert_op (&XEXP (src, 0), insn);
1063 PUT_MODE (src, vmode);
1064 break;
1065
1066 case NEG:
1067 src = XEXP (src, 0);
1068
1069 if (GET_CODE (src) == ABS)
1070 {
1071 src = XEXP (src, 0);
1072 convert_op (&src, insn);
1073 subreg = gen_reg_rtx (vmode);
1074 emit_insn_before (gen_rtx_SET (subreg,
1075 gen_rtx_ABS (vmode, src)), insn);
1076 src = subreg;
1077 }
1078 else
1079 convert_op (&src, insn);
1080
1081 subreg = gen_reg_rtx (vmode);
1082 emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1083 src = gen_rtx_MINUS (vmode, subreg, src);
1084 break;
1085
1086 case NOT:
1087 src = XEXP (src, 0);
1088 convert_op (&src, insn);
1089 subreg = gen_reg_rtx (vmode);
1090 emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1091 src = gen_rtx_XOR (vmode, src, subreg);
1092 break;
1093
1094 case MEM:
1095 if (!REG_P (dst))
1096 convert_op (&src, insn);
1097 break;
1098
1099 case REG:
1100 if (!MEM_P (dst))
1101 convert_op (&src, insn);
1102 break;
1103
1104 case SUBREG:
1105 gcc_assert (GET_MODE (src) == vmode);
1106 break;
1107
1108 case COMPARE:
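/* The only COMPARE handled here is the DImode zero test matched by
   convertible_comparison_p.  Interleave the value with itself so both
   V2DImode lanes hold it, then set the flags register with a PTEST of
   the result against itself.  */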
1109 src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
1110
1111 gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
1112 subreg = gen_rtx_SUBREG (V2DImode, src, 0);
1113 emit_insn_before (gen_vec_interleave_lowv2di
1114 (copy_rtx_if_shared (subreg),
1115 copy_rtx_if_shared (subreg),
1116 copy_rtx_if_shared (subreg)),
1117 insn);
1118 dst = gen_rtx_REG (CCmode, FLAGS_REG);
1119 src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
1120 copy_rtx_if_shared (subreg)),
1121 UNSPEC_PTEST);
1122 break;
1123
1124 case CONST_INT:
1125 convert_op (&src, insn);
1126 break;
1127
1128 default:
1129 gcc_unreachable ();
1130 }
1131
1132 SET_SRC (def_set) = src;
1133 SET_DEST (def_set) = dst;
1134
1135 /* Drop possible dead definitions. */
1136 PATTERN (insn) = def_set;
1137
1138 INSN_CODE (insn) = -1;
1139 int patt = recog_memoized (insn);
1140 if (patt == -1)
1141 fatal_insn_not_found (insn);
1142 df_insn_rescan (insn);
1143 }
1144
1145 /* Fix uses of converted REG in debug insns. */
1146
1147 void
1148 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1149 {
1150 if (!flag_var_tracking)
1151 return;
1152
1153 df_ref ref, next;
1154 for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1155 {
1156 rtx_insn *insn = DF_REF_INSN (ref);
1157 /* Make sure the next ref is for a different instruction,
1158 so that we're not affected by the rescan. */
1159 next = DF_REF_NEXT_REG (ref);
1160 while (next && DF_REF_INSN (next) == insn)
1161 next = DF_REF_NEXT_REG (next);
1162
1163 if (DEBUG_INSN_P (insn))
1164 {
1165 /* It may be a debug insn with a TImode variable in
1166 register. */
1167 bool changed = false;
1168 for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1169 {
1170 rtx *loc = DF_REF_LOC (ref);
1171 if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1172 {
1173 *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1174 changed = true;
1175 }
1176 }
1177 if (changed)
1178 df_insn_rescan (insn);
1179 }
1180 }
1181 }
1182
1183 /* Convert INSN from TImode to V1TImode. */
1184
1185 void
1186 timode_scalar_chain::convert_insn (rtx_insn *insn)
1187 {
1188 rtx def_set = single_set (insn);
1189 rtx src = SET_SRC (def_set);
1190 rtx dst = SET_DEST (def_set);
1191
1192 switch (GET_CODE (dst))
1193 {
1194 case REG:
1195 {
1196 rtx tmp = find_reg_equal_equiv_note (insn);
1197 if (tmp)
1198 PUT_MODE (XEXP (tmp, 0), V1TImode);
1199 PUT_MODE (dst, V1TImode);
1200 fix_debug_reg_uses (dst);
1201 }
1202 break;
1203 case MEM:
1204 PUT_MODE (dst, V1TImode);
1205 break;
1206
1207 default:
1208 gcc_unreachable ();
1209 }
1210
1211 switch (GET_CODE (src))
1212 {
1213 case REG:
1214 PUT_MODE (src, V1TImode);
1215 /* Call fix_debug_reg_uses only if SRC is never defined. */
1216 if (!DF_REG_DEF_CHAIN (REGNO (src)))
1217 fix_debug_reg_uses (src);
1218 break;
1219
1220 case MEM:
1221 PUT_MODE (src, V1TImode);
1222 break;
1223
1224 case CONST_WIDE_INT:
1225 if (NONDEBUG_INSN_P (insn))
1226 {
1227 /* Since there are no instructions to store a 128-bit constant,
1228 a temporary register is required. */
1229 rtx tmp = gen_reg_rtx (V1TImode);
1230 start_sequence ();
1231 src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
1232 src = validize_mem (force_const_mem (V1TImode, src));
1233 rtx_insn *seq = get_insns ();
1234 end_sequence ();
1235 if (seq)
1236 emit_insn_before (seq, insn);
1237 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1238 dst = tmp;
1239 }
1240 break;
1241
1242 case CONST_INT:
1243 switch (standard_sse_constant_p (src, TImode))
1244 {
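/* standard_sse_constant_p returns 1 for the all-zeros constant and 2 for
   all-ones; substitute the corresponding vector constant.  */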
1245 case 1:
1246 src = CONST0_RTX (GET_MODE (dst));
1247 break;
1248 case 2:
1249 src = CONSTM1_RTX (GET_MODE (dst));
1250 break;
1251 default:
1252 gcc_unreachable ();
1253 }
1254 if (NONDEBUG_INSN_P (insn))
1255 {
1256 rtx tmp = gen_reg_rtx (V1TImode);
1257 /* Since there are no instructions to store a standard SSE
1258 constant, a temporary register is required. */
1259 emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1260 dst = tmp;
1261 }
1262 break;
1263
1264 default:
1265 gcc_unreachable ();
1266 }
1267
1268 SET_SRC (def_set) = src;
1269 SET_DEST (def_set) = dst;
1270
1271 /* Drop possible dead definitions. */
1272 PATTERN (insn) = def_set;
1273
1274 INSN_CODE (insn) = -1;
1275 recog_memoized (insn);
1276 df_insn_rescan (insn);
1277 }
1278
1279 /* Generate copies from defs used by the chain but not defined therein.
1280 Also populates defs_map which is used later by convert_insn. */
1281
1282 void
1283 general_scalar_chain::convert_registers ()
1284 {
1285 bitmap_iterator bi;
1286 unsigned id;
1287 EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1288 {
1289 rtx chain_reg = gen_reg_rtx (smode);
1290 defs_map.put (regno_reg_rtx[id], chain_reg);
1291 }
1292 EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
1293 for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
1294 if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1295 make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
1296 }
1297
1298 /* Convert the whole chain, creating the required register
1299 conversions and copies. */
1300
1301 int
1302 scalar_chain::convert ()
1303 {
1304 bitmap_iterator bi;
1305 unsigned id;
1306 int converted_insns = 0;
1307
1308 if (!dbg_cnt (stv_conversion))
1309 return 0;
1310
1311 if (dump_file)
1312 fprintf (dump_file, "Converting chain #%d...\n", chain_id);
1313
1314 convert_registers ();
1315
1316 EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
1317 {
1318 convert_insn (DF_INSN_UID_GET (id)->insn);
1319 converted_insns++;
1320 }
1321
1322 return converted_insns;
1323 }
1324
1325 /* Return the SET expression if INSN doesn't reference hard register.
1326 Return NULL if INSN uses or defines a hard register, excluding
1327 pseudo register pushes, hard register uses in a memory address,
1328 clobbers and flags definitions. */
1329
1330 static rtx
1331 pseudo_reg_set (rtx_insn *insn)
1332 {
1333 rtx set = single_set (insn);
1334 if (!set)
1335 return NULL;
1336
1337 /* Check pseudo register push first. */
1338 machine_mode mode = TARGET_64BIT ? TImode : DImode;
1339 if (REG_P (SET_SRC (set))
1340 && !HARD_REGISTER_P (SET_SRC (set))
1341 && push_operand (SET_DEST (set), mode))
1342 return set;
1343
1344 df_ref ref;
1345 FOR_EACH_INSN_DEF (ref, insn)
1346 if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
1347 && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
1348 && DF_REF_REGNO (ref) != FLAGS_REG)
1349 return NULL;
1350
1351 FOR_EACH_INSN_USE (ref, insn)
1352 if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
1353 return NULL;
1354
1355 return set;
1356 }
1357
1358 /* Check if comparison INSN may be transformed
1359 into vector comparison. Currently we transform
1360 zero checks only which look like:
1361
1362 (set (reg:CCZ 17 flags)
1363 (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
1364 (subreg:SI (reg:DI x) 0))
1365 (const_int 0 [0]))) */
1366
1367 static bool
1368 convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
1369 {
1370 /* ??? Currently convertible for double-word DImode chain only. */
1371 if (TARGET_64BIT || mode != DImode)
1372 return false;
1373
1374 if (!TARGET_SSE4_1)
1375 return false;
1376
1377 rtx def_set = single_set (insn);
1378
1379 gcc_assert (def_set);
1380
1381 rtx src = SET_SRC (def_set);
1382 rtx dst = SET_DEST (def_set);
1383
1384 gcc_assert (GET_CODE (src) == COMPARE);
1385
1386 if (GET_CODE (dst) != REG
1387 || REGNO (dst) != FLAGS_REG
1388 || GET_MODE (dst) != CCZmode)
1389 return false;
1390
1391 rtx op1 = XEXP (src, 0);
1392 rtx op2 = XEXP (src, 1);
1393
1394 if (op2 != CONST0_RTX (GET_MODE (op2)))
1395 return false;
1396
1397 if (GET_CODE (op1) != IOR)
1398 return false;
1399
1400 op2 = XEXP (op1, 1);
1401 op1 = XEXP (op1, 0);
1402
1403 if (!SUBREG_P (op1)
1404 || !SUBREG_P (op2)
1405 || GET_MODE (op1) != SImode
1406 || GET_MODE (op2) != SImode
1407 || ((SUBREG_BYTE (op1) != 0
1408 || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
1409 && (SUBREG_BYTE (op2) != 0
1410 || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
1411 return false;
1412
1413 op1 = SUBREG_REG (op1);
1414 op2 = SUBREG_REG (op2);
1415
1416 if (op1 != op2
1417 || !REG_P (op1)
1418 || GET_MODE (op1) != DImode)
1419 return false;
1420
1421 return true;
1422 }
1423
1424 /* The general version of scalar_to_vector_candidate_p. */
1425
1426 static bool
1427 general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
1428 {
1429 rtx def_set = pseudo_reg_set (insn);
1430
1431 if (!def_set)
1432 return false;
1433
1434 rtx src = SET_SRC (def_set);
1435 rtx dst = SET_DEST (def_set);
1436
1437 if (GET_CODE (src) == COMPARE)
1438 return convertible_comparison_p (insn, mode);
1439
1440 /* We are interested in "mode" only. */
1441 if ((GET_MODE (src) != mode
1442 && !CONST_INT_P (src))
1443 || GET_MODE (dst) != mode)
1444 return false;
1445
1446 if (!REG_P (dst) && !MEM_P (dst))
1447 return false;
1448
1449 switch (GET_CODE (src))
1450 {
1451 case ASHIFTRT:
1452 if (!TARGET_AVX512VL)
1453 return false;
1454 /* FALLTHRU */
1455
1456 case ASHIFT:
1457 case LSHIFTRT:
1458 if (!CONST_INT_P (XEXP (src, 1))
1459 || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
1460 return false;
1461 break;
1462
1463 case SMAX:
1464 case SMIN:
1465 case UMAX:
1466 case UMIN:
1467 if ((mode == DImode && !TARGET_AVX512VL)
1468 || (mode == SImode && !TARGET_SSE4_1))
1469 return false;
1470 /* Fallthru. */
1471
1472 case AND:
1473 case IOR:
1474 case XOR:
1475 case PLUS:
1476 case MINUS:
1477 if (!REG_P (XEXP (src, 1))
1478 && !MEM_P (XEXP (src, 1))
1479 && !CONST_INT_P (XEXP (src, 1)))
1480 return false;
1481
1482 if (GET_MODE (XEXP (src, 1)) != mode
1483 && !CONST_INT_P (XEXP (src, 1)))
1484 return false;
1485
1486 /* Check for andnot case. */
1487 if (GET_CODE (src) != AND
1488 || GET_CODE (XEXP (src, 0)) != NOT)
1489 break;
1490
1491 src = XEXP (src, 0);
1492 /* FALLTHRU */
1493
1494 case NOT:
1495 break;
1496
1497 case NEG:
1498 /* Check for nabs case. */
1499 if (GET_CODE (XEXP (src, 0)) != ABS)
1500 break;
1501
1502 src = XEXP (src, 0);
1503 /* FALLTHRU */
1504
1505 case ABS:
1506 if ((mode == DImode && !TARGET_AVX512VL)
1507 || (mode == SImode && !TARGET_SSSE3))
1508 return false;
1509 break;
1510
1511 case REG:
1512 return true;
1513
1514 case MEM:
1515 case CONST_INT:
1516 return REG_P (dst);
1517
1518 default:
1519 return false;
1520 }
1521
1522 if (!REG_P (XEXP (src, 0))
1523 && !MEM_P (XEXP (src, 0))
1524 && !CONST_INT_P (XEXP (src, 0)))
1525 return false;
1526
1527 if (GET_MODE (XEXP (src, 0)) != mode
1528 && !CONST_INT_P (XEXP (src, 0)))
1529 return false;
1530
1531 return true;
1532 }
1533
1534 /* The TImode version of scalar_to_vector_candidate_p. */
1535
1536 static bool
1537 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1538 {
1539 rtx def_set = pseudo_reg_set (insn);
1540
1541 if (!def_set)
1542 return false;
1543
1544 rtx src = SET_SRC (def_set);
1545 rtx dst = SET_DEST (def_set);
1546
1547 /* Only TImode load and store are allowed. */
1548 if (GET_MODE (dst) != TImode)
1549 return false;
1550
1551 if (MEM_P (dst))
1552 {
1553 /* Check for a store. Memory must be aligned, or an unaligned store
1554 must be optimal. Only stores from a register, a standard SSE
1555 constant or a CONST_WIDE_INT generated from a piecewise store are supported.
1556
1557 ??? Verify performance impact before enabling CONST_INT for
1558 __int128 store. */
1559 if (misaligned_operand (dst, TImode)
1560 && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1561 return false;
1562
1563 switch (GET_CODE (src))
1564 {
1565 default:
1566 return false;
1567
1568 case REG:
1569 case CONST_WIDE_INT:
1570 return true;
1571
1572 case CONST_INT:
1573 return standard_sse_constant_p (src, TImode);
1574 }
1575 }
1576 else if (MEM_P (src))
1577 {
1578 /* Check for a load. Memory must be aligned, or an unaligned load
1579 must be optimal. */
1580 return (REG_P (dst)
1581 && (!misaligned_operand (src, TImode)
1582 || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1583 }
1584
1585 return false;
1586 }
1587
1588 /* For a register REGNO, scan instructions for its defs and uses.
1589 Put REGNO in REGS if a def or use isn't in CANDIDATES. */
1590
1591 static void
1592 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1593 unsigned int regno)
1594 {
1595 for (df_ref def = DF_REG_DEF_CHAIN (regno);
1596 def;
1597 def = DF_REF_NEXT_REG (def))
1598 {
1599 if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1600 {
1601 if (dump_file)
1602 fprintf (dump_file,
1603 "r%d has non convertible def in insn %d\n",
1604 regno, DF_REF_INSN_UID (def));
1605
1606 bitmap_set_bit (regs, regno);
1607 break;
1608 }
1609 }
1610
1611 for (df_ref ref = DF_REG_USE_CHAIN (regno);
1612 ref;
1613 ref = DF_REF_NEXT_REG (ref))
1614 {
1615 /* Debug instructions are skipped. */
1616 if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1617 && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1618 {
1619 if (dump_file)
1620 fprintf (dump_file,
1621 "r%d has non convertible use in insn %d\n",
1622 regno, DF_REF_INSN_UID (ref));
1623
1624 bitmap_set_bit (regs, regno);
1625 break;
1626 }
1627 }
1628 }
1629
1630 /* The TImode version of remove_non_convertible_regs. */
1631
1632 static void
1633 timode_remove_non_convertible_regs (bitmap candidates)
1634 {
1635 bitmap_iterator bi;
1636 unsigned id;
1637 bitmap regs = BITMAP_ALLOC (NULL);
1638
1639 EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1640 {
1641 rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1642 rtx dest = SET_DEST (def_set);
1643 rtx src = SET_SRC (def_set);
1644
1645 if ((!REG_P (dest)
1646 || bitmap_bit_p (regs, REGNO (dest))
1647 || HARD_REGISTER_P (dest))
1648 && (!REG_P (src)
1649 || bitmap_bit_p (regs, REGNO (src))
1650 || HARD_REGISTER_P (src)))
1651 continue;
1652
1653 if (REG_P (dest))
1654 timode_check_non_convertible_regs (candidates, regs,
1655 REGNO (dest));
1656
1657 if (REG_P (src))
1658 timode_check_non_convertible_regs (candidates, regs,
1659 REGNO (src));
1660 }
1661
1662 EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1663 {
1664 for (df_ref def = DF_REG_DEF_CHAIN (id);
1665 def;
1666 def = DF_REF_NEXT_REG (def))
1667 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1668 {
1669 if (dump_file)
1670 fprintf (dump_file, "Removing insn %d from candidates list\n",
1671 DF_REF_INSN_UID (def));
1672
1673 bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1674 }
1675
1676 for (df_ref ref = DF_REG_USE_CHAIN (id);
1677 ref;
1678 ref = DF_REF_NEXT_REG (ref))
1679 if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1680 {
1681 if (dump_file)
1682 fprintf (dump_file, "Removing insn %d from candidates list\n",
1683 DF_REF_INSN_UID (ref));
1684
1685 bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1686 }
1687 }
1688
1689 BITMAP_FREE (regs);
1690 }
1691
1692 /* Main STV pass function. Find and convert scalar
1693 instructions into vector mode when profitable. */
1694
1695 static unsigned int
1696 convert_scalars_to_vector (bool timode_p)
1697 {
1698 basic_block bb;
1699 int converted_insns = 0;
1700 auto_vec<rtx_insn *> control_flow_insns;
1701
1702 bitmap_obstack_initialize (NULL);
1703 const machine_mode cand_mode[3] = { SImode, DImode, TImode };
1704 const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
1705 bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
1706 for (unsigned i = 0; i < 3; ++i)
1707 bitmap_initialize (&candidates[i], &bitmap_default_obstack);
1708
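/* Compute dominators and build DF def-use/use-def chains; the chain
   discovery below walks these chains, and conversion may split edges
   and blocks.  */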
1709 calculate_dominance_info (CDI_DOMINATORS);
1710 df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
1711 df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
1712 df_analyze ();
1713
1714 /* Find all instructions we want to convert into vector mode. */
1715 if (dump_file)
1716 fprintf (dump_file, "Searching for mode conversion candidates...\n");
1717
1718 FOR_EACH_BB_FN (bb, cfun)
1719 {
1720 rtx_insn *insn;
1721 FOR_BB_INSNS (bb, insn)
1722 if (timode_p
1723 && timode_scalar_to_vector_candidate_p (insn))
1724 {
1725 if (dump_file)
1726 fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
1727 INSN_UID (insn));
1728
1729 bitmap_set_bit (&candidates[2], INSN_UID (insn));
1730 }
1731 else if (!timode_p)
1732 {
1733 /* Check {SI,DI}mode. */
1734 for (unsigned i = 0; i <= 1; ++i)
1735 if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
1736 {
1737 if (dump_file)
1738 fprintf (dump_file, " insn %d is marked as a %s candidate\n",
1739 INSN_UID (insn), i == 0 ? "SImode" : "DImode");
1740
1741 bitmap_set_bit (&candidates[i], INSN_UID (insn));
1742 break;
1743 }
1744 }
1745 }
1746
1747 if (timode_p)
1748 timode_remove_non_convertible_regs (&candidates[2]);
1749
1750 for (unsigned i = 0; i <= 2; ++i)
1751 if (!bitmap_empty_p (&candidates[i]))
1752 break;
1753 else if (i == 2 && dump_file)
1754 fprintf (dump_file, "There are no candidates for optimization.\n");
1755
1756 for (unsigned i = 0; i <= 2; ++i)
1757 while (!bitmap_empty_p (&candidates[i]))
1758 {
1759 unsigned uid = bitmap_first_set_bit (&candidates[i]);
1760 scalar_chain *chain;
1761
1762 if (cand_mode[i] == TImode)
1763 chain = new timode_scalar_chain;
1764 else
1765 chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
1766
1767 /* Find instructions chain we want to convert to vector mode.
1768 Check all uses and definitions to estimate all required
1769 conversions. */
1770 chain->build (&candidates[i], uid);
1771
1772 if (chain->compute_convert_gain () > 0)
1773 converted_insns += chain->convert ();
1774 else
1775 if (dump_file)
1776 fprintf (dump_file, "Chain #%d conversion is not profitable\n",
1777 chain->chain_id);
1778
1779 rtx_insn* iter_insn;
1780 unsigned int ii;
1781 FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
1782 control_flow_insns.safe_push (iter_insn);
1783
1784 delete chain;
1785 }
1786
1787 if (dump_file)
1788 fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
1789
1790 for (unsigned i = 0; i <= 2; ++i)
1791 bitmap_release (&candidates[i]);
1792 bitmap_obstack_release (NULL);
1793 df_process_deferred_rescans ();
1794
1795 /* Conversion means we may have 128bit register spills/fills
1796 which require aligned stack. */
1797 if (converted_insns)
1798 {
1799 if (crtl->stack_alignment_needed < 128)
1800 crtl->stack_alignment_needed = 128;
1801 if (crtl->stack_alignment_estimated < 128)
1802 crtl->stack_alignment_estimated = 128;
1803
1804 crtl->stack_realign_needed
1805 = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
1806 crtl->stack_realign_tried = crtl->stack_realign_needed;
1807
1808 crtl->stack_realign_processed = true;
1809
1810 if (!crtl->drap_reg)
1811 {
1812 rtx drap_rtx = targetm.calls.get_drap_rtx ();
1813
1814 /* stack_realign_drap and drap_rtx must match. */
1815 gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
1816
1817 /* Do nothing if NULL is returned,
1818 which means DRAP is not needed. */
1819 if (drap_rtx != NULL)
1820 {
1821 crtl->args.internal_arg_pointer = drap_rtx;
1822
1823 /* Call fixup_tail_calls to clean up
1824 REG_EQUIV note if DRAP is needed. */
1825 fixup_tail_calls ();
1826 }
1827 }
1828
1829 /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
1830 if (TARGET_64BIT)
1831 for (tree parm = DECL_ARGUMENTS (current_function_decl);
1832 parm; parm = DECL_CHAIN (parm))
1833 {
1834 if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
1835 continue;
1836 if (DECL_RTL_SET_P (parm)
1837 && GET_MODE (DECL_RTL (parm)) == V1TImode)
1838 {
1839 rtx r = DECL_RTL (parm);
1840 if (REG_P (r))
1841 SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
1842 }
1843 if (DECL_INCOMING_RTL (parm)
1844 && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
1845 {
1846 rtx r = DECL_INCOMING_RTL (parm);
1847 if (REG_P (r))
1848 DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
1849 }
1850 }
1851
1852 if (!control_flow_insns.is_empty ())
1853 {
1854 free_dominance_info (CDI_DOMINATORS);
1855
1856 unsigned int i;
1857 rtx_insn* insn;
1858 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
1859 if (control_flow_insn_p (insn))
1860 {
1861 /* Split the block after insn. There will be a fallthru
1862 edge, which is OK so we keep it. We have to create
1863 the exception edges ourselves. */
1864 bb = BLOCK_FOR_INSN (insn);
1865 split_block (bb, insn);
1866 rtl_make_eh_edge (NULL, bb, BB_END (bb));
1867 }
1868 }
1869 }
1870
1871 return 0;
1872 }
1873
1874 static unsigned int
1875 rest_of_handle_insert_vzeroupper (void)
1876 {
1877 /* vzeroupper instructions are inserted immediately after reload to
1878 account for possible spills from 256bit or 512bit registers. The pass
1879 reuses mode switching infrastructure by re-running mode insertion
1880 pass, so disable entities that have already been processed. */
1881 for (int i = 0; i < MAX_386_ENTITIES; i++)
1882 ix86_optimize_mode_switching[i] = 0;
1883
1884 ix86_optimize_mode_switching[AVX_U128] = 1;
1885
1886 /* Call optimize_mode_switching. */
1887 g->get_passes ()->execute_pass_mode_switching ();
1888
1889 df_analyze ();
1890 return 0;
1891 }
1892
1893 namespace {
1894
1895 const pass_data pass_data_insert_vzeroupper =
1896 {
1897 RTL_PASS, /* type */
1898 "vzeroupper", /* name */
1899 OPTGROUP_NONE, /* optinfo_flags */
1900 TV_MACH_DEP, /* tv_id */
1901 0, /* properties_required */
1902 0, /* properties_provided */
1903 0, /* properties_destroyed */
1904 0, /* todo_flags_start */
1905 TODO_df_finish, /* todo_flags_finish */
1906 };
1907
1908 class pass_insert_vzeroupper : public rtl_opt_pass
1909 {
1910 public:
1911 pass_insert_vzeroupper(gcc::context *ctxt)
1912 : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
1913 {}
1914
1915 /* opt_pass methods: */
1916 virtual bool gate (function *)
1917 {
1918 return TARGET_AVX && TARGET_VZEROUPPER;
1919 }
1920
1921 virtual unsigned int execute (function *)
1922 {
1923 return rest_of_handle_insert_vzeroupper ();
1924 }
1925
1926 }; // class pass_insert_vzeroupper
1927
1928 const pass_data pass_data_stv =
1929 {
1930 RTL_PASS, /* type */
1931 "stv", /* name */
1932 OPTGROUP_NONE, /* optinfo_flags */
1933 TV_MACH_DEP, /* tv_id */
1934 0, /* properties_required */
1935 0, /* properties_provided */
1936 0, /* properties_destroyed */
1937 0, /* todo_flags_start */
1938 TODO_df_finish, /* todo_flags_finish */
1939 };
1940
1941 class pass_stv : public rtl_opt_pass
1942 {
1943 public:
1944 pass_stv (gcc::context *ctxt)
1945 : rtl_opt_pass (pass_data_stv, ctxt),
1946 timode_p (false)
1947 {}
1948
1949 /* opt_pass methods: */
1950 virtual bool gate (function *)
1951 {
1952 return ((!timode_p || TARGET_64BIT)
1953 && TARGET_STV && TARGET_SSE2 && optimize > 1);
1954 }
1955
1956 virtual unsigned int execute (function *)
1957 {
1958 return convert_scalars_to_vector (timode_p);
1959 }
1960
1961 opt_pass *clone ()
1962 {
1963 return new pass_stv (m_ctxt);
1964 }
1965
1966 void set_pass_param (unsigned int n, bool param)
1967 {
1968 gcc_assert (n == 0);
1969 timode_p = param;
1970 }
1971
1972 private:
1973 bool timode_p;
1974 }; // class pass_stv
1975
1976 } // anon namespace
1977
1978 rtl_opt_pass *
1979 make_pass_insert_vzeroupper (gcc::context *ctxt)
1980 {
1981 return new pass_insert_vzeroupper (ctxt);
1982 }
1983
1984 rtl_opt_pass *
1985 make_pass_stv (gcc::context *ctxt)
1986 {
1987 return new pass_stv (ctxt);
1988 }
1989
1990 /* Inserting ENDBR and pseudo patchable-area instructions. */
1991
1992 static void
1993 rest_of_insert_endbr_and_patchable_area (bool need_endbr,
1994 unsigned int patchable_area_size)
1995 {
1996 rtx endbr;
1997 rtx_insn *insn;
1998 rtx_insn *endbr_insn = NULL;
1999 basic_block bb;
2000
2001 if (need_endbr)
2002 {
2003 /* Currently emit ENDBR if this is a tracked function, i.e. 'nocf_check'
2004 is absent from the function attributes. Later an optimization will
2005 be introduced to analyze whether the address of a static function
2006 is taken. A static function whose address is not taken will get
2007 a nocf_check attribute. This will allow the number of ENDBR
2008 instructions to be reduced. */
2009 if (!lookup_attribute ("nocf_check",
2010 TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2011 && (!flag_manual_endbr
2012 || lookup_attribute ("cf_check",
2013 DECL_ATTRIBUTES (cfun->decl)))
2014 && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
2015 || ix86_cmodel == CM_LARGE
2016 || ix86_cmodel == CM_LARGE_PIC
2017 || flag_force_indirect_call
2018 || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2019 && DECL_DLLIMPORT_P (cfun->decl))))
2020 {
2021 if (crtl->profile && flag_fentry)
2022 {
2023 /* Queue ENDBR insertion to x86_function_profiler.
2024 NB: Any patchable-area insn will be inserted after
2025 ENDBR. */
2026 cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
2027 }
2028 else
2029 {
2030 endbr = gen_nop_endbr ();
2031 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2032 rtx_insn *insn = BB_HEAD (bb);
2033 endbr_insn = emit_insn_before (endbr, insn);
2034 }
2035 }
2036 }
2037
2038 if (patchable_area_size)
2039 {
2040 if (crtl->profile && flag_fentry)
2041 {
2042 /* Queue patchable-area insertion to x86_function_profiler.
2043 NB: If there is a queued ENDBR, x86_function_profiler
2044 will also handle patchable-area. */
2045 if (!cfun->machine->insn_queued_at_entrance)
2046 cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
2047 }
2048 else
2049 {
2050 rtx patchable_area
2051 = gen_patchable_area (GEN_INT (patchable_area_size),
2052 GEN_INT (crtl->patch_area_entry == 0));
2053 if (endbr_insn)
2054 emit_insn_after (patchable_area, endbr_insn);
2055 else
2056 {
2057 bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2058 insn = BB_HEAD (bb);
2059 emit_insn_before (patchable_area, insn);
2060 }
2061 }
2062 }
2063
2064 if (!need_endbr)
2065 return;
2066
2067 bb = 0;
2068 FOR_EACH_BB_FN (bb, cfun)
2069 {
2070 for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2071 insn = NEXT_INSN (insn))
2072 {
2073 if (CALL_P (insn))
2074 {
2075 need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2076 if (!need_endbr && !SIBLING_CALL_P (insn))
2077 {
2078 rtx call = get_call_rtx_from (insn);
2079 rtx fnaddr = XEXP (call, 0);
2080 tree fndecl = NULL_TREE;
2081
2082 /* Also generate ENDBRANCH for non-tail call which
2083 may return via indirect branch. */
2084 if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2085 fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2086 if (fndecl == NULL_TREE)
2087 fndecl = MEM_EXPR (fnaddr);
2088 if (fndecl
2089 && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2090 && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2091 fndecl = NULL_TREE;
2092 if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2093 {
2094 tree fntype = TREE_TYPE (fndecl);
2095 if (lookup_attribute ("indirect_return",
2096 TYPE_ATTRIBUTES (fntype)))
2097 need_endbr = true;
2098 }
2099 }
2100 if (!need_endbr)
2101 continue;
2102 /* Generate ENDBRANCH after a CALL which may return more than
2103 once, i.e. setjmp-like functions. */
2104
2105 endbr = gen_nop_endbr ();
2106 emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
2107 continue;
2108 }
2109
2110 if (JUMP_P (insn) && flag_cet_switch)
2111 {
2112 rtx target = JUMP_LABEL (insn);
2113 if (target == NULL_RTX || ANY_RETURN_P (target))
2114 continue;
2115
2116 /* Check the jump is a switch table. */
2117 rtx_insn *label = as_a<rtx_insn *> (target);
2118 rtx_insn *table = next_insn (label);
2119 if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2120 continue;
2121
2122 /* For the indirect jump, find all the places it can jump to and
2123 insert an ENDBRANCH there. This is only done under a special
2124 flag (flag_cet_switch) that controls ENDBRANCH generation for switch stmts. */
2125 edge_iterator ei;
2126 edge e;
2127 basic_block dest_blk;
2128
2129 FOR_EACH_EDGE (e, ei, bb->succs)
2130 {
2131 rtx_insn *insn;
2132
2133 dest_blk = e->dest;
2134 insn = BB_HEAD (dest_blk);
2135 gcc_assert (LABEL_P (insn));
2136 endbr = gen_nop_endbr ();
2137 emit_insn_after (endbr, insn);
2138 }
2139 continue;
2140 }
2141
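 /* Explanatory note: labels that must be preserved (e.g. because their
    address is taken, as with a computed goto) may be reached by an
    indirect jump, so they get an ENDBR as well.  */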
2142 if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2143 {
2144 endbr = gen_nop_endbr ();
2145 emit_insn_after (endbr, insn);
2146 continue;
2147 }
2148 }
2149 }
2150
2151 return;
2152 }
2153
2154 namespace {
2155
2156 const pass_data pass_data_insert_endbr_and_patchable_area =
2157 {
2158 RTL_PASS, /* type. */
2159 "endbr_and_patchable_area", /* name. */
2160 OPTGROUP_NONE, /* optinfo_flags. */
2161 TV_MACH_DEP, /* tv_id. */
2162 0, /* properties_required. */
2163 0, /* properties_provided. */
2164 0, /* properties_destroyed. */
2165 0, /* todo_flags_start. */
2166 0, /* todo_flags_finish. */
2167 };
2168
2169 class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2170 {
2171 public:
2172 pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2173 : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2174 {}
2175
2176 /* opt_pass methods: */
2177 virtual bool gate (function *)
2178 {
2179 need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
2180 patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
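 /* Note: only the patchable area that follows the function label is
    inserted by this pass; the crtl->patch_area_entry bytes before the
    label are presumably emitted when the function label is output.  */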
2181 return need_endbr || patchable_area_size;
2182 }
2183
2184 virtual unsigned int execute (function *)
2185 {
2186 timevar_push (TV_MACH_DEP);
2187 rest_of_insert_endbr_and_patchable_area (need_endbr,
2188 patchable_area_size);
2189 timevar_pop (TV_MACH_DEP);
2190 return 0;
2191 }
2192
2193 private:
2194 bool need_endbr;
2195 unsigned int patchable_area_size;
2196 }; // class pass_insert_endbr_and_patchable_area
2197
2198 } // anon namespace
2199
2200 rtl_opt_pass *
2201 make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2202 {
2203 return new pass_insert_endbr_and_patchable_area (ctxt);
2204 }
2205
2206 /* At entry of the nearest common dominator for basic blocks with
2207 conversions/rcp/sqrt/rsqrt/round, generate a single
2208 vxorps %xmmN, %xmmN, %xmmN
2209 for all
2210 vcvtss2sd op, %xmmN, %xmmX
2211 vcvtsd2ss op, %xmmN, %xmmX
2212 vcvtsi2ss op, %xmmN, %xmmX
2213 vcvtsi2sd op, %xmmN, %xmmX
2214
2215 NB: We want to generate only a single vxorps to cover the whole
2216 function. The LCM algorithm isn't appropriate here since it may
2217 place a vxorps inside the loop. */
2218
2219 static unsigned int
2220 remove_partial_avx_dependency (void)
2221 {
2222 timevar_push (TV_MACH_DEP);
2223
2224 bitmap_obstack_initialize (NULL);
2225 bitmap convert_bbs = BITMAP_ALLOC (NULL);
2226
2227 basic_block bb;
2228 rtx_insn *insn, *set_insn;
2229 rtx set;
2230 rtx v4sf_const0 = NULL_RTX;
2231
2232 auto_vec<rtx_insn *> control_flow_insns;
2233
2234 /* We create invalid RTL initially so defer rescans. */
2235 df_set_flags (DF_DEFER_INSN_RESCAN);
2236
2237 FOR_EACH_BB_FN (bb, cfun)
2238 {
2239 FOR_BB_INSNS (bb, insn)
2240 {
2241 if (!NONDEBUG_INSN_P (insn))
2242 continue;
2243
2244 set = single_set (insn);
2245 if (!set)
2246 continue;
2247
2248 if (get_attr_avx_partial_xmm_update (insn)
2249 != AVX_PARTIAL_XMM_UPDATE_TRUE)
2250 continue;
2251
2252 /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
2253 SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
2254 round, to vec_dup and vec_merge with subreg. */
2255 rtx src = SET_SRC (set);
2256 rtx dest = SET_DEST (set);
2257 machine_mode dest_mode = GET_MODE (dest);
2258 bool convert_p = false;
2259 switch (GET_CODE (src))
2260 {
2261 case FLOAT:
2262 case FLOAT_EXTEND:
2263 case FLOAT_TRUNCATE:
2264 case UNSIGNED_FLOAT:
2265 convert_p = true;
2266 break;
2267 default:
2268 break;
2269 }
2270
2271 /* Only handle conversions here. */
2272 machine_mode src_mode
2273 = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
2274 switch (src_mode)
2275 {
2276 case E_SFmode:
2277 case E_DFmode:
2278 if (TARGET_USE_VECTOR_FP_CONVERTS
2279 || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
2280 continue;
2281 break;
2282 case E_SImode:
2283 case E_DImode:
2284 if (TARGET_USE_VECTOR_CONVERTS
2285 || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
2286 continue;
2287 break;
2288 case E_VOIDmode:
2289 gcc_assert (!convert_p);
2290 break;
2291 default:
2292 gcc_unreachable ();
2293 }
2294
2295 if (!v4sf_const0)
2296 v4sf_const0 = gen_reg_rtx (V4SFmode);
2297
2298 rtx zero;
2299 machine_mode dest_vecmode;
2300 switch (dest_mode)
2301 {
2302 case E_HFmode:
2303 dest_vecmode = V8HFmode;
2304 zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
2305 break;
2306 case E_SFmode:
2307 dest_vecmode = V4SFmode;
2308 zero = v4sf_const0;
2309 break;
2310 case E_DFmode:
2311 dest_vecmode = V2DFmode;
2312 zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
2313 break;
2314 default:
2315 gcc_unreachable ();
2316 }
2317
2318 /* Change source to vector mode. */
2319 src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
2320 src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
2321 GEN_INT (HOST_WIDE_INT_1U));
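 /* With a merge mask of 1, element 0 of the result comes from the
    duplicated scalar source and the remaining elements come from the
    zeroed register, so the new insn writes the whole vector register.  */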
2322 /* Change destination to vector mode. */
2323 rtx vec = gen_reg_rtx (dest_vecmode);
2324 /* Generate an XMM vector SET. */
2325 set = gen_rtx_SET (vec, src);
2326 set_insn = emit_insn_before (set, insn);
2327 df_insn_rescan (set_insn);
2328
2329 if (cfun->can_throw_non_call_exceptions)
2330 {
2331 /* Handle REG_EH_REGION note. */
2332 rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
2333 if (note)
2334 {
2335 control_flow_insns.safe_push (set_insn);
2336 add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
2337 }
2338 }
2339
2340 src = gen_rtx_SUBREG (dest_mode, vec, 0);
2341 set = gen_rtx_SET (dest, src);
2342
2343 /* Drop possible dead definitions. */
2344 PATTERN (insn) = set;
2345
2346 INSN_CODE (insn) = -1;
2347 recog_memoized (insn);
2348 df_insn_rescan (insn);
2349 bitmap_set_bit (convert_bbs, bb->index);
2350 }
2351 }
2352
2353 if (v4sf_const0)
2354 {
2355 /* (Re-)discover loops so that bb->loop_father can be used in the
2356 analysis below. */
2357 calculate_dominance_info (CDI_DOMINATORS);
2358 loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
2359
2360 /* Generate a vxorps at the entry of the nearest common dominator
2361 of the basic blocks with conversions, hoisted into the fake loop
2362 that contains the whole function, so that there is only a single
2363 vxorps in the whole function. */
2364 bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
2365 convert_bbs);
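 /* The fake outermost loop that contains the whole function has the
    exit block as its latch, so this walk stops as soon as BB is no
    longer inside any real loop.  */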
2366 while (bb->loop_father->latch
2367 != EXIT_BLOCK_PTR_FOR_FN (cfun))
2368 bb = get_immediate_dominator (CDI_DOMINATORS,
2369 bb->loop_father->header);
2370
2371 set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
2372
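 /* Emit the zeroing set before the first non-debug insn of BB, or at
    the end of BB if it contains only notes and debug insns.  */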
2373 insn = BB_HEAD (bb);
2374 while (insn && !NONDEBUG_INSN_P (insn))
2375 {
2376 if (insn == BB_END (bb))
2377 {
2378 insn = NULL;
2379 break;
2380 }
2381 insn = NEXT_INSN (insn);
2382 }
2383 if (insn == BB_HEAD (bb))
2384 set_insn = emit_insn_before (set, insn);
2385 else
2386 set_insn = emit_insn_after (set,
2387 insn ? PREV_INSN (insn) : BB_END (bb));
2388 df_insn_rescan (set_insn);
2389 loop_optimizer_finalize ();
2390
2391 if (!control_flow_insns.is_empty ())
2392 {
2393 free_dominance_info (CDI_DOMINATORS);
2394
2395 unsigned int i;
2396 FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2397 if (control_flow_insn_p (insn))
2398 {
2399 /* Split the block after insn. There will be a fallthru
2400 edge, which is OK so we keep it. We have to create
2401 the exception edges ourselves. */
2402 bb = BLOCK_FOR_INSN (insn);
2403 split_block (bb, insn);
2404 rtl_make_eh_edge (NULL, bb, BB_END (bb));
2405 }
2406 }
2407 }
2408
2409 df_process_deferred_rescans ();
2410 df_clear_flags (DF_DEFER_INSN_RESCAN);
2411 bitmap_obstack_release (NULL);
2412 BITMAP_FREE (convert_bbs);
2413
2414 timevar_pop (TV_MACH_DEP);
2415 return 0;
2416 }
2417
2418 namespace {
2419
2420 const pass_data pass_data_remove_partial_avx_dependency =
2421 {
2422 RTL_PASS, /* type */
2423 "rpad", /* name */
2424 OPTGROUP_NONE, /* optinfo_flags */
2425 TV_MACH_DEP, /* tv_id */
2426 0, /* properties_required */
2427 0, /* properties_provided */
2428 0, /* properties_destroyed */
2429 0, /* todo_flags_start */
2430 0, /* todo_flags_finish */
2431 };
2432
2433 class pass_remove_partial_avx_dependency : public rtl_opt_pass
2434 {
2435 public:
2436 pass_remove_partial_avx_dependency (gcc::context *ctxt)
2437 : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
2438 {}
2439
2440 /* opt_pass methods: */
2441 virtual bool gate (function *)
2442 {
2443 return (TARGET_AVX
2444 && TARGET_SSE_PARTIAL_REG_DEPENDENCY
2445 && TARGET_SSE_MATH
2446 && optimize
2447 && optimize_function_for_speed_p (cfun));
2448 }
2449
2450 virtual unsigned int execute (function *)
2451 {
2452 return remove_partial_avx_dependency ();
2453 }
2454 }; // class pass_remove_partial_avx_dependency
2455
2456 } // anon namespace
2457
2458 rtl_opt_pass *
2459 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
2460 {
2461 return new pass_remove_partial_avx_dependency (ctxt);
2462 }
2463
2464 /* This compares the priority of target features in functions DECL1
2465 and DECL2. It returns a positive value if DECL1 has higher priority,
2466 a negative value if DECL2 has higher priority, and 0 if they are the
2467 same. */
2468
2469 int
2470 ix86_compare_version_priority (tree decl1, tree decl2)
2471 {
2472 unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
2473 unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
2474
2475 return (int)priority1 - (int)priority2;
2476 }
2477
2478 /* V1 and V2 point to function versions with different priorities
2479 based on the target ISA. This function compares their priorities. */
2480
2481 static int
2482 feature_compare (const void *v1, const void *v2)
2483 {
2484 typedef struct _function_version_info
2485 {
2486 tree version_decl;
2487 tree predicate_chain;
2488 unsigned int dispatch_priority;
2489 } function_version_info;
2490
2491 const function_version_info c1 = *(const function_version_info *)v1;
2492 const function_version_info c2 = *(const function_version_info *)v2;
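 /* Higher priority sorts first: qsort with this comparator yields
    descending dispatch_priority order.  */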
2493 return (c2.dispatch_priority - c1.dispatch_priority);
2494 }
2495
2496 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
2497 to return a pointer to VERSION_DECL if the outcome of the expression
2498 formed by PREDICATE_CHAIN is true. This function will be called during
2499 version dispatch to decide which function version to execute. It returns
2500 the basic block at the end, to which more conditions can be added. */
2501
2502 static basic_block
2503 add_condition_to_bb (tree function_decl, tree version_decl,
2504 tree predicate_chain, basic_block new_bb)
2505 {
2506 gimple *return_stmt;
2507 tree convert_expr, result_var;
2508 gimple *convert_stmt;
2509 gimple *call_cond_stmt;
2510 gimple *if_else_stmt;
2511
2512 basic_block bb1, bb2, bb3;
2513 edge e12, e23;
2514
2515 tree cond_var, and_expr_var = NULL_TREE;
2516 gimple_seq gseq;
2517
2518 tree predicate_decl, predicate_arg;
2519
2520 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
2521
2522 gcc_assert (new_bb != NULL);
2523 gseq = bb_seq (new_bb);
2524
2525
2526 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
2527 build_fold_addr_expr (version_decl));
2528 result_var = create_tmp_var (ptr_type_node);
2529 convert_stmt = gimple_build_assign (result_var, convert_expr);
2530 return_stmt = gimple_build_return (result_var);
2531
2532 if (predicate_chain == NULL_TREE)
2533 {
2534 gimple_seq_add_stmt (&gseq, convert_stmt);
2535 gimple_seq_add_stmt (&gseq, return_stmt);
2536 set_bb_seq (new_bb, gseq);
2537 gimple_set_bb (convert_stmt, new_bb);
2538 gimple_set_bb (return_stmt, new_bb);
2539 pop_cfun ();
2540 return new_bb;
2541 }
2542
2543 while (predicate_chain != NULL)
2544 {
2545 cond_var = create_tmp_var (integer_type_node);
2546 predicate_decl = TREE_PURPOSE (predicate_chain);
2547 predicate_arg = TREE_VALUE (predicate_chain);
2548 call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
2549 gimple_call_set_lhs (call_cond_stmt, cond_var);
2550
2551 gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
2552 gimple_set_bb (call_cond_stmt, new_bb);
2553 gimple_seq_add_stmt (&gseq, call_cond_stmt);
2554
2555 predicate_chain = TREE_CHAIN (predicate_chain);
2556
2557 if (and_expr_var == NULL)
2558 and_expr_var = cond_var;
2559 else
2560 {
2561 gimple *assign_stmt;
2562 /* Use MIN_EXPR to check whether any predicate result is zero:
2563 and_expr_var = MIN_EXPR <cond_var, and_expr_var> stays nonzero only if both are. */
2564 assign_stmt = gimple_build_assign (and_expr_var,
2565 build2 (MIN_EXPR, integer_type_node,
2566 cond_var, and_expr_var));
2567
2568 gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
2569 gimple_set_bb (assign_stmt, new_bb);
2570 gimple_seq_add_stmt (&gseq, assign_stmt);
2571 }
2572 }
2573
2574 if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
2575 integer_zero_node,
2576 NULL_TREE, NULL_TREE);
2577 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
2578 gimple_set_bb (if_else_stmt, new_bb);
2579 gimple_seq_add_stmt (&gseq, if_else_stmt);
2580
2581 gimple_seq_add_stmt (&gseq, convert_stmt);
2582 gimple_seq_add_stmt (&gseq, return_stmt);
2583 set_bb_seq (new_bb, gseq);
2584
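 /* CFG surgery below: BB1 ends with the predicate test; its true edge
    falls into BB2, which returns this version's address and then goes
    to the exit block, while its false edge goes to BB3, where the test
    for the next version will be added.  */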
2585 bb1 = new_bb;
2586 e12 = split_block (bb1, if_else_stmt);
2587 bb2 = e12->dest;
2588 e12->flags &= ~EDGE_FALLTHRU;
2589 e12->flags |= EDGE_TRUE_VALUE;
2590
2591 e23 = split_block (bb2, return_stmt);
2592
2593 gimple_set_bb (convert_stmt, bb2);
2594 gimple_set_bb (return_stmt, bb2);
2595
2596 bb3 = e23->dest;
2597 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
2598
2599 remove_edge (e23);
2600 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
2601
2602 pop_cfun ();
2603
2604 return bb3;
2605 }
2606
2607 /* This function generates the dispatch function for
2608 multi-versioned functions. DISPATCH_DECL is the function which will
2609 contain the dispatch logic. FNDECLS are the function choices for
2610 dispatch, passed as a vector of trees. EMPTY_BB is the basic block
2611 pointer in DISPATCH_DECL in which the dispatch code is generated. */
2612
2613 static int
2614 dispatch_function_versions (tree dispatch_decl,
2615 void *fndecls_p,
2616 basic_block *empty_bb)
2617 {
2618 tree default_decl;
2619 gimple *ifunc_cpu_init_stmt;
2620 gimple_seq gseq;
2621 int ix;
2622 tree ele;
2623 vec<tree> *fndecls;
2624 unsigned int num_versions = 0;
2625 unsigned int actual_versions = 0;
2626 unsigned int i;
2627
2628 struct _function_version_info
2629 {
2630 tree version_decl;
2631 tree predicate_chain;
2632 unsigned int dispatch_priority;
2633 }*function_version_info;
2634
2635 gcc_assert (dispatch_decl != NULL
2636 && fndecls_p != NULL
2637 && empty_bb != NULL);
2638
2639 /* fndecls_p is actually a vector. */
2640 fndecls = static_cast<vec<tree> *> (fndecls_p);
2641
2642 /* At least one more version other than the default. */
2643 num_versions = fndecls->length ();
2644 gcc_assert (num_versions >= 2);
2645
2646 function_version_info = (struct _function_version_info *)
2647 XNEWVEC (struct _function_version_info, (num_versions - 1));
2648
2649 /* The first version in the vector is the default decl. */
2650 default_decl = (*fndecls)[0];
2651
2652 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
2653
2654 gseq = bb_seq (*empty_bb);
2655 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
2656 constructors, so explicitly call __builtin_cpu_init here. */
2657 ifunc_cpu_init_stmt
2658 = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
2659 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
2660 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
2661 set_bb_seq (*empty_bb, gseq);
2662
2663 pop_cfun ();
2664
2665
2666 for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
2667 {
2668 tree version_decl = ele;
2669 tree predicate_chain = NULL_TREE;
2670 unsigned int priority;
2671 /* Get attribute string, parse it and find the right predicate decl.
2672 The predicate function could be a lengthy combination of many
2673 features, like arch-type and various isa-variants. */
2674 priority = get_builtin_code_for_version (version_decl,
2675 &predicate_chain);
2676
2677 if (predicate_chain == NULL_TREE)
2678 continue;
2679
2680 function_version_info [actual_versions].version_decl = version_decl;
2681 function_version_info [actual_versions].predicate_chain
2682 = predicate_chain;
2683 function_version_info [actual_versions].dispatch_priority = priority;
2684 actual_versions++;
2685 }
2686
2687 /* Sort the versions according to descending order of dispatch priority. The
2688 priority is based on the ISA. This is not a perfect solution. There
2689 could still be ambiguity. If more than one function version is suitable
2690 to execute, which one should be dispatched? In future, allow the user
2691 to specify a dispatch priority next to the version. */
2692 qsort (function_version_info, actual_versions,
2693 sizeof (struct _function_version_info), feature_compare);
2694
2695 for (i = 0; i < actual_versions; ++i)
2696 *empty_bb = add_condition_to_bb (dispatch_decl,
2697 function_version_info[i].version_decl,
2698 function_version_info[i].predicate_chain,
2699 *empty_bb);
2700
2701 /* dispatch default version at the end. */
2702 *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
2703 NULL, *empty_bb);
2704
2705 free (function_version_info);
2706 return 0;
2707 }
2708
2709 /* This function changes the assembler name for functions that are
2710 versions. If DECL is a function version and has a "target"
2711 attribute, it appends the attribute string to its assembler name. */
2712
2713 static tree
2714 ix86_mangle_function_version_assembler_name (tree decl, tree id)
2715 {
2716 tree version_attr;
2717 const char *orig_name, *version_string;
2718 char *attr_str, *assembler_name;
2719
2720 if (DECL_DECLARED_INLINE_P (decl)
2721 && lookup_attribute ("gnu_inline",
2722 DECL_ATTRIBUTES (decl)))
2723 error_at (DECL_SOURCE_LOCATION (decl),
2724 "function versions cannot be marked as %<gnu_inline%>,"
2725 " bodies have to be generated");
2726
2727 if (DECL_VIRTUAL_P (decl)
2728 || DECL_VINDEX (decl))
2729 sorry ("virtual function multiversioning not supported");
2730
2731 version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
2732
2733 /* target attribute string cannot be NULL. */
2734 gcc_assert (version_attr != NULL_TREE);
2735
2736 orig_name = IDENTIFIER_POINTER (id);
2737 version_string
2738 = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
2739
2740 if (strcmp (version_string, "default") == 0)
2741 return id;
2742
2743 attr_str = sorted_attr_string (TREE_VALUE (version_attr));
2744 assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
2745
2746 sprintf (assembler_name, "%s.%s", orig_name, attr_str);
2747
2748 /* Allow assembler name to be modified if already set. */
2749 if (DECL_ASSEMBLER_NAME_SET_P (decl))
2750 SET_DECL_RTL (decl, NULL);
2751
2752 tree ret = get_identifier (assembler_name);
2753 XDELETEVEC (attr_str);
2754 XDELETEVEC (assembler_name);
2755 return ret;
2756 }
2757
2758 tree
2759 ix86_mangle_decl_assembler_name (tree decl, tree id)
2760 {
2761 /* For function version, add the target suffix to the assembler name. */
2762 if (TREE_CODE (decl) == FUNCTION_DECL
2763 && DECL_FUNCTION_VERSIONED (decl))
2764 id = ix86_mangle_function_version_assembler_name (decl, id);
2765 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
2766 id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
2767 #endif
2768
2769 return id;
2770 }
2771
2772 /* Make a dispatcher declaration for the multi-versioned function DECL.
2773 Calls to DECL function will be replaced with calls to the dispatcher
2774 by the front-end. Returns the decl of the dispatcher function. */
2775
2776 tree
2777 ix86_get_function_versions_dispatcher (void *decl)
2778 {
2779 tree fn = (tree) decl;
2780 struct cgraph_node *node = NULL;
2781 struct cgraph_node *default_node = NULL;
2782 struct cgraph_function_version_info *node_v = NULL;
2783 struct cgraph_function_version_info *first_v = NULL;
2784
2785 tree dispatch_decl = NULL;
2786
2787 struct cgraph_function_version_info *default_version_info = NULL;
2788
2789 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
2790
2791 node = cgraph_node::get (fn);
2792 gcc_assert (node != NULL);
2793
2794 node_v = node->function_version ();
2795 gcc_assert (node_v != NULL);
2796
2797 if (node_v->dispatcher_resolver != NULL)
2798 return node_v->dispatcher_resolver;
2799
2800 /* Find the default version and make it the first node. */
2801 first_v = node_v;
2802 /* Go to the beginning of the chain. */
2803 while (first_v->prev != NULL)
2804 first_v = first_v->prev;
2805 default_version_info = first_v;
2806 while (default_version_info != NULL)
2807 {
2808 if (is_function_default_version
2809 (default_version_info->this_node->decl))
2810 break;
2811 default_version_info = default_version_info->next;
2812 }
2813
2814 /* If there is no default node, just return NULL. */
2815 if (default_version_info == NULL)
2816 return NULL;
2817
2818 /* Make default info the first node. */
2819 if (first_v != default_version_info)
2820 {
2821 default_version_info->prev->next = default_version_info->next;
2822 if (default_version_info->next)
2823 default_version_info->next->prev = default_version_info->prev;
2824 first_v->prev = default_version_info;
2825 default_version_info->next = first_v;
2826 default_version_info->prev = NULL;
2827 }
2828
2829 default_node = default_version_info->this_node;
2830
2831 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
2832 if (targetm.has_ifunc_p ())
2833 {
2834 struct cgraph_function_version_info *it_v = NULL;
2835 struct cgraph_node *dispatcher_node = NULL;
2836 struct cgraph_function_version_info *dispatcher_version_info = NULL;
2837
2838 /* Right now, the dispatching is done via ifunc. */
2839 dispatch_decl = make_dispatcher_decl (default_node->decl);
2840
2841 dispatcher_node = cgraph_node::get_create (dispatch_decl);
2842 gcc_assert (dispatcher_node != NULL);
2843 dispatcher_node->dispatcher_function = 1;
2844 dispatcher_version_info
2845 = dispatcher_node->insert_new_function_version ();
2846 dispatcher_version_info->next = default_version_info;
2847 dispatcher_node->definition = 1;
2848
2849 /* Set the dispatcher for all the versions. */
2850 it_v = default_version_info;
2851 while (it_v != NULL)
2852 {
2853 it_v->dispatcher_resolver = dispatch_decl;
2854 it_v = it_v->next;
2855 }
2856 }
2857 else
2858 #endif
2859 {
2860 error_at (DECL_SOURCE_LOCATION (default_node->decl),
2861 "multiversioning needs %<ifunc%> which is not supported "
2862 "on this target");
2863 }
2864
2865 return dispatch_decl;
2866 }
2867
2868 /* Make the resolver function decl to dispatch the versions of
2869 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
2870 ifunc alias that will point to the created resolver. Create an
2871 empty basic block in the resolver and store the pointer in
2872 EMPTY_BB. Return the decl of the resolver function. */
2873
2874 static tree
2875 make_resolver_func (const tree default_decl,
2876 const tree ifunc_alias_decl,
2877 basic_block *empty_bb)
2878 {
2879 tree decl, type, t;
2880
2881 /* Create resolver function name based on default_decl. */
2882 tree decl_name = clone_function_name (default_decl, "resolver");
2883 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
2884
2885 /* The resolver function should return a (void *). */
2886 type = build_function_type_list (ptr_type_node, NULL_TREE);
2887
2888 decl = build_fn_decl (resolver_name, type);
2889 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
2890
2891 DECL_NAME (decl) = decl_name;
2892 TREE_USED (decl) = 1;
2893 DECL_ARTIFICIAL (decl) = 1;
2894 DECL_IGNORED_P (decl) = 1;
2895 TREE_PUBLIC (decl) = 0;
2896 DECL_UNINLINABLE (decl) = 1;
2897
2898 /* Resolver is not external, body is generated. */
2899 DECL_EXTERNAL (decl) = 0;
2900 DECL_EXTERNAL (ifunc_alias_decl) = 0;
2901
2902 DECL_CONTEXT (decl) = NULL_TREE;
2903 DECL_INITIAL (decl) = make_node (BLOCK);
2904 DECL_STATIC_CONSTRUCTOR (decl) = 0;
2905
2906 if (DECL_COMDAT_GROUP (default_decl)
2907 || TREE_PUBLIC (default_decl))
2908 {
2909 /* In this case, each translation unit with a call to this
2910 versioned function will put out a resolver. Ensure it
2911 is comdat to keep just one copy. */
2912 DECL_COMDAT (decl) = 1;
2913 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
2914 }
2915 else
2916 TREE_PUBLIC (ifunc_alias_decl) = 0;
2917
2918 /* Build result decl and add to function_decl. */
2919 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
2920 DECL_CONTEXT (t) = decl;
2921 DECL_ARTIFICIAL (t) = 1;
2922 DECL_IGNORED_P (t) = 1;
2923 DECL_RESULT (decl) = t;
2924
2925 gimplify_function_tree (decl);
2926 push_cfun (DECL_STRUCT_FUNCTION (decl));
2927 *empty_bb = init_lowered_empty_function (decl, false,
2928 profile_count::uninitialized ());
2929
2930 cgraph_node::add_new_function (decl, true);
2931 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
2932
2933 pop_cfun ();
2934
2935 gcc_assert (ifunc_alias_decl != NULL);
2936 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
2937 DECL_ATTRIBUTES (ifunc_alias_decl)
2938 = make_attribute ("ifunc", resolver_name,
2939 DECL_ATTRIBUTES (ifunc_alias_decl));
2940
2941 /* Create the alias for dispatch to resolver here. */
2942 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
2943 return decl;
2944 }
2945
2946 /* Generate the dispatching code body to dispatch multi-versioned function
2947 DECL. The target hook is called to process the "target" attributes and
2948 provide the code to dispatch the right function at run-time. NODE points
2949 to the dispatcher decl whose body will be created. */
2950
2951 tree
2952 ix86_generate_version_dispatcher_body (void *node_p)
2953 {
2954 tree resolver_decl;
2955 basic_block empty_bb;
2956 tree default_ver_decl;
2957 struct cgraph_node *versn;
2958 struct cgraph_node *node;
2959
2960 struct cgraph_function_version_info *node_version_info = NULL;
2961 struct cgraph_function_version_info *versn_info = NULL;
2962
2963 node = (cgraph_node *)node_p;
2964
2965 node_version_info = node->function_version ();
2966 gcc_assert (node->dispatcher_function
2967 && node_version_info != NULL);
2968
2969 if (node_version_info->dispatcher_resolver)
2970 return node_version_info->dispatcher_resolver;
2971
2972 /* The first version in the chain corresponds to the default version. */
2973 default_ver_decl = node_version_info->next->this_node->decl;
2974
2975 /* node is going to be an alias, so remove the finalized bit. */
2976 node->definition = false;
2977
2978 resolver_decl = make_resolver_func (default_ver_decl,
2979 node->decl, &empty_bb);
2980
2981 node_version_info->dispatcher_resolver = resolver_decl;
2982
2983 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
2984
2985 auto_vec<tree, 2> fn_ver_vec;
2986
2987 for (versn_info = node_version_info->next; versn_info;
2988 versn_info = versn_info->next)
2989 {
2990 versn = versn_info->this_node;
2991 /* Check for virtual functions here again, as by this time it should
2992 have been determined if this function needs a vtable index or
2993 not. This happens for methods in derived classes that override
2994 virtual methods in base classes but are not explicitly marked as
2995 virtual. */
2996 if (DECL_VINDEX (versn->decl))
2997 sorry ("virtual function multiversioning not supported");
2998
2999 fn_ver_vec.safe_push (versn->decl);
3000 }
3001
3002 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
3003 cgraph_edge::rebuild_edges ();
3004 pop_cfun ();
3005 return resolver_decl;
3006 }
3007
3008
3009