1 /* Copyright (C) 1988-2022 Free Software Foundation, Inc.
2 
3 This file is part of GCC.
4 
5 GCC is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3, or (at your option)
8 any later version.
9 
10 GCC is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License
16 along with GCC; see the file COPYING3.  If not see
17 <http://www.gnu.org/licenses/>.  */
18 
19 #define IN_TARGET_CODE 1
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "memmodel.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "expmed.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "cgraph.h"
40 #include "diagnostic.h"
41 #include "cfgbuild.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "attribs.h"
45 #include "calls.h"
46 #include "stor-layout.h"
47 #include "varasm.h"
48 #include "output.h"
49 #include "insn-attr.h"
50 #include "flags.h"
51 #include "except.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "cfgrtl.h"
55 #include "common/common-target.h"
56 #include "langhooks.h"
57 #include "reload.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "tm-constrs.h"
61 #include "cselib.h"
62 #include "sched-int.h"
63 #include "opts.h"
64 #include "tree-pass.h"
65 #include "context.h"
66 #include "pass_manager.h"
67 #include "target-globals.h"
68 #include "gimple-iterator.h"
69 #include "tree-vectorizer.h"
70 #include "shrink-wrap.h"
71 #include "builtins.h"
72 #include "rtl-iter.h"
73 #include "tree-iterator.h"
74 #include "dbgcnt.h"
75 #include "case-cfn-macros.h"
76 #include "dojump.h"
77 #include "fold-const-call.h"
78 #include "tree-vrp.h"
79 #include "tree-ssanames.h"
80 #include "selftest.h"
81 #include "selftest-rtl.h"
82 #include "print-rtl.h"
83 #include "intl.h"
84 #include "ifcvt.h"
85 #include "symbol-summary.h"
86 #include "ipa-prop.h"
87 #include "ipa-fnsummary.h"
88 #include "wide-int-bitmask.h"
89 #include "tree-vector-builder.h"
90 #include "debug.h"
91 #include "dwarf2out.h"
92 #include "i386-builtins.h"
93 #include "i386-features.h"
94 
95 const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
96   "savms64",
97   "resms64",
98   "resms64x",
99   "savms64f",
100   "resms64f",
101   "resms64fx"
102 };
103 
104 const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
105 /* The below offset values are where each register is stored for the layout
106    relative to incoming stack pointer.  The value of each m_regs[].offset will
107    be relative to the incoming base pointer (rax or rsi) used by the stub.
108 
109     s_instances:   0		1		2		3
110     Offset:					realigned or	aligned + 8
111     Register	   aligned	aligned + 8	aligned w/HFP	w/HFP	*/
112     XMM15_REG,	/* 0x10		0x18		0x10		0x18	*/
113     XMM14_REG,	/* 0x20		0x28		0x20		0x28	*/
114     XMM13_REG,	/* 0x30		0x38		0x30		0x38	*/
115     XMM12_REG,	/* 0x40		0x48		0x40		0x48	*/
116     XMM11_REG,	/* 0x50		0x58		0x50		0x58	*/
117     XMM10_REG,	/* 0x60		0x68		0x60		0x68	*/
118     XMM9_REG,	/* 0x70		0x78		0x70		0x78	*/
119     XMM8_REG,	/* 0x80		0x88		0x80		0x88	*/
120     XMM7_REG,	/* 0x90		0x98		0x90		0x98	*/
121     XMM6_REG,	/* 0xa0		0xa8		0xa0		0xa8	*/
122     SI_REG,	/* 0xa8		0xb0		0xa8		0xb0	*/
123     DI_REG,	/* 0xb0		0xb8		0xb0		0xb8	*/
124     BX_REG,	/* 0xb8		0xc0		0xb8		0xc0	*/
125     BP_REG,	/* 0xc0		0xc8		N/A		N/A	*/
126     R12_REG,	/* 0xc8		0xd0		0xc0		0xc8	*/
127     R13_REG,	/* 0xd0		0xd8		0xc8		0xd0	*/
128     R14_REG,	/* 0xd8		0xe0		0xd0		0xd8	*/
129     R15_REG,	/* 0xe0		0xe8		0xd8		0xe0	*/
130 };
131 
132 /* Instantiate static const values.  */
133 const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
134 const unsigned xlogue_layout::MIN_REGS;
135 const unsigned xlogue_layout::MAX_REGS;
136 const unsigned xlogue_layout::MAX_EXTRA_REGS;
137 const unsigned xlogue_layout::VARIANT_COUNT;
138 const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
139 
140 /* Initialize xlogue_layout::s_stub_names to zero.  */
141 char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
142 				[STUB_NAME_MAX_LEN];
143 
144 /* Instantiate all xlogue_layout instances.  */
145 const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
146   xlogue_layout (0, false),
147   xlogue_layout (8, false),
148   xlogue_layout (0, true),
149   xlogue_layout (8, true)
150 };
151 
152 /* Return an appropriate const instance of xlogue_layout based upon values
153    in cfun->machine and crtl.  */
154 const class xlogue_layout &
155 xlogue_layout::get_instance ()
156 {
157   enum xlogue_stub_sets stub_set;
158   bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
159 
160   if (stack_realign_fp)
161     stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
162   else if (frame_pointer_needed)
163     stub_set = aligned_plus_8
164 	      ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
165 	      : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
166   else
167     stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
168 
169   return s_instances[stub_set];
170 }
171 
172 /* Determine how many clobbered registers can be saved by the stub.
173    Returns the count of registers the stub will save and restore.  */
174 unsigned
175 xlogue_layout::count_stub_managed_regs ()
176 {
177   bool hfp = frame_pointer_needed || stack_realign_fp;
178   unsigned i, count;
179   unsigned regno;
180 
181   for (count = i = MIN_REGS; i < MAX_REGS; ++i)
182     {
183       regno = REG_ORDER[i];
184       if (regno == BP_REG && hfp)
185 	continue;
186       if (!ix86_save_reg (regno, false, false))
187 	break;
188       ++count;
189     }
190   return count;
191 }
192 
193 /* Determine if register REGNO is a stub managed register given the
194    total COUNT of stub managed registers.  */
195 bool
196 xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
197 {
198   bool hfp = frame_pointer_needed || stack_realign_fp;
199   unsigned i;
200 
201   for (i = 0; i < count; ++i)
202     {
203       gcc_assert (i < MAX_REGS);
204       if (REG_ORDER[i] == BP_REG && hfp)
205 	++count;
206       else if (REG_ORDER[i] == regno)
207 	return true;
208     }
209   return false;
210 }
211 
212 /* Constructor for xlogue_layout.  */
213 xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
214   : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
215     m_stack_align_off_in (stack_align_off_in)
216 {
217   HOST_WIDE_INT offset = stack_align_off_in;
218   unsigned i, j;
219 
220   for (i = j = 0; i < MAX_REGS; ++i)
221     {
222       unsigned regno = REG_ORDER[i];
223 
224       if (regno == BP_REG && hfp)
225 	continue;
226       if (SSE_REGNO_P (regno))
227 	{
228 	  offset += 16;
229 	  /* Verify that SSE regs are always aligned.  */
230 	  gcc_assert (!((stack_align_off_in + offset) & 15));
231 	}
232       else
233 	offset += 8;
234 
235       m_regs[j].regno    = regno;
236       m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
237     }
238   gcc_assert (j == m_nregs);
239 }
240 
241 const char *
242 xlogue_layout::get_stub_name (enum xlogue_stub stub,
243 			      unsigned n_extra_regs)
244 {
245   const int have_avx = TARGET_AVX;
246   char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
247 
248   /* Lazy init */
249   if (!*name)
250     {
251       int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
252 			  (have_avx ? "avx" : "sse"),
253 			  STUB_BASE_NAMES[stub],
254 			  MIN_REGS + n_extra_regs);
255       gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
256     }
257 
258   return name;
259 }
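/* For illustration, the name filled into the buffer above always has the
   shape "__<isa>_<base>_<count>", e.g. "__avx_savms64_12" when AVX is
   enabled and MIN_REGS + n_extra_regs equals 12; the non-AVX variant of
   the same stub would be named "__sse_savms64_12".  */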
260 
261 /* Return rtx of a symbol ref for the entry point (based upon
262    cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
263 rtx
264 xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
265 {
266   const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
267   gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
268   gcc_assert (stub < XLOGUE_STUB_COUNT);
269   gcc_assert (crtl->stack_realign_finalized);
270 
271   return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
272 }
273 
274 unsigned scalar_chain::max_id = 0;
275 
276 namespace {
277 
278 /* Initialize new chain.  */
279 
280 scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
281 {
282   smode = smode_;
283   vmode = vmode_;
284 
285   chain_id = ++max_id;
286 
287   if (dump_file)
288     fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
289 
290   bitmap_obstack_initialize (NULL);
291   insns = BITMAP_ALLOC (NULL);
292   defs = BITMAP_ALLOC (NULL);
293   defs_conv = BITMAP_ALLOC (NULL);
294   queue = NULL;
295 }
296 
297 /* Free chain's data.  */
298 
299 scalar_chain::~scalar_chain ()
300 {
301   BITMAP_FREE (insns);
302   BITMAP_FREE (defs);
303   BITMAP_FREE (defs_conv);
304   bitmap_obstack_release (NULL);
305 }
306 
307 /* Add an instruction into the chain's queue.  */
308 
309 void
310 scalar_chain::add_to_queue (unsigned insn_uid)
311 {
312   if (bitmap_bit_p (insns, insn_uid)
313       || bitmap_bit_p (queue, insn_uid))
314     return;
315 
316   if (dump_file)
317     fprintf (dump_file, "  Adding insn %d into chain's #%d queue\n",
318 	     insn_uid, chain_id);
319   bitmap_set_bit (queue, insn_uid);
320 }
321 
322 general_scalar_chain::general_scalar_chain (enum machine_mode smode_,
323 					    enum machine_mode vmode_)
324      : scalar_chain (smode_, vmode_)
325 {
326   insns_conv = BITMAP_ALLOC (NULL);
327   n_sse_to_integer = 0;
328   n_integer_to_sse = 0;
329 }
330 
331 general_scalar_chain::~general_scalar_chain ()
332 {
333   BITMAP_FREE (insns_conv);
334 }
335 
336 /* For DImode conversion, mark register defined by DEF as requiring
337    conversion.  */
338 
339 void
340 general_scalar_chain::mark_dual_mode_def (df_ref def)
341 {
342   gcc_assert (DF_REF_REG_DEF_P (def));
343 
344   /* Record the def/insn pair so we can later efficiently iterate over
345      the defs to convert on insns not in the chain.  */
346   bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
347   if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
348     {
349       if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
350 	  && !reg_new)
351 	return;
352       n_integer_to_sse++;
353     }
354   else
355     {
356       if (!reg_new)
357 	return;
358       n_sse_to_integer++;
359     }
360 
361   if (dump_file)
362     fprintf (dump_file,
363 	     "  Mark r%d def in insn %d as requiring both modes in chain #%d\n",
364 	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
365 }
366 
367 /* For TImode conversion, it is unused.  */
368 
369 void
370 timode_scalar_chain::mark_dual_mode_def (df_ref)
371 {
372   gcc_unreachable ();
373 }
374 
375 /* Check REF's chain to add new insns into a queue
376    and find registers requiring conversion.  */
377 
378 void
379 scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref)
380 {
381   df_link *chain;
382 
383   gcc_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref))
384 	      || bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)));
385   add_to_queue (DF_REF_INSN_UID (ref));
386 
387   for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
388     {
389       unsigned uid = DF_REF_INSN_UID (chain->ref);
390 
391       if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
392 	continue;
393 
394       if (!DF_REF_REG_MEM_P (chain->ref))
395 	{
396 	  if (bitmap_bit_p (insns, uid))
397 	    continue;
398 
399 	  if (bitmap_bit_p (candidates, uid))
400 	    {
401 	      add_to_queue (uid);
402 	      continue;
403 	    }
404 	}
405 
406       if (DF_REF_REG_DEF_P (chain->ref))
407 	{
408 	  if (dump_file)
409 	    fprintf (dump_file, "  r%d def in insn %d isn't convertible\n",
410 		     DF_REF_REGNO (chain->ref), uid);
411 	  mark_dual_mode_def (chain->ref);
412 	}
413       else
414 	{
415 	  if (dump_file)
416 	    fprintf (dump_file, "  r%d use in insn %d isn't convertible\n",
417 		     DF_REF_REGNO (chain->ref), uid);
418 	  mark_dual_mode_def (ref);
419 	}
420     }
421 }
422 
423 /* Add instruction into a chain.  */
424 
425 void
426 scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid)
427 {
428   if (bitmap_bit_p (insns, insn_uid))
429     return;
430 
431   if (dump_file)
432     fprintf (dump_file, "  Adding insn %d to chain #%d\n", insn_uid, chain_id);
433 
434   bitmap_set_bit (insns, insn_uid);
435 
436   rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
437   rtx def_set = single_set (insn);
438   if (def_set && REG_P (SET_DEST (def_set))
439       && !HARD_REGISTER_P (SET_DEST (def_set)))
440     bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
441 
442   /* ???  The following is quadratic since analyze_register_chain
443      iterates over all refs to look for dual-mode regs.  Instead this
444      should be done separately for all regs mentioned in the chain once.  */
445   df_ref ref;
446   for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
447     if (!HARD_REGISTER_P (DF_REF_REG (ref)))
448       analyze_register_chain (candidates, ref);
449   for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
450     if (!DF_REF_REG_MEM_P (ref))
451       analyze_register_chain (candidates, ref);
452 }
453 
454 /* Build new chain starting from insn INSN_UID recursively
455    adding all dependent uses and definitions.  */
456 
457 void
458 scalar_chain::build (bitmap candidates, unsigned insn_uid)
459 {
460   queue = BITMAP_ALLOC (NULL);
461   bitmap_set_bit (queue, insn_uid);
462 
463   if (dump_file)
464     fprintf (dump_file, "Building chain #%d...\n", chain_id);
465 
466   while (!bitmap_empty_p (queue))
467     {
468       insn_uid = bitmap_first_set_bit (queue);
469       bitmap_clear_bit (queue, insn_uid);
470       bitmap_clear_bit (candidates, insn_uid);
471       add_insn (candidates, insn_uid);
472     }
473 
474   if (dump_file)
475     {
476       fprintf (dump_file, "Collected chain #%d...\n", chain_id);
477       fprintf (dump_file, "  insns: ");
478       dump_bitmap (dump_file, insns);
479       if (!bitmap_empty_p (defs_conv))
480 	{
481 	  bitmap_iterator bi;
482 	  unsigned id;
483 	  const char *comma = "";
484 	  fprintf (dump_file, "  defs to convert: ");
485 	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
486 	    {
487 	      fprintf (dump_file, "%sr%d", comma, id);
488 	      comma = ", ";
489 	    }
490 	  fprintf (dump_file, "\n");
491 	}
492     }
493 
494   BITMAP_FREE (queue);
495 }
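/* For example, starting from a single candidate addition, the loop above
   pops that insn, add_insn records it and walks its def-use chains, and
   any other candidate insns that define its operands or consume its result
   are pushed back onto the queue; the chain thus grows to the transitive
   closure of related candidates before the queue drains.  */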
496 
497 /* Return the cost of building a vector constant
498    instead of using a scalar one.  */
499 
500 int
501 general_scalar_chain::vector_const_cost (rtx exp)
502 {
503   gcc_assert (CONST_INT_P (exp));
504 
505   if (standard_sse_constant_p (exp, vmode))
506     return ix86_cost->sse_op;
507   /* We have separate costs for SImode and DImode, use SImode costs
508      for smaller modes.  */
509   return ix86_cost->sse_load[smode == DImode ? 1 : 0];
510 }
511 
512 /* Compute a gain for chain conversion.  */
513 
514 int
515 general_scalar_chain::compute_convert_gain ()
516 {
517   bitmap_iterator bi;
518   unsigned insn_uid;
519   int gain = 0;
520   int cost = 0;
521 
522   if (dump_file)
523     fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
524 
525   /* SSE costs distinguish between SImode and DImode loads/stores, for
526      int costs factor in the number of GPRs involved.  When supporting
527      smaller modes than SImode the int load/store costs need to be
528      adjusted as well.  */
529   unsigned sse_cost_idx = smode == DImode ? 1 : 0;
530   unsigned m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
531 
532   EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
533     {
534       rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
535       rtx def_set = single_set (insn);
536       rtx src = SET_SRC (def_set);
537       rtx dst = SET_DEST (def_set);
538       int igain = 0;
539 
540       if (REG_P (src) && REG_P (dst))
541 	igain += 2 * m - ix86_cost->xmm_move;
542       else if (REG_P (src) && MEM_P (dst))
543 	igain
544 	  += m * ix86_cost->int_store[2] - ix86_cost->sse_store[sse_cost_idx];
545       else if (MEM_P (src) && REG_P (dst))
546 	igain += m * ix86_cost->int_load[2] - ix86_cost->sse_load[sse_cost_idx];
547       else
548 	switch (GET_CODE (src))
549 	  {
550 	  case ASHIFT:
551 	  case ASHIFTRT:
552 	  case LSHIFTRT:
553 	    if (m == 2)
554 	      {
555 		if (INTVAL (XEXP (src, 1)) >= 32)
556 		  igain += ix86_cost->add;
557 		else
558 		  igain += ix86_cost->shift_const;
559 	      }
560 
561 	    igain += ix86_cost->shift_const - ix86_cost->sse_op;
562 
563 	    if (CONST_INT_P (XEXP (src, 0)))
564 	      igain -= vector_const_cost (XEXP (src, 0));
565 	    break;
566 
567 	  case AND:
568 	  case IOR:
569 	  case XOR:
570 	  case PLUS:
571 	  case MINUS:
572 	    igain += m * ix86_cost->add - ix86_cost->sse_op;
573 	    /* Additional gain for andnot for targets without BMI.  */
574 	    if (GET_CODE (XEXP (src, 0)) == NOT
575 		&& !TARGET_BMI)
576 	      igain += m * ix86_cost->add;
577 
578 	    if (CONST_INT_P (XEXP (src, 0)))
579 	      igain -= vector_const_cost (XEXP (src, 0));
580 	    if (CONST_INT_P (XEXP (src, 1)))
581 	      igain -= vector_const_cost (XEXP (src, 1));
582 	    break;
583 
584 	  case NEG:
585 	  case NOT:
586 	    igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
587 
588 	    if (GET_CODE (XEXP (src, 0)) != ABS)
589 	      {
590 		igain += m * ix86_cost->add;
591 		break;
592 	      }
593 	    /* FALLTHRU */
594 
595 	  case ABS:
596 	  case SMAX:
597 	  case SMIN:
598 	  case UMAX:
599 	  case UMIN:
600 	    /* We do not have any conditional move cost, estimate it as a
601 	       reg-reg move.  Comparisons are costed as adds.  */
602 	    igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
603 	    /* Integer SSE ops are all costed the same.  */
604 	    igain -= ix86_cost->sse_op;
605 	    break;
606 
607 	  case COMPARE:
608 	    /* Assume comparison cost is the same.  */
609 	    break;
610 
611 	  case CONST_INT:
612 	    if (REG_P (dst))
613 	      {
614 		if (optimize_insn_for_size_p ())
615 		  {
616 		    /* xor (2 bytes) vs. xorps (3 bytes).  */
617 		    if (src == const0_rtx)
618 		      igain -= COSTS_N_BYTES (1);
619 		    /* movdi_internal vs. movv2di_internal.  */
620 		    /* => mov (5 bytes) vs. movaps (7 bytes).  */
621 		    else if (x86_64_immediate_operand (src, SImode))
622 		      igain -= COSTS_N_BYTES (2);
623 		    else
624 		      /* ??? Larger immediate constants are placed in the
625 			 constant pool, where the size benefit/impact of
626 			 STV conversion is affected by whether and how
627 			 often each constant pool entry is shared/reused.
628 			 The value below is empirically derived from the
629 			 CSiBE benchmark (and the optimal value may drift
630 			 over time).  */
631 		      igain += COSTS_N_BYTES (0);
632 		  }
633 		else
634 		  {
635 		    /* DImode can be immediate for TARGET_64BIT
636 		       and SImode always.  */
637 		    igain += m * COSTS_N_INSNS (1);
638 		    igain -= vector_const_cost (src);
639 		  }
640 	      }
641 	    else if (MEM_P (dst))
642 	      {
643 		igain += (m * ix86_cost->int_store[2]
644 			  - ix86_cost->sse_store[sse_cost_idx]);
645 		igain -= vector_const_cost (src);
646 	      }
647 	    break;
648 
649 	  default:
650 	    gcc_unreachable ();
651 	  }
652 
653       if (igain != 0 && dump_file)
654 	{
655 	  fprintf (dump_file, "  Instruction gain %d for ", igain);
656 	  dump_insn_slim (dump_file, insn);
657 	}
658       gain += igain;
659     }
660 
661   if (dump_file)
662     fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
663 
664   /* Cost the integer to sse and sse to integer moves.  */
665   cost += n_sse_to_integer * ix86_cost->sse_to_integer;
666   /* ???  integer_to_sse but we only have that in the RA cost table.
667      Assume sse_to_integer/integer_to_sse are the same which they
668      are at the moment.  */
669   cost += n_integer_to_sse * ix86_cost->sse_to_integer;
670 
671   if (dump_file)
672     fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
673 
674   gain -= cost;
675 
676   if (dump_file)
677     fprintf (dump_file, "  Total gain: %d\n", gain);
678 
679   return gain;
680 }
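/* A rough example of the accounting above, in symbolic costs: a DImode
   chain on a 32-bit target (m == 2) whose only insn is a reg-reg addition
   gets an instruction gain of 2 * add - sse_op; if converting it also
   requires one GPR->XMM and one XMM->GPR copy at the chain boundaries,
   two sse_to_integer costs are subtracted, and the caller converts the
   chain only when the resulting total is positive.  */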
681 
682 /* Insert generated conversion instruction sequence INSNS
683    after instruction AFTER.  New BB may be required in case
684    instruction has EH region attached.  */
685 
686 void
687 scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
688 {
689   if (!control_flow_insn_p (after))
690     {
691       emit_insn_after (insns, after);
692       return;
693     }
694 
695   basic_block bb = BLOCK_FOR_INSN (after);
696   edge e = find_fallthru_edge (bb->succs);
697   gcc_assert (e);
698 
699   basic_block new_bb = split_edge (e);
700   emit_insn_after (insns, BB_HEAD (new_bb));
701 }
702 
703 } // anon namespace
704 
705 /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
706    zeroing the upper parts.  */
707 
708 static rtx
709 gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
710 {
711   switch (GET_MODE_NUNITS (vmode))
712     {
713     case 1:
714       /* We are not using this case currently.  */
715       gcc_unreachable ();
716     case 2:
717       return gen_rtx_VEC_CONCAT (vmode, gpr,
718 				 CONST0_RTX (GET_MODE_INNER (vmode)));
719     default:
720       return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
721 				CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
722     }
723 }
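/* For example, with vmode == V2DImode this returns
     (vec_concat:V2DI (reg:DI gpr) (const_int 0))
   and with vmode == V4SImode it returns
     (vec_merge:V4SI (vec_duplicate:V4SI (reg:SI gpr))
                     (const_vector:V4SI [0 ...]) (const_int 1))
   both of which place GPR in the low element and zero the rest.  */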
724 
725 /* Make vector copies for all register REGNO definitions
726    and replace their uses in the chain.  */
727 
728 void
729 general_scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
730 {
731   rtx vreg = *defs_map.get (reg);
732 
733   start_sequence ();
734   if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
735     {
736       rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
737       if (smode == DImode && !TARGET_64BIT)
738 	{
739 	  emit_move_insn (adjust_address (tmp, SImode, 0),
740 			  gen_rtx_SUBREG (SImode, reg, 0));
741 	  emit_move_insn (adjust_address (tmp, SImode, 4),
742 			  gen_rtx_SUBREG (SImode, reg, 4));
743 	}
744       else
745 	emit_move_insn (copy_rtx (tmp), reg);
746       emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
747 			      gen_gpr_to_xmm_move_src (vmode, tmp)));
748     }
749   else if (!TARGET_64BIT && smode == DImode)
750     {
751       if (TARGET_SSE4_1)
752 	{
753 	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
754 				      CONST0_RTX (V4SImode),
755 				      gen_rtx_SUBREG (SImode, reg, 0)));
756 	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
757 					gen_rtx_SUBREG (V4SImode, vreg, 0),
758 					gen_rtx_SUBREG (SImode, reg, 4),
759 					GEN_INT (2)));
760 	}
761       else
762 	{
763 	  rtx tmp = gen_reg_rtx (DImode);
764 	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
765 				      CONST0_RTX (V4SImode),
766 				      gen_rtx_SUBREG (SImode, reg, 0)));
767 	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
768 				      CONST0_RTX (V4SImode),
769 				      gen_rtx_SUBREG (SImode, reg, 4)));
770 	  emit_insn (gen_vec_interleave_lowv4si
771 		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
772 		      gen_rtx_SUBREG (V4SImode, vreg, 0),
773 		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
774 	}
775     }
776   else
777     emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
778 			    gen_gpr_to_xmm_move_src (vmode, reg)));
779   rtx_insn *seq = get_insns ();
780   end_sequence ();
781   emit_conversion_insns (seq, insn);
782 
783   if (dump_file)
784     fprintf (dump_file,
785 	     "  Copied r%d to a vector register r%d for insn %d\n",
786 	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
787 }
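/* When GPR->XMM moves are allowed, the 32-bit DImode sequences built above
   roughly correspond to a movd of the low half followed by a pinsrd of the
   high half (SSE4.1), or to two movd loads combined with punpckldq without
   SSE4.1; when inter-unit moves are disabled the value is instead bounced
   through a stack slot.  */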
788 
789 /* Copy the definition SRC of INSN inside the chain to DST for
790    scalar uses outside of the chain.  */
791 
792 void
793 general_scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
794 {
795   start_sequence ();
796   if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
797     {
798       rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
799       emit_move_insn (tmp, src);
800       if (!TARGET_64BIT && smode == DImode)
801 	{
802 	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
803 			  adjust_address (tmp, SImode, 0));
804 	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
805 			  adjust_address (tmp, SImode, 4));
806 	}
807       else
808 	emit_move_insn (dst, copy_rtx (tmp));
809     }
810   else if (!TARGET_64BIT && smode == DImode)
811     {
812       if (TARGET_SSE4_1)
813 	{
814 	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
815 				      gen_rtvec (1, const0_rtx));
816 	  emit_insn
817 	      (gen_rtx_SET
818 	       (gen_rtx_SUBREG (SImode, dst, 0),
819 		gen_rtx_VEC_SELECT (SImode,
820 				    gen_rtx_SUBREG (V4SImode, src, 0),
821 				    tmp)));
822 
823 	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
824 	  emit_insn
825 	      (gen_rtx_SET
826 	       (gen_rtx_SUBREG (SImode, dst, 4),
827 		gen_rtx_VEC_SELECT (SImode,
828 				    gen_rtx_SUBREG (V4SImode, src, 0),
829 				    tmp)));
830 	}
831       else
832 	{
833 	  rtx vcopy = gen_reg_rtx (V2DImode);
834 	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
835 	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
836 			  gen_rtx_SUBREG (SImode, vcopy, 0));
837 	  emit_move_insn (vcopy,
838 			  gen_rtx_LSHIFTRT (V2DImode,
839 					    vcopy, GEN_INT (32)));
840 	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
841 			  gen_rtx_SUBREG (SImode, vcopy, 0));
842 	}
843     }
844   else
845     emit_move_insn (dst, src);
846 
847   rtx_insn *seq = get_insns ();
848   end_sequence ();
849   emit_conversion_insns (seq, insn);
850 
851   if (dump_file)
852     fprintf (dump_file,
853 	     "  Copied r%d to a scalar register r%d for insn %d\n",
854 	     REGNO (src), REGNO (dst), INSN_UID (insn));
855 }
856 
857 /* Convert operand OP in INSN.  We should handle
858    memory operands and uninitialized registers.
859    All other register uses are converted during
860    registers conversion.  */
861 
862 void
863 general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
864 {
865   *op = copy_rtx_if_shared (*op);
866 
867   if (GET_CODE (*op) == NOT)
868     {
869       convert_op (&XEXP (*op, 0), insn);
870       PUT_MODE (*op, vmode);
871     }
872   else if (MEM_P (*op))
873     {
874       rtx_insn* eh_insn, *movabs = NULL;
875       rtx tmp = gen_reg_rtx (GET_MODE (*op));
876 
877       /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
878       if (!memory_operand (*op, GET_MODE (*op)))
879 	{
880 	  rtx tmp2 = gen_reg_rtx (GET_MODE (*op));
881 	  movabs = emit_insn_before (gen_rtx_SET (tmp2, *op), insn);
882 
883 	  *op = tmp2;
884 	}
885 
886       eh_insn
887 	= emit_insn_before (gen_rtx_SET (gen_rtx_SUBREG (vmode, tmp, 0),
888 					 gen_gpr_to_xmm_move_src (vmode, *op)),
889 			    insn);
890 
891       if (cfun->can_throw_non_call_exceptions)
892 	{
893 	  /* Handle REG_EH_REGION note.  */
894 	  rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
895 	  if (note)
896 	    {
897 	      if (movabs)
898 		eh_insn = movabs;
899 	      control_flow_insns.safe_push (eh_insn);
900 	      add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
901 	    }
902 	}
903 
904       *op = gen_rtx_SUBREG (vmode, tmp, 0);
905 
906       if (dump_file)
907 	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
908 		 INSN_UID (insn), REGNO (tmp));
909     }
910   else if (REG_P (*op))
911     {
912       *op = gen_rtx_SUBREG (vmode, *op, 0);
913     }
914   else if (CONST_INT_P (*op))
915     {
916       rtx vec_cst;
917       rtx tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
918 
919       /* Prefer all ones vector in case of -1.  */
920       if (constm1_operand (*op, GET_MODE (*op)))
921 	vec_cst = CONSTM1_RTX (vmode);
922       else
923 	{
924 	  unsigned n = GET_MODE_NUNITS (vmode);
925 	  rtx *v = XALLOCAVEC (rtx, n);
926 	  v[0] = *op;
927 	  for (unsigned i = 1; i < n; ++i)
928 	    v[i] = const0_rtx;
929 	  vec_cst = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
930 	}
931 
932       if (!standard_sse_constant_p (vec_cst, vmode))
933 	{
934 	  start_sequence ();
935 	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
936 	  rtx_insn *seq = get_insns ();
937 	  end_sequence ();
938 	  emit_insn_before (seq, insn);
939 	}
940 
941       emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
942       *op = tmp;
943     }
944   else
945     {
946       gcc_assert (SUBREG_P (*op));
947       gcc_assert (GET_MODE (*op) == vmode);
948     }
949 }
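/* For instance, a DImode constant 5 handled here becomes the V2DImode
   vector {5, 0}, which is not a standard SSE constant and is therefore
   loaded from the constant pool, whereas constant -1 is rewritten directly
   into the all-ones vector.  */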
950 
951 /* Convert INSN to vector mode.  */
952 
953 void
954 general_scalar_chain::convert_insn (rtx_insn *insn)
955 {
956   /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
957   for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
958     if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
959       {
960 	df_link *use;
961 	for (use = DF_REF_CHAIN (ref); use; use = use->next)
962 	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
963 	      && (DF_REF_REG_MEM_P (use->ref)
964 		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
965 	    break;
966 	if (use)
967 	  convert_reg (insn, DF_REF_REG (ref),
968 		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
969 	else if (MAY_HAVE_DEBUG_BIND_INSNS)
970 	  {
971 	    /* If we generated a scalar copy we can leave debug-insns
972 	       as-is, if not, we have to adjust them.  */
973 	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
974 	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
975 	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
976 		{
977 		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
978 		  /* If there's a reaching definition outside of the
979 		     chain we have to reset.  */
980 		  df_link *def;
981 		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
982 		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
983 		      break;
984 		  if (def)
985 		    to_reset_debug_insns.safe_push (debug_insn);
986 		  else
987 		    {
988 		      *DF_REF_REAL_LOC (use->ref)
989 			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
990 		      df_insn_rescan (debug_insn);
991 		    }
992 		}
993 	    /* Have to do the reset outside of the DF_CHAIN walk to not
994 	       disrupt it.  */
995 	    while (!to_reset_debug_insns.is_empty ())
996 	      {
997 		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
998 		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
999 		df_insn_rescan_debug_internal (debug_insn);
1000 	      }
1001 	  }
1002       }
1003 
1004   /* Replace uses in this insn with the defs we use in the chain.  */
1005   for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1006     if (!DF_REF_REG_MEM_P (ref))
1007       if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1008 	{
1009 	  /* Also update a corresponding REG_DEAD note.  */
1010 	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1011 	  if (note)
1012 	    XEXP (note, 0) = *vreg;
1013 	  *DF_REF_REAL_LOC (ref) = *vreg;
1014 	}
1015 
1016   rtx def_set = single_set (insn);
1017   rtx src = SET_SRC (def_set);
1018   rtx dst = SET_DEST (def_set);
1019   rtx subreg;
1020 
1021   if (MEM_P (dst) && !REG_P (src))
1022     {
1023       /* There are no scalar integer instructions and therefore
1024 	 temporary register usage is required.  */
1025       rtx tmp = gen_reg_rtx (smode);
1026       emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1027       dst = gen_rtx_SUBREG (vmode, tmp, 0);
1028     }
1029   else if (REG_P (dst))
1030     {
1031       /* Replace the definition with a SUBREG to the definition we
1032          use inside the chain.  */
1033       rtx *vdef = defs_map.get (dst);
1034       if (vdef)
1035 	dst = *vdef;
1036       dst = gen_rtx_SUBREG (vmode, dst, 0);
1037       /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1038          is a non-REG_P.  So kill those off.  */
1039       rtx note = find_reg_equal_equiv_note (insn);
1040       if (note)
1041 	remove_note (insn, note);
1042     }
1043 
1044   switch (GET_CODE (src))
1045     {
1046     case PLUS:
1047     case MINUS:
1048     case IOR:
1049     case XOR:
1050     case AND:
1051     case SMAX:
1052     case SMIN:
1053     case UMAX:
1054     case UMIN:
1055       convert_op (&XEXP (src, 1), insn);
1056       /* FALLTHRU */
1057 
1058     case ABS:
1059     case ASHIFT:
1060     case ASHIFTRT:
1061     case LSHIFTRT:
1062       convert_op (&XEXP (src, 0), insn);
1063       PUT_MODE (src, vmode);
1064       break;
1065 
1066     case NEG:
1067       src = XEXP (src, 0);
1068 
1069       if (GET_CODE (src) == ABS)
1070 	{
1071 	  src = XEXP (src, 0);
1072 	  convert_op (&src, insn);
1073 	  subreg = gen_reg_rtx (vmode);
1074 	  emit_insn_before (gen_rtx_SET (subreg,
1075 					 gen_rtx_ABS (vmode, src)), insn);
1076 	  src = subreg;
1077 	}
1078       else
1079 	convert_op (&src, insn);
1080 
1081       subreg = gen_reg_rtx (vmode);
1082       emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1083       src = gen_rtx_MINUS (vmode, subreg, src);
1084       break;
1085 
1086     case NOT:
1087       src = XEXP (src, 0);
1088       convert_op (&src, insn);
1089       subreg = gen_reg_rtx (vmode);
1090       emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1091       src = gen_rtx_XOR (vmode, src, subreg);
1092       break;
1093 
1094     case MEM:
1095       if (!REG_P (dst))
1096 	convert_op (&src, insn);
1097       break;
1098 
1099     case REG:
1100       if (!MEM_P (dst))
1101 	convert_op (&src, insn);
1102       break;
1103 
1104     case SUBREG:
1105       gcc_assert (GET_MODE (src) == vmode);
1106       break;
1107 
1108     case COMPARE:
1109       src = SUBREG_REG (XEXP (XEXP (src, 0), 0));
1110 
1111       gcc_assert (REG_P (src) && GET_MODE (src) == DImode);
1112       subreg = gen_rtx_SUBREG (V2DImode, src, 0);
1113       emit_insn_before (gen_vec_interleave_lowv2di
1114 			(copy_rtx_if_shared (subreg),
1115 			 copy_rtx_if_shared (subreg),
1116 			 copy_rtx_if_shared (subreg)),
1117 			insn);
1118       dst = gen_rtx_REG (CCmode, FLAGS_REG);
1119       src = gen_rtx_UNSPEC (CCmode, gen_rtvec (2, copy_rtx_if_shared (subreg),
1120 					       copy_rtx_if_shared (subreg)),
1121 			    UNSPEC_PTEST);
1122       break;
1123 
1124     case CONST_INT:
1125       convert_op (&src, insn);
1126       break;
1127 
1128     default:
1129       gcc_unreachable ();
1130     }
1131 
1132   SET_SRC (def_set) = src;
1133   SET_DEST (def_set) = dst;
1134 
1135   /* Drop possible dead definitions.  */
1136   PATTERN (insn) = def_set;
1137 
1138   INSN_CODE (insn) = -1;
1139   int patt = recog_memoized (insn);
1140   if (patt == -1)
1141     fatal_insn_not_found (insn);
1142   df_insn_rescan (insn);
1143 }
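/* As an example of the net effect, a chain insn of the form
     (set (reg:DI d) (plus:DI (reg:DI a) (reg:DI b)))
   is rewritten into a V2DImode addition on (subreg:V2DI ...) operands,
   so the arithmetic is carried out in an SSE register with only the low
   element being meaningful.  */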
1144 
1145 /* Fix uses of converted REG in debug insns.  */
1146 
1147 void
1148 timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1149 {
1150   if (!flag_var_tracking)
1151     return;
1152 
1153   df_ref ref, next;
1154   for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1155     {
1156       rtx_insn *insn = DF_REF_INSN (ref);
1157       /* Make sure the next ref is for a different instruction,
1158          so that we're not affected by the rescan.  */
1159       next = DF_REF_NEXT_REG (ref);
1160       while (next && DF_REF_INSN (next) == insn)
1161 	next = DF_REF_NEXT_REG (next);
1162 
1163       if (DEBUG_INSN_P (insn))
1164 	{
1165 	  /* It may be a debug insn with a TImode variable in
1166 	     register.  */
1167 	  bool changed = false;
1168 	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1169 	    {
1170 	      rtx *loc = DF_REF_LOC (ref);
1171 	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1172 		{
1173 		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1174 		  changed = true;
1175 		}
1176 	    }
1177 	  if (changed)
1178 	    df_insn_rescan (insn);
1179 	}
1180     }
1181 }
1182 
1183 /* Convert INSN from TImode to V1TImode.  */
1184 
1185 void
1186 timode_scalar_chain::convert_insn (rtx_insn *insn)
1187 {
1188   rtx def_set = single_set (insn);
1189   rtx src = SET_SRC (def_set);
1190   rtx dst = SET_DEST (def_set);
1191 
1192   switch (GET_CODE (dst))
1193     {
1194     case REG:
1195       {
1196 	rtx tmp = find_reg_equal_equiv_note (insn);
1197 	if (tmp)
1198 	  PUT_MODE (XEXP (tmp, 0), V1TImode);
1199 	PUT_MODE (dst, V1TImode);
1200 	fix_debug_reg_uses (dst);
1201       }
1202       break;
1203     case MEM:
1204       PUT_MODE (dst, V1TImode);
1205       break;
1206 
1207     default:
1208       gcc_unreachable ();
1209     }
1210 
1211   switch (GET_CODE (src))
1212     {
1213     case REG:
1214       PUT_MODE (src, V1TImode);
1215       /* Call fix_debug_reg_uses only if SRC is never defined.  */
1216       if (!DF_REG_DEF_CHAIN (REGNO (src)))
1217 	fix_debug_reg_uses (src);
1218       break;
1219 
1220     case MEM:
1221       PUT_MODE (src, V1TImode);
1222       break;
1223 
1224     case CONST_WIDE_INT:
1225       if (NONDEBUG_INSN_P (insn))
1226 	{
1227 	  /* Since there are no instructions to store a 128-bit constant,
1228 	     temporary register usage is required.  */
1229 	  rtx tmp = gen_reg_rtx (V1TImode);
1230 	  start_sequence ();
1231 	  src = gen_rtx_CONST_VECTOR (V1TImode, gen_rtvec (1, src));
1232 	  src = validize_mem (force_const_mem (V1TImode, src));
1233 	  rtx_insn *seq = get_insns ();
1234 	  end_sequence ();
1235 	  if (seq)
1236 	    emit_insn_before (seq, insn);
1237 	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1238 	  dst = tmp;
1239 	}
1240       break;
1241 
1242     case CONST_INT:
1243       switch (standard_sse_constant_p (src, TImode))
1244 	{
1245 	case 1:
1246 	  src = CONST0_RTX (GET_MODE (dst));
1247 	  break;
1248 	case 2:
1249 	  src = CONSTM1_RTX (GET_MODE (dst));
1250 	  break;
1251 	default:
1252 	  gcc_unreachable ();
1253 	}
1254       if (NONDEBUG_INSN_P (insn))
1255 	{
1256 	  rtx tmp = gen_reg_rtx (V1TImode);
1257 	  /* Since there are no instructions to store a standard SSE
1258 	     constant, temporary register usage is required.  */
1259 	  emit_conversion_insns (gen_rtx_SET (dst, tmp), insn);
1260 	  dst = tmp;
1261 	}
1262       break;
1263 
1264     default:
1265       gcc_unreachable ();
1266     }
1267 
1268   SET_SRC (def_set) = src;
1269   SET_DEST (def_set) = dst;
1270 
1271   /* Drop possible dead definitions.  */
1272   PATTERN (insn) = def_set;
1273 
1274   INSN_CODE (insn) = -1;
1275   recog_memoized (insn);
1276   df_insn_rescan (insn);
1277 }
1278 
1279 /* Generate copies from defs used by the chain but not defined therein.
1280    Also populates defs_map which is used later by convert_insn.  */
1281 
1282 void
1283 general_scalar_chain::convert_registers ()
1284 {
1285   bitmap_iterator bi;
1286   unsigned id;
1287   EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
1288     {
1289       rtx chain_reg = gen_reg_rtx (smode);
1290       defs_map.put (regno_reg_rtx[id], chain_reg);
1291     }
1292   EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
1293     for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
1294       if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1295 	make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
1296 }
1297 
1298 /* Convert whole chain creating required register
1299    conversions and copies.  */
1300 
1301 int
1302 scalar_chain::convert ()
1303 {
1304   bitmap_iterator bi;
1305   unsigned id;
1306   int converted_insns = 0;
1307 
1308   if (!dbg_cnt (stv_conversion))
1309     return 0;
1310 
1311   if (dump_file)
1312     fprintf (dump_file, "Converting chain #%d...\n", chain_id);
1313 
1314   convert_registers ();
1315 
1316   EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
1317     {
1318       convert_insn (DF_INSN_UID_GET (id)->insn);
1319       converted_insns++;
1320     }
1321 
1322   return converted_insns;
1323 }
1324 
1325 /* Return the SET expression if INSN doesn't reference a hard register.
1326    Return NULL if INSN uses or defines a hard register, excluding
1327    pseudo register pushes, hard register uses in a memory address,
1328    clobbers and flags definitions.  */
1329 
1330 static rtx
1331 pseudo_reg_set (rtx_insn *insn)
1332 {
1333   rtx set = single_set (insn);
1334   if (!set)
1335     return NULL;
1336 
1337   /* Check pseudo register push first. */
1338   machine_mode mode = TARGET_64BIT ? TImode : DImode;
1339   if (REG_P (SET_SRC (set))
1340       && !HARD_REGISTER_P (SET_SRC (set))
1341       && push_operand (SET_DEST (set), mode))
1342     return set;
1343 
1344   df_ref ref;
1345   FOR_EACH_INSN_DEF (ref, insn)
1346     if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
1347 	&& !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
1348 	&& DF_REF_REGNO (ref) != FLAGS_REG)
1349       return NULL;
1350 
1351   FOR_EACH_INSN_USE (ref, insn)
1352     if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
1353       return NULL;
1354 
1355   return set;
1356 }
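/* For instance, a push of a DImode pseudo on a 32-bit target is still
   returned here even though its address implicitly uses the hard stack
   pointer: pushes are accepted before the hard register scan, and register
   uses inside memory addresses are skipped via DF_REF_REG_MEM_P.  */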
1357 
1358 /* Check if comparison INSN may be transformed
1359    into a vector comparison.  Currently we only transform
1360    zero checks which look like:
1361 
1362    (set (reg:CCZ 17 flags)
1363         (compare:CCZ (ior:SI (subreg:SI (reg:DI x) 4)
1364                              (subreg:SI (reg:DI x) 0))
1365 		     (const_int 0 [0])))  */
1366 
1367 static bool
1368 convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
1369 {
1370   /* ??? Currently convertible for double-word DImode chain only.  */
1371   if (TARGET_64BIT || mode != DImode)
1372     return false;
1373 
1374   if (!TARGET_SSE4_1)
1375     return false;
1376 
1377   rtx def_set = single_set (insn);
1378 
1379   gcc_assert (def_set);
1380 
1381   rtx src = SET_SRC (def_set);
1382   rtx dst = SET_DEST (def_set);
1383 
1384   gcc_assert (GET_CODE (src) == COMPARE);
1385 
1386   if (GET_CODE (dst) != REG
1387       || REGNO (dst) != FLAGS_REG
1388       || GET_MODE (dst) != CCZmode)
1389     return false;
1390 
1391   rtx op1 = XEXP (src, 0);
1392   rtx op2 = XEXP (src, 1);
1393 
1394   if (op2 != CONST0_RTX (GET_MODE (op2)))
1395     return false;
1396 
1397   if (GET_CODE (op1) != IOR)
1398     return false;
1399 
1400   op2 = XEXP (op1, 1);
1401   op1 = XEXP (op1, 0);
1402 
1403   if (!SUBREG_P (op1)
1404       || !SUBREG_P (op2)
1405       || GET_MODE (op1) != SImode
1406       || GET_MODE (op2) != SImode
1407       || ((SUBREG_BYTE (op1) != 0
1408 	   || SUBREG_BYTE (op2) != GET_MODE_SIZE (SImode))
1409 	  && (SUBREG_BYTE (op2) != 0
1410 	      || SUBREG_BYTE (op1) != GET_MODE_SIZE (SImode))))
1411     return false;
1412 
1413   op1 = SUBREG_REG (op1);
1414   op2 = SUBREG_REG (op2);
1415 
1416   if (op1 != op2
1417       || !REG_P (op1)
1418       || GET_MODE (op1) != DImode)
1419     return false;
1420 
1421   return true;
1422 }
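/* When such a comparison is accepted, general_scalar_chain::convert_insn
   above rewrites it into a ptest-style check: the DImode register is
   interleaved with itself and tested through an UNSPEC_PTEST, so the IOR
   of the two SImode halves never has to be materialized in GPRs.  */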
1423 
1424 /* The general version of scalar_to_vector_candidate_p.  */
1425 
1426 static bool
1427 general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
1428 {
1429   rtx def_set = pseudo_reg_set (insn);
1430 
1431   if (!def_set)
1432     return false;
1433 
1434   rtx src = SET_SRC (def_set);
1435   rtx dst = SET_DEST (def_set);
1436 
1437   if (GET_CODE (src) == COMPARE)
1438     return convertible_comparison_p (insn, mode);
1439 
1440   /* We are interested in "mode" only.  */
1441   if ((GET_MODE (src) != mode
1442        && !CONST_INT_P (src))
1443       || GET_MODE (dst) != mode)
1444     return false;
1445 
1446   if (!REG_P (dst) && !MEM_P (dst))
1447     return false;
1448 
1449   switch (GET_CODE (src))
1450     {
1451     case ASHIFTRT:
1452       if (!TARGET_AVX512VL)
1453 	return false;
1454       /* FALLTHRU */
1455 
1456     case ASHIFT:
1457     case LSHIFTRT:
1458       if (!CONST_INT_P (XEXP (src, 1))
1459 	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
1460 	return false;
1461       break;
1462 
1463     case SMAX:
1464     case SMIN:
1465     case UMAX:
1466     case UMIN:
1467       if ((mode == DImode && !TARGET_AVX512VL)
1468 	  || (mode == SImode && !TARGET_SSE4_1))
1469 	return false;
1470       /* Fallthru.  */
1471 
1472     case AND:
1473     case IOR:
1474     case XOR:
1475     case PLUS:
1476     case MINUS:
1477       if (!REG_P (XEXP (src, 1))
1478 	  && !MEM_P (XEXP (src, 1))
1479 	  && !CONST_INT_P (XEXP (src, 1)))
1480 	return false;
1481 
1482       if (GET_MODE (XEXP (src, 1)) != mode
1483 	  && !CONST_INT_P (XEXP (src, 1)))
1484 	return false;
1485 
1486       /* Check for andnot case.  */
1487       if (GET_CODE (src) != AND
1488 	  || GET_CODE (XEXP (src, 0)) != NOT)
1489 	break;
1490 
1491       src = XEXP (src, 0);
1492       /* FALLTHRU */
1493 
1494     case NOT:
1495       break;
1496 
1497     case NEG:
1498       /* Check for nabs case.  */
1499       if (GET_CODE (XEXP (src, 0)) != ABS)
1500 	break;
1501 
1502       src = XEXP (src, 0);
1503       /* FALLTHRU */
1504 
1505     case ABS:
1506       if ((mode == DImode && !TARGET_AVX512VL)
1507 	  || (mode == SImode && !TARGET_SSSE3))
1508 	return false;
1509       break;
1510 
1511     case REG:
1512       return true;
1513 
1514     case MEM:
1515     case CONST_INT:
1516       return REG_P (dst);
1517 
1518     default:
1519       return false;
1520     }
1521 
1522   if (!REG_P (XEXP (src, 0))
1523       && !MEM_P (XEXP (src, 0))
1524       && !CONST_INT_P (XEXP (src, 0)))
1525     return false;
1526 
1527   if (GET_MODE (XEXP (src, 0)) != mode
1528       && !CONST_INT_P (XEXP (src, 0)))
1529     return false;
1530 
1531   return true;
1532 }
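/* As a rough illustration, on a 32-bit target an expression such as
   "unsigned long long z = x & y;" typically reaches this pass as a DImode
   AND that satisfies the checks above, whereas a shift by a variable
   amount is rejected because the shift count must be a CONST_INT.  */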
1533 
1534 /* The TImode version of scalar_to_vector_candidate_p.  */
1535 
1536 static bool
1537 timode_scalar_to_vector_candidate_p (rtx_insn *insn)
1538 {
1539   rtx def_set = pseudo_reg_set (insn);
1540 
1541   if (!def_set)
1542     return false;
1543 
1544   rtx src = SET_SRC (def_set);
1545   rtx dst = SET_DEST (def_set);
1546 
1547   /* Only TImode load and store are allowed.  */
1548   if (GET_MODE (dst) != TImode)
1549     return false;
1550 
1551   if (MEM_P (dst))
1552     {
1553       /* Check for a store.  The memory must be aligned, or unaligned
1554 	 stores must be optimal.  Only support stores from a register,
1555 	 a standard SSE constant or a CONST_WIDE_INT from a piecewise store.
1556 
1557 	 ??? Verify performance impact before enabling CONST_INT for
1558 	 __int128 store.  */
1559       if (misaligned_operand (dst, TImode)
1560 	  && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
1561 	return false;
1562 
1563       switch (GET_CODE (src))
1564 	{
1565 	default:
1566 	  return false;
1567 
1568 	case REG:
1569 	case CONST_WIDE_INT:
1570 	  return true;
1571 
1572 	case CONST_INT:
1573 	  return standard_sse_constant_p (src, TImode);
1574 	}
1575     }
1576   else if (MEM_P (src))
1577     {
1578       /* Check for a load.  The memory must be aligned, or unaligned
1579 	 loads must be optimal.  */
1580       return (REG_P (dst)
1581 	      && (!misaligned_operand (src, TImode)
1582 		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
1583     }
1584 
1585   return false;
1586 }
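/* For example, copying an aligned __int128 value from one memory location
   to another gives a TImode load and a TImode store that both satisfy the
   checks above; after conversion they become V1TImode moves, i.e. single
   128-bit SSE loads and stores rather than pairs of 64-bit GPR moves.  */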
1587 
1588 /* For a register REGNO, scan instructions for its defs and uses.
1589    Put REGNO in REGS if a def or use isn't in CANDIDATES.  */
1590 
1591 static void
1592 timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
1593 				   unsigned int regno)
1594 {
1595   for (df_ref def = DF_REG_DEF_CHAIN (regno);
1596        def;
1597        def = DF_REF_NEXT_REG (def))
1598     {
1599       if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1600 	{
1601 	  if (dump_file)
1602 	    fprintf (dump_file,
1603 		     "r%d has non convertible def in insn %d\n",
1604 		     regno, DF_REF_INSN_UID (def));
1605 
1606 	  bitmap_set_bit (regs, regno);
1607 	  break;
1608 	}
1609     }
1610 
1611   for (df_ref ref = DF_REG_USE_CHAIN (regno);
1612        ref;
1613        ref = DF_REF_NEXT_REG (ref))
1614     {
1615       /* Debug instructions are skipped.  */
1616       if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
1617 	  && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1618 	{
1619 	  if (dump_file)
1620 	    fprintf (dump_file,
1621 		     "r%d has non convertible use in insn %d\n",
1622 		     regno, DF_REF_INSN_UID (ref));
1623 
1624 	  bitmap_set_bit (regs, regno);
1625 	  break;
1626 	}
1627     }
1628 }
1629 
1630 /* The TImode version of remove_non_convertible_regs.  */
1631 
1632 static void
1633 timode_remove_non_convertible_regs (bitmap candidates)
1634 {
1635   bitmap_iterator bi;
1636   unsigned id;
1637   bitmap regs = BITMAP_ALLOC (NULL);
1638 
1639   EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
1640     {
1641       rtx def_set = single_set (DF_INSN_UID_GET (id)->insn);
1642       rtx dest = SET_DEST (def_set);
1643       rtx src = SET_SRC (def_set);
1644 
1645       if ((!REG_P (dest)
1646 	   || bitmap_bit_p (regs, REGNO (dest))
1647 	   || HARD_REGISTER_P (dest))
1648 	  && (!REG_P (src)
1649 	      || bitmap_bit_p (regs, REGNO (src))
1650 	      || HARD_REGISTER_P (src)))
1651 	continue;
1652 
1653       if (REG_P (dest))
1654 	timode_check_non_convertible_regs (candidates, regs,
1655 					   REGNO (dest));
1656 
1657       if (REG_P (src))
1658 	timode_check_non_convertible_regs (candidates, regs,
1659 					   REGNO (src));
1660     }
1661 
1662   EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
1663     {
1664       for (df_ref def = DF_REG_DEF_CHAIN (id);
1665 	   def;
1666 	   def = DF_REF_NEXT_REG (def))
1667 	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
1668 	  {
1669 	    if (dump_file)
1670 	      fprintf (dump_file, "Removing insn %d from candidates list\n",
1671 		       DF_REF_INSN_UID (def));
1672 
1673 	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
1674 	  }
1675 
1676       for (df_ref ref = DF_REG_USE_CHAIN (id);
1677 	   ref;
1678 	   ref = DF_REF_NEXT_REG (ref))
1679 	if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
1680 	  {
1681 	    if (dump_file)
1682 	      fprintf (dump_file, "Removing insn %d from candidates list\n",
1683 		       DF_REF_INSN_UID (ref));
1684 
1685 	    bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
1686 	  }
1687     }
1688 
1689   BITMAP_FREE (regs);
1690 }
1691 
1692 /* Main STV pass function.  Find and convert scalar
1693    instructions into vector mode when profitable.  */
1694 
1695 static unsigned int
1696 convert_scalars_to_vector (bool timode_p)
1697 {
1698   basic_block bb;
1699   int converted_insns = 0;
1700   auto_vec<rtx_insn *> control_flow_insns;
1701 
1702   bitmap_obstack_initialize (NULL);
1703   const machine_mode cand_mode[3] = { SImode, DImode, TImode };
1704   const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
1705   bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
1706   for (unsigned i = 0; i < 3; ++i)
1707     bitmap_initialize (&candidates[i], &bitmap_default_obstack);
1708 
1709   calculate_dominance_info (CDI_DOMINATORS);
1710   df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
1711   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
1712   df_analyze ();
1713 
1714   /* Find all instructions we want to convert into vector mode.  */
1715   if (dump_file)
1716     fprintf (dump_file, "Searching for mode conversion candidates...\n");
1717 
1718   FOR_EACH_BB_FN (bb, cfun)
1719     {
1720       rtx_insn *insn;
1721       FOR_BB_INSNS (bb, insn)
1722 	if (timode_p
1723 	    && timode_scalar_to_vector_candidate_p (insn))
1724 	  {
1725 	    if (dump_file)
1726 	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
1727 		       INSN_UID (insn));
1728 
1729 	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
1730 	  }
1731 	else if (!timode_p)
1732 	  {
1733 	    /* Check {SI,DI}mode.  */
1734 	    for (unsigned i = 0; i <= 1; ++i)
1735 	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
1736 		{
1737 		  if (dump_file)
1738 		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
1739 			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");
1740 
1741 		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
1742 		  break;
1743 		}
1744 	  }
1745     }
1746 
1747   if (timode_p)
1748     timode_remove_non_convertible_regs (&candidates[2]);
1749 
1750   for (unsigned i = 0; i <= 2; ++i)
1751     if (!bitmap_empty_p (&candidates[i]))
1752       break;
1753     else if (i == 2 && dump_file)
1754       fprintf (dump_file, "There are no candidates for optimization.\n");
1755 
1756   for (unsigned i = 0; i <= 2; ++i)
1757     while (!bitmap_empty_p (&candidates[i]))
1758       {
1759 	unsigned uid = bitmap_first_set_bit (&candidates[i]);
1760 	scalar_chain *chain;
1761 
1762 	if (cand_mode[i] == TImode)
1763 	  chain = new timode_scalar_chain;
1764 	else
1765 	  chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
1766 
1767 	/* Find instructions chain we want to convert to vector mode.
1768 	   Check all uses and definitions to estimate all required
1769 	   conversions.  */
1770 	chain->build (&candidates[i], uid);
1771 
1772 	if (chain->compute_convert_gain () > 0)
1773 	  converted_insns += chain->convert ();
1774 	else
1775 	  if (dump_file)
1776 	    fprintf (dump_file, "Chain #%d conversion is not profitable\n",
1777 		     chain->chain_id);
1778 
1779 	rtx_insn* iter_insn;
1780 	unsigned int ii;
1781 	FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
1782 	  control_flow_insns.safe_push (iter_insn);
1783 
1784 	delete chain;
1785       }
1786 
1787   if (dump_file)
1788     fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
1789 
1790   for (unsigned i = 0; i <= 2; ++i)
1791     bitmap_release (&candidates[i]);
1792   bitmap_obstack_release (NULL);
1793   df_process_deferred_rescans ();
1794 
1795   /* Conversion means we may have 128bit register spills/fills
1796      which require an aligned stack.  */
1797   if (converted_insns)
1798     {
1799       if (crtl->stack_alignment_needed < 128)
1800 	crtl->stack_alignment_needed = 128;
1801       if (crtl->stack_alignment_estimated < 128)
1802 	crtl->stack_alignment_estimated = 128;
1803 
1804       crtl->stack_realign_needed
1805 	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
1806       crtl->stack_realign_tried = crtl->stack_realign_needed;
1807 
1808       crtl->stack_realign_processed = true;
1809 
1810       if (!crtl->drap_reg)
1811 	{
1812 	  rtx drap_rtx = targetm.calls.get_drap_rtx ();
1813 
1814 	  /* stack_realign_drap and drap_rtx must match.  */
1815 	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
1816 
1817 	  /* Do nothing if NULL is returned,
1818 	     which means DRAP is not needed.  */
1819 	  if (drap_rtx != NULL)
1820 	    {
1821 	      crtl->args.internal_arg_pointer = drap_rtx;
1822 
1823 	      /* Call fixup_tail_calls to clean up
1824 		 REG_EQUIV note if DRAP is needed. */
1825 	      fixup_tail_calls ();
1826 	    }
1827 	}
1828 
1829       /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
1830       if (TARGET_64BIT)
1831 	for (tree parm = DECL_ARGUMENTS (current_function_decl);
1832 	     parm; parm = DECL_CHAIN (parm))
1833 	  {
1834 	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
1835 	      continue;
1836 	    if (DECL_RTL_SET_P (parm)
1837 		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
1838 	      {
1839 		rtx r = DECL_RTL (parm);
1840 		if (REG_P (r))
1841 		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
1842 	      }
1843 	    if (DECL_INCOMING_RTL (parm)
1844 		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
1845 	      {
1846 		rtx r = DECL_INCOMING_RTL (parm);
1847 		if (REG_P (r))
1848 		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
1849 	      }
1850 	  }
1851 
1852       if (!control_flow_insns.is_empty ())
1853 	{
1854 	  free_dominance_info (CDI_DOMINATORS);
1855 
1856 	  unsigned int i;
1857 	  rtx_insn* insn;
1858 	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
1859 	    if (control_flow_insn_p (insn))
1860 	      {
1861 		/* Split the block after insn.  There will be a fallthru
1862 		   edge, which is OK so we keep it.  We have to create
1863 		   the exception edges ourselves.  */
1864 		bb = BLOCK_FOR_INSN (insn);
1865 		split_block (bb, insn);
1866 		rtl_make_eh_edge (NULL, bb, BB_END (bb));
1867 	      }
1868 	}
1869     }
1870 
1871   return 0;
1872 }
1873 
1874 static unsigned int
1875 rest_of_handle_insert_vzeroupper (void)
1876 {
1877   /* vzeroupper instructions are inserted immediately after reload to
1878      account for possible spills from 256bit or 512bit registers.  The pass
1879      reuses mode switching infrastructure by re-running mode insertion
1880      pass, so disable entities that have already been processed.  */
1881   for (int i = 0; i < MAX_386_ENTITIES; i++)
1882     ix86_optimize_mode_switching[i] = 0;
1883 
1884   ix86_optimize_mode_switching[AVX_U128] = 1;
1885 
1886   /* Call optimize_mode_switching.  */
1887   g->get_passes ()->execute_pass_mode_switching ();
1888 
1889   df_analyze ();
1890   return 0;
1891 }
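
/* For context: vzeroupper clears the upper bits of the YMM/ZMM registers.
   Emitting it before a transition from 256/512-bit AVX code to legacy
   128-bit SSE code avoids the penalty that several microarchitectures
   charge while the upper halves are still live, e.g. (a sketch only):

	vaddps	%ymm1, %ymm0, %ymm0	# 256-bit AVX computation
	vzeroupper			# inserted by this pass
	call	some_sse_only_routine	# no AVX/SSE transition stall

   The actual placement decisions are made by the generic mode-switching
   pass invoked above.  */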
1892 
1893 namespace {
1894 
1895 const pass_data pass_data_insert_vzeroupper =
1896 {
1897   RTL_PASS, /* type */
1898   "vzeroupper", /* name */
1899   OPTGROUP_NONE, /* optinfo_flags */
1900   TV_MACH_DEP, /* tv_id */
1901   0, /* properties_required */
1902   0, /* properties_provided */
1903   0, /* properties_destroyed */
1904   0, /* todo_flags_start */
1905   TODO_df_finish, /* todo_flags_finish */
1906 };
1907 
1908 class pass_insert_vzeroupper : public rtl_opt_pass
1909 {
1910 public:
1911   pass_insert_vzeroupper(gcc::context *ctxt)
1912     : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
1913   {}
1914 
1915   /* opt_pass methods: */
1916   virtual bool gate (function *)
1917     {
1918       return TARGET_AVX && TARGET_VZEROUPPER;
1919     }
1920 
1921   virtual unsigned int execute (function *)
1922     {
1923       return rest_of_handle_insert_vzeroupper ();
1924     }
1925 
1926 }; // class pass_insert_vzeroupper
1927 
1928 const pass_data pass_data_stv =
1929 {
1930   RTL_PASS, /* type */
1931   "stv", /* name */
1932   OPTGROUP_NONE, /* optinfo_flags */
1933   TV_MACH_DEP, /* tv_id */
1934   0, /* properties_required */
1935   0, /* properties_provided */
1936   0, /* properties_destroyed */
1937   0, /* todo_flags_start */
1938   TODO_df_finish, /* todo_flags_finish */
1939 };
1940 
1941 class pass_stv : public rtl_opt_pass
1942 {
1943 public:
1944   pass_stv (gcc::context *ctxt)
1945     : rtl_opt_pass (pass_data_stv, ctxt),
1946       timode_p (false)
1947   {}
1948 
1949   /* opt_pass methods: */
1950   virtual bool gate (function *)
1951     {
1952       return ((!timode_p || TARGET_64BIT)
1953 	      && TARGET_STV && TARGET_SSE2 && optimize > 1);
1954     }
1955 
1956   virtual unsigned int execute (function *)
1957     {
1958       return convert_scalars_to_vector (timode_p);
1959     }
1960 
1961   opt_pass *clone ()
1962     {
1963       return new pass_stv (m_ctxt);
1964     }
1965 
1966   void set_pass_param (unsigned int n, bool param)
1967     {
1968       gcc_assert (n == 0);
1969       timode_p = param;
1970     }
1971 
1972 private:
1973   bool timode_p;
1974 }; // class pass_stv
1975 
1976 } // anon namespace
1977 
1978 rtl_opt_pass *
1979 make_pass_insert_vzeroupper (gcc::context *ctxt)
1980 {
1981   return new pass_insert_vzeroupper (ctxt);
1982 }
1983 
1984 rtl_opt_pass *
1985 make_pass_stv (gcc::context *ctxt)
1986 {
1987   return new pass_stv (ctxt);
1988 }
1989 
1990 /* Inserting ENDBR and pseudo patchable-area instructions.  */
1991 
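/* Background: with -fcf-protection=branch, ENDBR (endbr64/endbr32) marks
   the only locations where an indirect call or jump may land under CET
   indirect-branch tracking, so one is normally needed at the function
   entry and after insns that can be reached indirectly.  The patchable
   area is the run of NOPs requested by -fpatchable-function-entry, kept
   at the entry so tools can rewrite it at run time.  */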
1992 static void
1993 rest_of_insert_endbr_and_patchable_area (bool need_endbr,
1994 					 unsigned int patchable_area_size)
1995 {
1996   rtx endbr;
1997   rtx_insn *insn;
1998   rtx_insn *endbr_insn = NULL;
1999   basic_block bb;
2000 
2001   if (need_endbr)
2002     {
2003       /* Currently emit ENDBR if this is a tracked function, i.e. the
2004 	 'nocf_check' attribute is absent from the function attributes.
2005 	 Later an optimization will be introduced to analyze whether the
2006 	 address of a static function is taken.  A static function whose
2007 	 address is not taken will get a nocf_check attribute.  This will
2008 	 allow reducing the number of ENDBR instructions.  */
2009       if (!lookup_attribute ("nocf_check",
2010 			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
2011 	  && (!flag_manual_endbr
2012 	      || lookup_attribute ("cf_check",
2013 				   DECL_ATTRIBUTES (cfun->decl)))
2014 	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
2015 	      || ix86_cmodel == CM_LARGE
2016 	      || ix86_cmodel == CM_LARGE_PIC
2017 	      || flag_force_indirect_call
2018 	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
2019 		  && DECL_DLLIMPORT_P (cfun->decl))))
2020 	{
2021 	  if (crtl->profile && flag_fentry)
2022 	    {
2023 	      /* Queue ENDBR insertion to x86_function_profiler.
2024 		 NB: Any patchable-area insn will be inserted after
2025 		 ENDBR.  */
2026 	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
2027 	    }
2028 	  else
2029 	    {
2030 	      endbr = gen_nop_endbr ();
2031 	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2032 	      rtx_insn *insn = BB_HEAD (bb);
2033 	      endbr_insn = emit_insn_before (endbr, insn);
2034 	    }
2035 	}
2036     }
2037 
2038   if (patchable_area_size)
2039     {
2040       if (crtl->profile && flag_fentry)
2041 	{
2042 	  /* Queue patchable-area insertion to x86_function_profiler.
2043 	     NB: If there is a queued ENDBR, x86_function_profiler
2044 	     will also handle patchable-area.  */
2045 	  if (!cfun->machine->insn_queued_at_entrance)
2046 	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
2047 	}
2048       else
2049 	{
2050 	  rtx patchable_area
2051 	    = gen_patchable_area (GEN_INT (patchable_area_size),
2052 				  GEN_INT (crtl->patch_area_entry == 0));
2053 	  if (endbr_insn)
2054 	    emit_insn_after (patchable_area, endbr_insn);
2055 	  else
2056 	    {
2057 	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
2058 	      insn = BB_HEAD (bb);
2059 	      emit_insn_before (patchable_area, insn);
2060 	    }
2061 	}
2062     }
2063 
2064   if (!need_endbr)
2065     return;
2066 
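  /* The scan below adds ENDBR in the function body, roughly for three
     cases: after calls that may return more than once (REG_SETJMP note)
     or whose callee is marked "indirect_return", after the target labels
     of a switch jump table when -mcet-switch is enabled, and after labels
     whose address escapes (LABEL_PRESERVE_P).  */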
2067   bb = 0;
2068   FOR_EACH_BB_FN (bb, cfun)
2069     {
2070       for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
2071 	   insn = NEXT_INSN (insn))
2072 	{
2073 	  if (CALL_P (insn))
2074 	    {
2075 	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
2076 	      if (!need_endbr && !SIBLING_CALL_P (insn))
2077 		{
2078 		  rtx call = get_call_rtx_from (insn);
2079 		  rtx fnaddr = XEXP (call, 0);
2080 		  tree fndecl = NULL_TREE;
2081 
2082 		  /* Also generate ENDBRANCH for a non-tail call which
2083 		     may return via an indirect branch.  */
2084 		  if (GET_CODE (XEXP (fnaddr, 0)) == SYMBOL_REF)
2085 		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
2086 		  if (fndecl == NULL_TREE)
2087 		    fndecl = MEM_EXPR (fnaddr);
2088 		  if (fndecl
2089 		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
2090 		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
2091 		    fndecl = NULL_TREE;
2092 		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
2093 		    {
2094 		      tree fntype = TREE_TYPE (fndecl);
2095 		      if (lookup_attribute ("indirect_return",
2096 					    TYPE_ATTRIBUTES (fntype)))
2097 			need_endbr = true;
2098 		    }
2099 		}
2100 	      if (!need_endbr)
2101 		continue;
2102 	      /* Generate ENDBRANCH after a CALL that can return more than
2103 		 once, such as setjmp-like functions.  */
2104 
2105 	      endbr = gen_nop_endbr ();
2106 	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
2107 	      continue;
2108 	    }
2109 
2110 	  if (JUMP_P (insn) && flag_cet_switch)
2111 	    {
2112 	      rtx target = JUMP_LABEL (insn);
2113 	      if (target == NULL_RTX || ANY_RETURN_P (target))
2114 		continue;
2115 
2116 	      /* Check that the jump uses a switch table.  */
2117 	      rtx_insn *label = as_a<rtx_insn *> (target);
2118 	      rtx_insn *table = next_insn (label);
2119 	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
2120 		continue;
2121 
2122 	      /* For the indirect jump find out all places it jumps and insert
2123 		 ENDBRANCH there.  It should be done under a special flag to
2124 		 control ENDBRANCH generation for switch stmts.  */
2125 	      edge_iterator ei;
2126 	      edge e;
2127 	      basic_block dest_blk;
2128 
2129 	      FOR_EACH_EDGE (e, ei, bb->succs)
2130 		{
2131 		  rtx_insn *insn;
2132 
2133 		  dest_blk = e->dest;
2134 		  insn = BB_HEAD (dest_blk);
2135 		  gcc_assert (LABEL_P (insn));
2136 		  endbr = gen_nop_endbr ();
2137 		  emit_insn_after (endbr, insn);
2138 		}
2139 	      continue;
2140 	    }
2141 
2142 	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
2143 	    {
2144 	      endbr = gen_nop_endbr ();
2145 	      emit_insn_after (endbr, insn);
2146 	      continue;
2147 	    }
2148 	}
2149     }
2150 
2151   return;
2152 }
2153 
2154 namespace {
2155 
2156 const pass_data pass_data_insert_endbr_and_patchable_area =
2157 {
2158   RTL_PASS, /* type.  */
2159   "endbr_and_patchable_area", /* name.  */
2160   OPTGROUP_NONE, /* optinfo_flags.  */
2161   TV_MACH_DEP, /* tv_id.  */
2162   0, /* properties_required.  */
2163   0, /* properties_provided.  */
2164   0, /* properties_destroyed.  */
2165   0, /* todo_flags_start.  */
2166   0, /* todo_flags_finish.  */
2167 };
2168 
2169 class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
2170 {
2171 public:
2172   pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2173     : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
2174   {}
2175 
2176   /* opt_pass methods: */
2177   virtual bool gate (function *)
2178     {
2179       need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
2180       patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
2181       return need_endbr || patchable_area_size;
2182     }
2183 
2184   virtual unsigned int execute (function *)
2185     {
2186       timevar_push (TV_MACH_DEP);
2187       rest_of_insert_endbr_and_patchable_area (need_endbr,
2188 					       patchable_area_size);
2189       timevar_pop (TV_MACH_DEP);
2190       return 0;
2191     }
2192 
2193 private:
2194   bool need_endbr;
2195   unsigned int patchable_area_size;
2196 }; // class pass_insert_endbr_and_patchable_area
2197 
2198 } // anon namespace
2199 
2200 rtl_opt_pass *
2201 make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
2202 {
2203   return new pass_insert_endbr_and_patchable_area (ctxt);
2204 }
2205 
2206 /* At entry of the nearest common dominator for basic blocks with
2207    conversions/rcp/sqrt/rsqrt/round, generate a single
2208 	vxorps %xmmN, %xmmN, %xmmN
2209    for all
2210 	vcvtss2sd  op, %xmmN, %xmmX
2211 	vcvtsd2ss  op, %xmmN, %xmmX
2212 	vcvtsi2ss  op, %xmmN, %xmmX
2213 	vcvtsi2sd  op, %xmmN, %xmmX
2214 
2215    NB: We want to generate only a single vxorps to cover the whole
2216    function.  The LCM algorithm isn't appropriate here since it may
2217    place a vxorps inside the loop.  */
2218 
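/* A sketch of the rewrite done below (register names are arbitrary): a
   scalar conversion such as

	(set (reg:SF x) (float:SF (reg:SI i)))

   assembles to vcvtsi2ss, which writes only the low element of its
   destination XMM register and therefore carries a false dependency on
   whatever was in it before.  The pass turns it into

	(set (reg:V4SF tmp)
	     (vec_merge:V4SF (vec_duplicate:V4SF (float:SF (reg:SI i)))
			     (reg:V4SF v4sf_const0)
			     (const_int 1)))
	(set (reg:SF x) (subreg:SF (reg:V4SF tmp) 0))

   where v4sf_const0 is the single vxorps-cleared register described
   above, so the whole destination vector is defined.  */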
2219 static unsigned int
2220 remove_partial_avx_dependency (void)
2221 {
2222   timevar_push (TV_MACH_DEP);
2223 
2224   bitmap_obstack_initialize (NULL);
2225   bitmap convert_bbs = BITMAP_ALLOC (NULL);
2226 
2227   basic_block bb;
2228   rtx_insn *insn, *set_insn;
2229   rtx set;
2230   rtx v4sf_const0 = NULL_RTX;
2231 
2232   auto_vec<rtx_insn *> control_flow_insns;
2233 
2234   /* We create invalid RTL initially so defer rescans.  */
2235   df_set_flags (DF_DEFER_INSN_RESCAN);
2236 
2237   FOR_EACH_BB_FN (bb, cfun)
2238     {
2239       FOR_BB_INSNS (bb, insn)
2240 	{
2241 	  if (!NONDEBUG_INSN_P (insn))
2242 	    continue;
2243 
2244 	  set = single_set (insn);
2245 	  if (!set)
2246 	    continue;
2247 
2248 	  if (get_attr_avx_partial_xmm_update (insn)
2249 	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
2250 	    continue;
2251 
2252 	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
2253 	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
2254 	     round, to vec_dup and vec_merge with subreg.  */
2255 	  rtx src = SET_SRC (set);
2256 	  rtx dest = SET_DEST (set);
2257 	  machine_mode dest_mode = GET_MODE (dest);
2258 	  bool convert_p = false;
2259 	  switch (GET_CODE (src))
2260 	    {
2261 	    case FLOAT:
2262 	    case FLOAT_EXTEND:
2263 	    case FLOAT_TRUNCATE:
2264 	    case UNSIGNED_FLOAT:
2265 	      convert_p = true;
2266 	      break;
2267 	    default:
2268 	      break;
2269 	    }
2270 
2271 	  /* Only handle conversions here.  */
2272 	  machine_mode src_mode
2273 	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
2274 	  switch (src_mode)
2275 	    {
2276 	    case E_SFmode:
2277 	    case E_DFmode:
2278 	      if (TARGET_USE_VECTOR_FP_CONVERTS
2279 		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
2280 		continue;
2281 	      break;
2282 	    case E_SImode:
2283 	    case E_DImode:
2284 	      if (TARGET_USE_VECTOR_CONVERTS
2285 		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
2286 		continue;
2287 	      break;
2288 	    case E_VOIDmode:
2289 	      gcc_assert (!convert_p);
2290 	      break;
2291 	    default:
2292 	      gcc_unreachable ();
2293 	    }
2294 
2295 	  if (!v4sf_const0)
2296 	    v4sf_const0 = gen_reg_rtx (V4SFmode);
2297 
2298 	  rtx zero;
2299 	  machine_mode dest_vecmode;
2300 	  switch (dest_mode)
2301 	    {
2302 	    case E_HFmode:
2303 	      dest_vecmode = V8HFmode;
2304 	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
2305 	      break;
2306 	    case E_SFmode:
2307 	      dest_vecmode = V4SFmode;
2308 	      zero = v4sf_const0;
2309 	      break;
2310 	    case E_DFmode:
2311 	      dest_vecmode = V2DFmode;
2312 	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
2313 	      break;
2314 	    default:
2315 	      gcc_unreachable ();
2316 	    }
2317 
2318 	  /* Change source to vector mode.  */
2319 	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
2320 	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
2321 				   GEN_INT (HOST_WIDE_INT_1U));
2322 	  /* Change destination to vector mode.  */
2323 	  rtx vec = gen_reg_rtx (dest_vecmode);
2324 	  /* Generate an XMM vector SET.  */
2325 	  set = gen_rtx_SET (vec, src);
2326 	  set_insn = emit_insn_before (set, insn);
2327 	  df_insn_rescan (set_insn);
2328 
2329 	  if (cfun->can_throw_non_call_exceptions)
2330 	    {
2331 	      /* Handle REG_EH_REGION note.  */
2332 	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
2333 	      if (note)
2334 		{
2335 		  control_flow_insns.safe_push (set_insn);
2336 		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
2337 		}
2338 	    }
2339 
2340 	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
2341 	  set = gen_rtx_SET (dest, src);
2342 
2343 	  /* Drop possible dead definitions.  */
2344 	  PATTERN (insn) = set;
2345 
2346 	  INSN_CODE (insn) = -1;
2347 	  recog_memoized (insn);
2348 	  df_insn_rescan (insn);
2349 	  bitmap_set_bit (convert_bbs, bb->index);
2350 	}
2351     }
2352 
2353   if (v4sf_const0)
2354     {
2355       /* (Re-)discover loops so that bb->loop_father can be used in the
2356 	 analysis below.  */
2357       calculate_dominance_info (CDI_DOMINATORS);
2358       loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
2359 
2360       /* Generate a vxorps at entry of the nearest dominator for basic
2361 	 blocks with conversions, which is in the fake loop that
2362 	 contains the whole function, so that there is only a single
2363 	 vxorps in the whole function.   */
2364       bb = nearest_common_dominator_for_set (CDI_DOMINATORS,
2365 					     convert_bbs);
2366       while (bb->loop_father->latch
2367 	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
2368 	bb = get_immediate_dominator (CDI_DOMINATORS,
2369 				      bb->loop_father->header);
2370 
2371       set = gen_rtx_SET (v4sf_const0, CONST0_RTX (V4SFmode));
2372 
2373       insn = BB_HEAD (bb);
2374       while (insn && !NONDEBUG_INSN_P (insn))
2375 	{
2376 	  if (insn == BB_END (bb))
2377 	    {
2378 	      insn = NULL;
2379 	      break;
2380 	    }
2381 	  insn = NEXT_INSN (insn);
2382 	}
2383       if (insn == BB_HEAD (bb))
2384         set_insn = emit_insn_before (set, insn);
2385       else
2386 	set_insn = emit_insn_after (set,
2387 				    insn ? PREV_INSN (insn) : BB_END (bb));
2388       df_insn_rescan (set_insn);
2389       loop_optimizer_finalize ();
2390 
2391       if (!control_flow_insns.is_empty ())
2392 	{
2393 	  free_dominance_info (CDI_DOMINATORS);
2394 
2395 	  unsigned int i;
2396 	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2397 	    if (control_flow_insn_p (insn))
2398 	      {
2399 		/* Split the block after insn.  There will be a fallthru
2400 		   edge, which is OK so we keep it.  We have to create
2401 		   the exception edges ourselves.  */
2402 		bb = BLOCK_FOR_INSN (insn);
2403 		split_block (bb, insn);
2404 		rtl_make_eh_edge (NULL, bb, BB_END (bb));
2405 	      }
2406 	}
2407     }
2408 
2409   df_process_deferred_rescans ();
2410   df_clear_flags (DF_DEFER_INSN_RESCAN);
2411   bitmap_obstack_release (NULL);
2412   BITMAP_FREE (convert_bbs);
2413 
2414   timevar_pop (TV_MACH_DEP);
2415   return 0;
2416 }
2417 
2418 namespace {
2419 
2420 const pass_data pass_data_remove_partial_avx_dependency =
2421 {
2422   RTL_PASS, /* type */
2423   "rpad", /* name */
2424   OPTGROUP_NONE, /* optinfo_flags */
2425   TV_MACH_DEP, /* tv_id */
2426   0, /* properties_required */
2427   0, /* properties_provided */
2428   0, /* properties_destroyed */
2429   0, /* todo_flags_start */
2430   0, /* todo_flags_finish */
2431 };
2432 
2433 class pass_remove_partial_avx_dependency : public rtl_opt_pass
2434 {
2435 public:
2436   pass_remove_partial_avx_dependency (gcc::context *ctxt)
2437     : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
2438   {}
2439 
2440   /* opt_pass methods: */
2441   virtual bool gate (function *)
2442     {
2443       return (TARGET_AVX
2444 	      && TARGET_SSE_PARTIAL_REG_DEPENDENCY
2445 	      && TARGET_SSE_MATH
2446 	      && optimize
2447 	      && optimize_function_for_speed_p (cfun));
2448     }
2449 
2450   virtual unsigned int execute (function *)
2451     {
2452       return remove_partial_avx_dependency ();
2453     }
2454 }; // class pass_remove_partial_avx_dependency
2455 
2456 } // anon namespace
2457 
2458 rtl_opt_pass *
2459 make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
2460 {
2461   return new pass_remove_partial_avx_dependency (ctxt);
2462 }
2463 
2464 /* This compares the priority of target features in function DECL1
2465    and DECL2.  It returns positive value if DECL1 is higher priority,
2466    negative value if DECL2 is higher priority and 0 if they are the
2467    same.  */
2468 
2469 int
2470 ix86_compare_version_priority (tree decl1, tree decl2)
2471 {
2472   unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
2473   unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
2474 
2475   return (int)priority1 - (int)priority2;
2476 }
2477 
2478 /* V1 and V2 point to function versions with different priorities
2479    based on the target ISA.  This function compares their priorities.  */
2480 
2481 static int
2482 feature_compare (const void *v1, const void *v2)
2483 {
2484   typedef struct _function_version_info
2485     {
2486       tree version_decl;
2487       tree predicate_chain;
2488       unsigned int dispatch_priority;
2489     } function_version_info;
2490 
2491   const function_version_info c1 = *(const function_version_info *)v1;
2492   const function_version_info c2 = *(const function_version_info *)v2;
2493   return (c2.dispatch_priority - c1.dispatch_priority);
2494 }
2495 
2496 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
2497    to return a pointer to VERSION_DECL if the outcome of the expression
2498    formed by PREDICATE_CHAIN is true.  This function will be called during
2499    version dispatch to decide which function version to execute.  It returns
2500    the basic block at the end, to which more conditions can be added.  */
2501 
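/* Roughly, for a version guarded by one predicate the code generated
   below has the shape

	cond = <predicate_decl> (<predicate_arg>);
	if (cond > 0)
	  return (void *) &<version_decl>;

   and otherwise falls through to the next version's condition; several
   predicates are combined through MIN_EXPR, so the combined value is
   nonzero only if every predicate holds.  */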
2502 static basic_block
2503 add_condition_to_bb (tree function_decl, tree version_decl,
2504 		     tree predicate_chain, basic_block new_bb)
2505 {
2506   gimple *return_stmt;
2507   tree convert_expr, result_var;
2508   gimple *convert_stmt;
2509   gimple *call_cond_stmt;
2510   gimple *if_else_stmt;
2511 
2512   basic_block bb1, bb2, bb3;
2513   edge e12, e23;
2514 
2515   tree cond_var, and_expr_var = NULL_TREE;
2516   gimple_seq gseq;
2517 
2518   tree predicate_decl, predicate_arg;
2519 
2520   push_cfun (DECL_STRUCT_FUNCTION (function_decl));
2521 
2522   gcc_assert (new_bb != NULL);
2523   gseq = bb_seq (new_bb);
2524 
2525 
2526   convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
2527 	     		 build_fold_addr_expr (version_decl));
2528   result_var = create_tmp_var (ptr_type_node);
2529   convert_stmt = gimple_build_assign (result_var, convert_expr);
2530   return_stmt = gimple_build_return (result_var);
2531 
2532   if (predicate_chain == NULL_TREE)
2533     {
2534       gimple_seq_add_stmt (&gseq, convert_stmt);
2535       gimple_seq_add_stmt (&gseq, return_stmt);
2536       set_bb_seq (new_bb, gseq);
2537       gimple_set_bb (convert_stmt, new_bb);
2538       gimple_set_bb (return_stmt, new_bb);
2539       pop_cfun ();
2540       return new_bb;
2541     }
2542 
2543   while (predicate_chain != NULL)
2544     {
2545       cond_var = create_tmp_var (integer_type_node);
2546       predicate_decl = TREE_PURPOSE (predicate_chain);
2547       predicate_arg = TREE_VALUE (predicate_chain);
2548       call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
2549       gimple_call_set_lhs (call_cond_stmt, cond_var);
2550 
2551       gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
2552       gimple_set_bb (call_cond_stmt, new_bb);
2553       gimple_seq_add_stmt (&gseq, call_cond_stmt);
2554 
2555       predicate_chain = TREE_CHAIN (predicate_chain);
2556 
2557       if (and_expr_var == NULL)
2558         and_expr_var = cond_var;
2559       else
2560 	{
2561 	  gimple *assign_stmt;
2562 	  /* Use MIN_EXPR to check whether any integer is zero:
2563 	     and_expr_var = min_expr <cond_var, and_expr_var>  */
2564 	  assign_stmt = gimple_build_assign (and_expr_var,
2565 			  build2 (MIN_EXPR, integer_type_node,
2566 				  cond_var, and_expr_var));
2567 
2568 	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
2569 	  gimple_set_bb (assign_stmt, new_bb);
2570 	  gimple_seq_add_stmt (&gseq, assign_stmt);
2571 	}
2572     }
2573 
2574   if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
2575 	  		            integer_zero_node,
2576 				    NULL_TREE, NULL_TREE);
2577   gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
2578   gimple_set_bb (if_else_stmt, new_bb);
2579   gimple_seq_add_stmt (&gseq, if_else_stmt);
2580 
2581   gimple_seq_add_stmt (&gseq, convert_stmt);
2582   gimple_seq_add_stmt (&gseq, return_stmt);
2583   set_bb_seq (new_bb, gseq);
2584 
2585   bb1 = new_bb;
2586   e12 = split_block (bb1, if_else_stmt);
2587   bb2 = e12->dest;
2588   e12->flags &= ~EDGE_FALLTHRU;
2589   e12->flags |= EDGE_TRUE_VALUE;
2590 
2591   e23 = split_block (bb2, return_stmt);
2592 
2593   gimple_set_bb (convert_stmt, bb2);
2594   gimple_set_bb (return_stmt, bb2);
2595 
2596   bb3 = e23->dest;
2597   make_edge (bb1, bb3, EDGE_FALSE_VALUE);
2598 
2599   remove_edge (e23);
2600   make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
2601 
2602   pop_cfun ();
2603 
2604   return bb3;
2605 }
2606 
2607 /* This function generates the dispatch function for
2608    multi-versioned functions.  DISPATCH_DECL is the function which will
2609    contain the dispatch logic.  FNDECLS are the function choices for
2610    dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
2611    in DISPATCH_DECL in which the dispatch code is generated.  */
2612 
2613 static int
2614 dispatch_function_versions (tree dispatch_decl,
2615 			    void *fndecls_p,
2616 			    basic_block *empty_bb)
2617 {
2618   tree default_decl;
2619   gimple *ifunc_cpu_init_stmt;
2620   gimple_seq gseq;
2621   int ix;
2622   tree ele;
2623   vec<tree> *fndecls;
2624   unsigned int num_versions = 0;
2625   unsigned int actual_versions = 0;
2626   unsigned int i;
2627 
2628   struct _function_version_info
2629     {
2630       tree version_decl;
2631       tree predicate_chain;
2632       unsigned int dispatch_priority;
2633     }*function_version_info;
2634 
2635   gcc_assert (dispatch_decl != NULL
2636 	      && fndecls_p != NULL
2637 	      && empty_bb != NULL);
2638 
2639   /* fndecls_p is actually a vector.  */
2640   fndecls = static_cast<vec<tree> *> (fndecls_p);
2641 
2642   /* At least one more version other than the default.  */
2643   num_versions = fndecls->length ();
2644   gcc_assert (num_versions >= 2);
2645 
2646   function_version_info = (struct _function_version_info *)
2647     XNEWVEC (struct _function_version_info, (num_versions - 1));
2648 
2649   /* The first version in the vector is the default decl.  */
2650   default_decl = (*fndecls)[0];
2651 
2652   push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
2653 
2654   gseq = bb_seq (*empty_bb);
2655   /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
2656      constructors, so explicitly call __builtin_cpu_init here.  */
2657   ifunc_cpu_init_stmt
2658     = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
2659   gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
2660   gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
2661   set_bb_seq (*empty_bb, gseq);
2662 
2663   pop_cfun ();
2664 
2665 
2666   for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
2667     {
2668       tree version_decl = ele;
2669       tree predicate_chain = NULL_TREE;
2670       unsigned int priority;
2671       /* Get attribute string, parse it and find the right predicate decl.
2672          The predicate function could be a lengthy combination of many
2673 	 features, like arch-type and various isa-variants.  */
2674       priority = get_builtin_code_for_version (version_decl,
2675 	 			               &predicate_chain);
2676 
2677       if (predicate_chain == NULL_TREE)
2678 	continue;
2679 
2680       function_version_info [actual_versions].version_decl = version_decl;
2681       function_version_info [actual_versions].predicate_chain
2682 	 = predicate_chain;
2683       function_version_info [actual_versions].dispatch_priority = priority;
2684       actual_versions++;
2685     }
2686 
2687   /* Sort the versions according to descending order of dispatch priority.  The
2688      priority is based on the ISA.  This is not a perfect solution.  There
2689      could still be ambiguity.  If more than one function version is suitable
2690      to execute, which one should be dispatched?  In the future, allow the
2691      user to specify a dispatch priority next to the version.  */
2692   qsort (function_version_info, actual_versions,
2693          sizeof (struct _function_version_info), feature_compare);
2694 
2695   for  (i = 0; i < actual_versions; ++i)
2696     *empty_bb = add_condition_to_bb (dispatch_decl,
2697 				     function_version_info[i].version_decl,
2698 				     function_version_info[i].predicate_chain,
2699 				     *empty_bb);
2700 
2701   /* Dispatch the default version at the end.  */
2702   *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
2703 				   NULL, *empty_bb);
2704 
2705   free (function_version_info);
2706   return 0;
2707 }
2708 
2709 /* This function changes the assembler name for functions that are
2710    versions.  If DECL is a function version and has a "target"
2711    attribute, it appends the attribute string to its assembler name.  */
2712 
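/* For example, a hypothetical function foo with a version declared as
   __attribute__ ((target ("avx2"))) keeps "foo" as the assembler name of
   its default version, while the specialized version is renamed below to
   "foo.<sorted attribute string>", which keeps the versions distinct in
   the object file and in the resolver.  */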
2713 static tree
2714 ix86_mangle_function_version_assembler_name (tree decl, tree id)
2715 {
2716   tree version_attr;
2717   const char *orig_name, *version_string;
2718   char *attr_str, *assembler_name;
2719 
2720   if (DECL_DECLARED_INLINE_P (decl)
2721       && lookup_attribute ("gnu_inline",
2722 			   DECL_ATTRIBUTES (decl)))
2723     error_at (DECL_SOURCE_LOCATION (decl),
2724 	      "function versions cannot be marked as %<gnu_inline%>,"
2725 	      " bodies have to be generated");
2726 
2727   if (DECL_VIRTUAL_P (decl)
2728       || DECL_VINDEX (decl))
2729     sorry ("virtual function multiversioning not supported");
2730 
2731   version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
2732 
2733   /* target attribute string cannot be NULL.  */
2734   gcc_assert (version_attr != NULL_TREE);
2735 
2736   orig_name = IDENTIFIER_POINTER (id);
2737   version_string
2738     = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE (version_attr)));
2739 
2740   if (strcmp (version_string, "default") == 0)
2741     return id;
2742 
2743   attr_str = sorted_attr_string (TREE_VALUE (version_attr));
2744   assembler_name = XNEWVEC (char, strlen (orig_name) + strlen (attr_str) + 2);
2745 
2746   sprintf (assembler_name, "%s.%s", orig_name, attr_str);
2747 
2748   /* Allow assembler name to be modified if already set.  */
2749   if (DECL_ASSEMBLER_NAME_SET_P (decl))
2750     SET_DECL_RTL (decl, NULL);
2751 
2752   tree ret = get_identifier (assembler_name);
2753   XDELETEVEC (attr_str);
2754   XDELETEVEC (assembler_name);
2755   return ret;
2756 }
2757 
2758 tree
2759 ix86_mangle_decl_assembler_name (tree decl, tree id)
2760 {
2761   /* For function version, add the target suffix to the assembler name.  */
2762   if (TREE_CODE (decl) == FUNCTION_DECL
2763       && DECL_FUNCTION_VERSIONED (decl))
2764     id = ix86_mangle_function_version_assembler_name (decl, id);
2765 #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
2766   id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
2767 #endif
2768 
2769   return id;
2770 }
2771 
2772 /* Make a dispatcher declaration for the multi-versioned function DECL.
2773    Calls to DECL function will be replaced with calls to the dispatcher
2774    by the front-end.  Returns the decl of the dispatcher function.  */
2775 
2776 tree
2777 ix86_get_function_versions_dispatcher (void *decl)
2778 {
2779   tree fn = (tree) decl;
2780   struct cgraph_node *node = NULL;
2781   struct cgraph_node *default_node = NULL;
2782   struct cgraph_function_version_info *node_v = NULL;
2783   struct cgraph_function_version_info *first_v = NULL;
2784 
2785   tree dispatch_decl = NULL;
2786 
2787   struct cgraph_function_version_info *default_version_info = NULL;
2788 
2789   gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
2790 
2791   node = cgraph_node::get (fn);
2792   gcc_assert (node != NULL);
2793 
2794   node_v = node->function_version ();
2795   gcc_assert (node_v != NULL);
2796 
2797   if (node_v->dispatcher_resolver != NULL)
2798     return node_v->dispatcher_resolver;
2799 
2800   /* Find the default version and make it the first node.  */
2801   first_v = node_v;
2802   /* Go to the beginning of the chain.  */
2803   while (first_v->prev != NULL)
2804     first_v = first_v->prev;
2805   default_version_info = first_v;
2806   while (default_version_info != NULL)
2807     {
2808       if (is_function_default_version
2809 	    (default_version_info->this_node->decl))
2810         break;
2811       default_version_info = default_version_info->next;
2812     }
2813 
2814   /* If there is no default node, just return NULL.  */
2815   if (default_version_info == NULL)
2816     return NULL;
2817 
2818   /* Make default info the first node.  */
2819   if (first_v != default_version_info)
2820     {
2821       default_version_info->prev->next = default_version_info->next;
2822       if (default_version_info->next)
2823         default_version_info->next->prev = default_version_info->prev;
2824       first_v->prev = default_version_info;
2825       default_version_info->next = first_v;
2826       default_version_info->prev = NULL;
2827     }
2828 
2829   default_node = default_version_info->this_node;
2830 
2831 #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
2832   if (targetm.has_ifunc_p ())
2833     {
2834       struct cgraph_function_version_info *it_v = NULL;
2835       struct cgraph_node *dispatcher_node = NULL;
2836       struct cgraph_function_version_info *dispatcher_version_info = NULL;
2837 
2838       /* Right now, the dispatching is done via ifunc.  */
2839       dispatch_decl = make_dispatcher_decl (default_node->decl);
2840 
2841       dispatcher_node = cgraph_node::get_create (dispatch_decl);
2842       gcc_assert (dispatcher_node != NULL);
2843       dispatcher_node->dispatcher_function = 1;
2844       dispatcher_version_info
2845 	= dispatcher_node->insert_new_function_version ();
2846       dispatcher_version_info->next = default_version_info;
2847       dispatcher_node->definition = 1;
2848 
2849       /* Set the dispatcher for all the versions.  */
2850       it_v = default_version_info;
2851       while (it_v != NULL)
2852 	{
2853 	  it_v->dispatcher_resolver = dispatch_decl;
2854 	  it_v = it_v->next;
2855 	}
2856     }
2857   else
2858 #endif
2859     {
2860       error_at (DECL_SOURCE_LOCATION (default_node->decl),
2861 		"multiversioning needs %<ifunc%> which is not supported "
2862 		"on this target");
2863     }
2864 
2865   return dispatch_decl;
2866 }
2867 
2868 /* Make the resolver function decl to dispatch the versions of
2869    a multi-versioned function,  DEFAULT_DECL.  IFUNC_ALIAS_DECL is
2870    ifunc alias that will point to the created resolver.  Create an
2871    empty basic block in the resolver and store the pointer in
2872    EMPTY_BB.  Return the decl of the resolver function.  */
2873 
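/* Sketch of the layout produced here for a hypothetical function foo:
   the resolver becomes a separate local (or comdat) function returning
   void *, named via clone_function_name (roughly "foo.resolver"), and
   the dispatcher symbol is emitted as an IFUNC alias of it, i.e.

	foo:		ifunc, resolved by foo.resolver
	foo.resolver:	returns the address of the selected version

   so the dynamic linker invokes the resolver once and binds foo to the
   chosen version.  */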
2874 static tree
2875 make_resolver_func (const tree default_decl,
2876 		    const tree ifunc_alias_decl,
2877 		    basic_block *empty_bb)
2878 {
2879   tree decl, type, t;
2880 
2881   /* Create resolver function name based on default_decl.  */
2882   tree decl_name = clone_function_name (default_decl, "resolver");
2883   const char *resolver_name = IDENTIFIER_POINTER (decl_name);
2884 
2885   /* The resolver function should return a (void *). */
2886   type = build_function_type_list (ptr_type_node, NULL_TREE);
2887 
2888   decl = build_fn_decl (resolver_name, type);
2889   SET_DECL_ASSEMBLER_NAME (decl, decl_name);
2890 
2891   DECL_NAME (decl) = decl_name;
2892   TREE_USED (decl) = 1;
2893   DECL_ARTIFICIAL (decl) = 1;
2894   DECL_IGNORED_P (decl) = 1;
2895   TREE_PUBLIC (decl) = 0;
2896   DECL_UNINLINABLE (decl) = 1;
2897 
2898   /* Resolver is not external, body is generated.  */
2899   DECL_EXTERNAL (decl) = 0;
2900   DECL_EXTERNAL (ifunc_alias_decl) = 0;
2901 
2902   DECL_CONTEXT (decl) = NULL_TREE;
2903   DECL_INITIAL (decl) = make_node (BLOCK);
2904   DECL_STATIC_CONSTRUCTOR (decl) = 0;
2905 
2906   if (DECL_COMDAT_GROUP (default_decl)
2907       || TREE_PUBLIC (default_decl))
2908     {
2909       /* In this case, each translation unit with a call to this
2910 	 versioned function will put out a resolver.  Ensure it
2911 	 is comdat to keep just one copy.  */
2912       DECL_COMDAT (decl) = 1;
2913       make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
2914     }
2915   else
2916     TREE_PUBLIC (ifunc_alias_decl) = 0;
2917 
2918   /* Build result decl and add to function_decl. */
2919   t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
2920   DECL_CONTEXT (t) = decl;
2921   DECL_ARTIFICIAL (t) = 1;
2922   DECL_IGNORED_P (t) = 1;
2923   DECL_RESULT (decl) = t;
2924 
2925   gimplify_function_tree (decl);
2926   push_cfun (DECL_STRUCT_FUNCTION (decl));
2927   *empty_bb = init_lowered_empty_function (decl, false,
2928 					   profile_count::uninitialized ());
2929 
2930   cgraph_node::add_new_function (decl, true);
2931   symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
2932 
2933   pop_cfun ();
2934 
2935   gcc_assert (ifunc_alias_decl != NULL);
2936   /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
2937   DECL_ATTRIBUTES (ifunc_alias_decl)
2938     = make_attribute ("ifunc", resolver_name,
2939 		      DECL_ATTRIBUTES (ifunc_alias_decl));
2940 
2941   /* Create the alias for dispatch to resolver here.  */
2942   cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
2943   return decl;
2944 }
2945 
2946 /* Generate the dispatching code body to dispatch multi-versioned function
2947    DECL.  The target hook is called to process the "target" attributes and
2948    provide the code to dispatch the right function at run-time.  NODE points
2949    to the dispatcher decl whose body will be created.  */
2950 
2951 tree
2952 ix86_generate_version_dispatcher_body (void *node_p)
2953 {
2954   tree resolver_decl;
2955   basic_block empty_bb;
2956   tree default_ver_decl;
2957   struct cgraph_node *versn;
2958   struct cgraph_node *node;
2959 
2960   struct cgraph_function_version_info *node_version_info = NULL;
2961   struct cgraph_function_version_info *versn_info = NULL;
2962 
2963   node = (cgraph_node *)node_p;
2964 
2965   node_version_info = node->function_version ();
2966   gcc_assert (node->dispatcher_function
2967 	      && node_version_info != NULL);
2968 
2969   if (node_version_info->dispatcher_resolver)
2970     return node_version_info->dispatcher_resolver;
2971 
2972   /* The first version in the chain corresponds to the default version.  */
2973   default_ver_decl = node_version_info->next->this_node->decl;
2974 
2975   /* node is going to be an alias, so remove the finalized bit.  */
2976   node->definition = false;
2977 
2978   resolver_decl = make_resolver_func (default_ver_decl,
2979 				      node->decl, &empty_bb);
2980 
2981   node_version_info->dispatcher_resolver = resolver_decl;
2982 
2983   push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
2984 
2985   auto_vec<tree, 2> fn_ver_vec;
2986 
2987   for (versn_info = node_version_info->next; versn_info;
2988        versn_info = versn_info->next)
2989     {
2990       versn = versn_info->this_node;
2991       /* Check for virtual functions here again, as by this time it should
2992 	 have been determined if this function needs a vtable index or
2993 	 not.  This happens for methods in derived classes that override
2994 	 virtual methods in base classes but are not explicitly marked as
2995 	 virtual.  */
2996       if (DECL_VINDEX (versn->decl))
2997 	sorry ("virtual function multiversioning not supported");
2998 
2999       fn_ver_vec.safe_push (versn->decl);
3000     }
3001 
3002   dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
3003   cgraph_edge::rebuild_edges ();
3004   pop_cfun ();
3005   return resolver_decl;
3006 }
3007 
3008 
3009