/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
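  /* Scan the operands from the last one down; only the first memory
     operand found is examined, since both paths below return.  */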
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP-based memory (unless the index reg is modified in
               a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
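  /* Otherwise accept a PARALLEL of exactly two SETs, e.g. an arithmetic
     insn that sets both a result register and the flags.  */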
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

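  /* Recurse into every sub-expression of INSN, using the RTX format
     string to visit single expressions ('e') and vectors ('E').  */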
  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there exists an exact dependency between STORE and LOAD,
   i.e. the same memory address is used in both.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the
   relationship between INSN and DEP_INSN through a dependence of type
   DEP_TYPE, and strength DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be
   ready earlier than values of registers used in the actual operation.  */

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require the value to be ready one cycle
         earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Account for the ability of the reorder buffer to hide the latency
         of a load by executing it in parallel with the previous
         instruction, when the previous instruction is not needed to
         compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at
             a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Account for the ability of the reorder buffer to hide the latency
         of a load by executing it in parallel with the previous
         instruction, when the previous instruction is not needed to
         compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at
             a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Account for the ability of the reorder buffer to hide the latency
         of a load by executing it in parallel with the previous
         instruction, when the previous instruction is not needed to
         compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of the integer and
             floating point unit pipeline preparation stages, the memory
             operands for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Account for the ability of the reorder buffer to hide the latency
         of a load by executing it in parallel with the previous
         instruction, when the previous instruction is not needed to
         compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

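          /* The load latency that can be hidden is taken to be 4 cycles
             when the dependent insn executes in the integer unit and
             7 cycles for FP/vector consumers.  */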
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Account for the ability of the reorder buffer to hide the latency
         of a load by executing it in parallel with the previous
         instruction, when the previous instruction is not needed to
         compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase the cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only
                     because of the store-forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost of the store/load insn if an
                         exact dependence exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as
     the number of instructions that can be executed in a cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

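  /* The conditional jump must read the fixed condition-code register and
     CONDGEN must set it, otherwise the pair cannot fuse.  */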
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

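  /* When CONDGEN is a flag-setting ALU insn, its pattern is a PARALLEL
     containing both the COMPARE that sets the flags and the SET of the
     ALU result; pick the two apart.  */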
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative addresses.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for ALU + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an ALU op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}