xref: /netbsd-src/external/gpl3/gcc/dist/gcc/config/i386/x86-tune-sched.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* Scheduler hooks for IA-32 which implement CPU specific logic.
2    Copyright (C) 1988-2022 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3, or (at your option)
9 any later version.
10 
11 GCC is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #define IN_TARGET_CODE 1
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "cfghooks.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "insn-config.h"
32 #include "insn-attr.h"
33 #include "insn-opinit.h"
34 #include "recog.h"
35 
36 /* Return the maximum number of instructions a cpu can issue.  */
37 
38 int
39 ix86_issue_rate (void)
40 {
41   switch (ix86_tune)
42     {
43     case PROCESSOR_PENTIUM:
44     case PROCESSOR_LAKEMONT:
45     case PROCESSOR_BONNELL:
46     case PROCESSOR_SILVERMONT:
47     case PROCESSOR_KNL:
48     case PROCESSOR_KNM:
49     case PROCESSOR_INTEL:
50     case PROCESSOR_K6:
51     case PROCESSOR_BTVER2:
52     case PROCESSOR_PENTIUM4:
53     case PROCESSOR_NOCONA:
54       return 2;
55 
56     case PROCESSOR_PENTIUMPRO:
57     case PROCESSOR_ATHLON:
58     case PROCESSOR_K8:
59     case PROCESSOR_AMDFAM10:
60     case PROCESSOR_BTVER1:
61       return 3;
62 
63     case PROCESSOR_BDVER1:
64     case PROCESSOR_BDVER2:
65     case PROCESSOR_BDVER3:
66     case PROCESSOR_BDVER4:
67     case PROCESSOR_ZNVER1:
68     case PROCESSOR_ZNVER2:
69     case PROCESSOR_ZNVER3:
70     case PROCESSOR_ZNVER4:
71     case PROCESSOR_CORE2:
72     case PROCESSOR_NEHALEM:
73     case PROCESSOR_SANDYBRIDGE:
74     case PROCESSOR_HASWELL:
75     case PROCESSOR_TREMONT:
76     case PROCESSOR_ALDERLAKE:
77     case PROCESSOR_GENERIC:
78       return 4;
79 
80     default:
81       return 1;
82     }
83 }
84 
85 /* Return true iff USE_INSN has a memory address with operands set by
86    SET_INSN.  */
87 
88 bool
89 ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
90 {
91   int i;
92   extract_insn_cached (use_insn);
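  /* Look for a memory operand of USE_INSN and check whether its address
     depends on something that SET_INSN writes.  */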
93   for (i = recog_data.n_operands - 1; i >= 0; --i)
94     if (MEM_P (recog_data.operand[i]))
95       {
96 	rtx addr = XEXP (recog_data.operand[i], 0);
97 	if (modified_in_p (addr, set_insn) != 0)
98 	  {
99 	    /* No AGI stall if SET_INSN is a push or pop and USE_INSN
100 	       has SP based memory (unless index reg is modified in a pop).  */
101 	    rtx set = single_set (set_insn);
102 	    if (set
103 		&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
104 		    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
105 	      {
106 		struct ix86_address parts;
107 		if (ix86_decompose_address (addr, &parts)
108 		    && parts.base == stack_pointer_rtx
109 		    && (parts.index == NULL_RTX
110 			|| MEM_P (SET_DEST (set))
111 			|| !modified_in_p (parts.index, set_insn)))
112 		  return false;
113 	      }
114 	    return true;
115 	  }
116 	return false;
117       }
118   return false;
119 }
120 
121 /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
122    set by DEP_INSN and nothing else set by DEP_INSN.  */
123 
124 static bool
125 ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
126 {
127   rtx set, set2;
128 
129   /* Simplify the test for uninteresting insns.  */
130   if (insn_type != TYPE_SETCC
131       && insn_type != TYPE_ICMOV
132       && insn_type != TYPE_FCMOV
133       && insn_type != TYPE_IBR)
134     return false;
135 
136   if ((set = single_set (dep_insn)) != 0)
137     {
138       set = SET_DEST (set);
139       set2 = NULL_RTX;
140     }
141   else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
142 	   && XVECLEN (PATTERN (dep_insn), 0) == 2
143 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
144 	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
145     {
146       set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
147       set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
148     }
149   else
150     return false;
151 
152   if (!REG_P (set) || REGNO (set) != FLAGS_REG)
153     return false;
154 
155   /* This test is true if the dependent insn reads the flags but
156      not any other potentially set register.  */
157   if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
158     return false;
159 
160   if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
161     return false;
162 
163   return true;
164 }
165 
166 /* Helper function for exact_store_load_dependency.
167    Return true if ADDR is found in INSN.  */
168 static bool
169 exact_dependency_1 (rtx addr, rtx insn)
170 {
171   enum rtx_code code;
172   const char *format_ptr;
173   int i, j;
174 
175   code = GET_CODE (insn);
176   switch (code)
177     {
178     case MEM:
179       if (rtx_equal_p (addr, insn))
180 	return true;
181       break;
182     case REG:
183     CASE_CONST_ANY:
184     case SYMBOL_REF:
185     case CODE_LABEL:
186     case PC:
187     case EXPR_LIST:
188       return false;
189     default:
190       break;
191     }
192 
193   format_ptr = GET_RTX_FORMAT (code);
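  /* Walk the sub-rtxes of INSN: 'e' format letters are single expressions,
     'E' letters are vectors of expressions; recurse into each of them.  */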
194   for (i = 0; i < GET_RTX_LENGTH (code); i++)
195     {
196       switch (*format_ptr++)
197 	{
198 	case 'e':
199 	  if (exact_dependency_1 (addr, XEXP (insn, i)))
200 	    return true;
201 	  break;
202 	case 'E':
203 	  for (j = 0; j < XVECLEN (insn, i); j++)
204 	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
205 	      return true;
206 	  break;
207 	}
208     }
209   return false;
210 }
211 
212 /* Return true if there is an exact dependency between the store and the
213    load, i.e. the same memory address is used in both.  */
214 static bool
215 exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
216 {
217   rtx set1, set2;
218 
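  /* Both insns must be simple single SETs; the store's destination must be
     a MEM, and that same MEM must appear somewhere in the load's source.  */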
219   set1 = single_set (store);
220   if (!set1)
221     return false;
222   if (!MEM_P (SET_DEST (set1)))
223     return false;
224   set2 = single_set (load);
225   if (!set2)
226     return false;
227   if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
228     return true;
229   return false;
230 }
231 
232 
233 /* This function corrects the value of COST (latency) based on the relationship
234    between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
235    DW.  It should return the new value.
236 
237    On x86 CPUs this is most commonly used to model the fact that values of
238    registers used to compute the address of a memory operand need to be ready
239    earlier than values of registers used in the actual operation.  */
240 
241 int
242 ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
243 		  unsigned int)
244 {
245   enum attr_type insn_type, dep_insn_type;
246   enum attr_memory memory;
247   rtx set, set2;
248   int dep_insn_code_number;
249 
250   /* Anti and output dependencies have zero cost on all CPUs.  */
251   if (dep_type != 0)
252     return 0;
253 
254   dep_insn_code_number = recog_memoized (dep_insn);
255 
256   /* If we can't recognize the insns, we can't really do anything.  */
257   if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
258     return cost;
259 
260   insn_type = get_attr_type (insn);
261   dep_insn_type = get_attr_type (dep_insn);
262 
263   switch (ix86_tune)
264     {
265     case PROCESSOR_PENTIUM:
266     case PROCESSOR_LAKEMONT:
267       /* Address Generation Interlock adds a cycle of latency.  */
268       if (insn_type == TYPE_LEA)
269 	{
270 	  rtx addr = PATTERN (insn);
271 
272 	  if (GET_CODE (addr) == PARALLEL)
273 	    addr = XVECEXP (addr, 0, 0);
274 
275 	  gcc_assert (GET_CODE (addr) == SET);
276 
277 	  addr = SET_SRC (addr);
278 	  if (modified_in_p (addr, dep_insn))
279 	    cost += 1;
280 	}
281       else if (ix86_agi_dependent (dep_insn, insn))
282 	cost += 1;
283 
284       /* ??? Compares pair with jump/setcc.  */
285       if (ix86_flags_dependent (insn, dep_insn, insn_type))
286 	cost = 0;
287 
288       /* Floating point stores require value to be ready one cycle earlier.  */
289       if (insn_type == TYPE_FMOV
290 	  && get_attr_memory (insn) == MEMORY_STORE
291 	  && !ix86_agi_dependent (dep_insn, insn))
292 	cost += 1;
293       break;
294 
295     case PROCESSOR_PENTIUMPRO:
296       /* INT->FP conversion is expensive.  */
297       if (get_attr_fp_int_src (dep_insn))
298 	cost += 5;
299 
300       /* There is one cycle extra latency between an FP op and a store.  */
301       if (insn_type == TYPE_FMOV
302 	  && (set = single_set (dep_insn)) != NULL_RTX
303 	  && (set2 = single_set (insn)) != NULL_RTX
304 	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
305 	  && MEM_P (SET_DEST (set2)))
306 	cost += 1;
307 
308       memory = get_attr_memory (insn);
309 
310       /* Model the ability of the reorder buffer to hide the latency of a
311 	 load by executing it in parallel with the previous instruction when
312 	 the previous instruction is not needed to compute the address.  */
313       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
314 	  && !ix86_agi_dependent (dep_insn, insn))
315 	{
316 	  /* Claim that moves take one cycle, as the core can issue one load
317 	     at a time and the next load can start a cycle later.  */
318 	  if (dep_insn_type == TYPE_IMOV
319 	      || dep_insn_type == TYPE_FMOV)
320 	    cost = 1;
321 	  else if (cost > 1)
322 	    cost--;
323 	}
324       break;
325 
326     case PROCESSOR_K6:
327      /* The esp dependency is resolved before
328 	the instruction is really finished.  */
329       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
330 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
331 	return 1;
332 
333       /* INT->FP conversion is expensive.  */
334       if (get_attr_fp_int_src (dep_insn))
335 	cost += 5;
336 
337       memory = get_attr_memory (insn);
338 
339       /* Model the ability of the reorder buffer to hide the latency of a
340 	 load by executing it in parallel with the previous instruction when
341 	 the previous instruction is not needed to compute the address.  */
342       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
343 	  && !ix86_agi_dependent (dep_insn, insn))
344 	{
345 	  /* Claim that moves take one cycle, as the core can issue one load
346 	     at a time and the next load can start a cycle later.  */
347 	  if (dep_insn_type == TYPE_IMOV
348 	      || dep_insn_type == TYPE_FMOV)
349 	    cost = 1;
350 	  else if (cost > 2)
351 	    cost -= 2;
352 	  else
353 	    cost = 1;
354 	}
355       break;
356 
357     case PROCESSOR_AMDFAM10:
358     case PROCESSOR_BDVER1:
359     case PROCESSOR_BDVER2:
360     case PROCESSOR_BDVER3:
361     case PROCESSOR_BDVER4:
362     case PROCESSOR_BTVER1:
363     case PROCESSOR_BTVER2:
364       /* The stack engine allows push and pop insns to execute in parallel.  */
365       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
366 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
367 	return 0;
368       /* FALLTHRU */
369 
370     case PROCESSOR_ATHLON:
371     case PROCESSOR_K8:
372       memory = get_attr_memory (insn);
373 
374       /* Model the ability of the reorder buffer to hide the latency of a
375 	 load by executing it in parallel with the previous instruction when
376 	 the previous instruction is not needed to compute the address.  */
377       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
378 	  && !ix86_agi_dependent (dep_insn, insn))
379 	{
380 	  enum attr_unit unit = get_attr_unit (insn);
381 	  int loadcost = 3;
382 
383 	  /* Because of the difference between the length of integer and
384 	     floating unit pipeline preparation stages, the memory operands
385 	     for floating point are cheaper.
386 
387 	     ??? For Athlon the difference is most probably 2.  */
388 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
389 	    loadcost = 3;
390 	  else
391 	    loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;
392 
393 	  if (cost >= loadcost)
394 	    cost -= loadcost;
395 	  else
396 	    cost = 0;
397 	}
398       break;
399 
400     case PROCESSOR_ZNVER1:
401     case PROCESSOR_ZNVER2:
402     case PROCESSOR_ZNVER3:
403     case PROCESSOR_ZNVER4:
404       /* The stack engine allows push and pop insns to execute in parallel.  */
405       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
406 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
407 	return 0;
408 
409       memory = get_attr_memory (insn);
410 
411       /* Model the ability of the reorder buffer to hide the latency of a
412 	 load by executing it in parallel with the previous instruction when
413 	 the previous instruction is not needed to compute the address.  */
414       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
415 	  && !ix86_agi_dependent (dep_insn, insn))
416 	{
417 	  enum attr_unit unit = get_attr_unit (insn);
418 	  int loadcost;
419 
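	  /* The constants below assume an integer-unit load costs 4 cycles
	     and any other (FP/SSE) load costs 7; that much latency can be
	     hidden because DEP_INSN does not feed the address.  */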
420 	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
421 	    loadcost = 4;
422 	  else
423 	    loadcost = 7;
424 
425 	  if (cost >= loadcost)
426 	    cost -= loadcost;
427 	  else
428 	    cost = 0;
429 	}
430       break;
431 
432     case PROCESSOR_CORE2:
433     case PROCESSOR_NEHALEM:
434     case PROCESSOR_SANDYBRIDGE:
435     case PROCESSOR_HASWELL:
436     case PROCESSOR_TREMONT:
437     case PROCESSOR_ALDERLAKE:
438     case PROCESSOR_GENERIC:
439       /* The stack engine allows push and pop insns to execute in parallel.  */
440       if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
441 	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
442 	return 0;
443 
444       memory = get_attr_memory (insn);
445 
446       /* Model the ability of the reorder buffer to hide the latency of a
447 	 load by executing it in parallel with the previous instruction when
448 	 the previous instruction is not needed to compute the address.  */
449       if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
450 	  && !ix86_agi_dependent (dep_insn, insn))
451 	{
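	  /* Assume up to 4 cycles of the load's latency can be hidden this
	     way.  */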
452 	  if (cost >= 4)
453 	    cost -= 4;
454 	  else
455 	    cost = 0;
456 	}
457       break;
458 
459     case PROCESSOR_SILVERMONT:
460     case PROCESSOR_KNL:
461     case PROCESSOR_KNM:
462     case PROCESSOR_INTEL:
463       if (!reload_completed)
464 	return cost;
465 
466       /* Increase cost of integer loads.  */
467       memory = get_attr_memory (dep_insn);
468       if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
469 	{
470 	  enum attr_unit unit = get_attr_unit (dep_insn);
471 	  if (unit == UNIT_INTEGER && cost == 1)
472 	    {
473 	      if (memory == MEMORY_LOAD)
474 		cost = 3;
475 	      else
476 		{
477 		  /* Increase cost of ld/st for short int types only
478 		     because of store forwarding issue.  */
479 		  rtx set = single_set (dep_insn);
480 		  if (set && (GET_MODE (SET_DEST (set)) == QImode
481 			      || GET_MODE (SET_DEST (set)) == HImode))
482 		    {
483 		      /* Increase the cost if an exact store/load dependence
484 			 exists and INSN is a load.  */
485 		      enum attr_memory insn_memory = get_attr_memory (insn);
486 		      if (insn_memory == MEMORY_LOAD
487 			  && exact_store_load_dependency (dep_insn, insn))
488 			cost = 3;
489 		    }
490 		}
491 	    }
492 	}
493 
494     default:
495       break;
496     }
497 
498   return cost;
499 }
500 
501 /* How many alternative schedules to try.  This should be as wide as the
502    scheduling freedom in the DFA, but no wider.  Making this value too
503    large results in extra work for the scheduler.  */
504 
505 int
506 ia32_multipass_dfa_lookahead (void)
507 {
508   /* Generally, we want haifa-sched:max_issue() to look ahead by as many
509      instructions as can be executed in a cycle, i.e.,
510      issue_rate.  */
511   if (reload_completed)
512     return ix86_issue_rate ();
513   /* Don't use lookahead for pre-reload schedule to save compile time.  */
514   return 0;
515 }
516 
517 /* Return true if target platform supports macro-fusion.  */
518 
519 bool
520 ix86_macro_fusion_p ()
521 {
522   return TARGET_FUSE_CMP_AND_BRANCH;
523 }
524 
525 /* Check whether the current microarchitecture supports macro fusion
526    for insn pair "CONDGEN + CONDJMP". Refer to
527    "Intel Architectures Optimization Reference Manual". */
528 
529 bool
530 ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
531 {
532   rtx src, dest;
533   enum rtx_code ccode;
534   rtx compare_set = NULL_RTX, test_if, cond;
535   rtx alu_set = NULL_RTX, addr = NULL_RTX;
536   enum attr_type condgen_type;
537 
538   if (!any_condjump_p (condjmp))
539     return false;
540 
541   unsigned int condreg1, condreg2;
542   rtx cc_reg_1;
543   targetm.fixed_condition_code_regs (&condreg1, &condreg2);
544   cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
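  /* For fusion, CONDJMP must read the condition-code register that CONDGEN
     writes; otherwise the two insns are unrelated.  */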
545   if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
546       || !condgen
547       || !modified_in_p (cc_reg_1, condgen))
548     return false;
549 
550   condgen_type = get_attr_type (condgen);
551   if (condgen_type == TYPE_MULTI
552       && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
553       && TARGET_FUSE_ALU_AND_BRANCH)
554     {
555       /* stack_protect_test_<mode> ends with a sub, which subtracts
556 	 a non-rip special memory operand from a GPR.  */
557       src = NULL_RTX;
558       alu_set = XVECEXP (PATTERN (condgen), 0, 1);
559       goto handle_stack_protect_test;
560     }
561   else if (condgen_type != TYPE_TEST
562 	   && condgen_type != TYPE_ICMP
563 	   && condgen_type != TYPE_INCDEC
564 	   && condgen_type != TYPE_ALU)
565     return false;
566 
567   compare_set = single_set (condgen);
568   if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
569     return false;
570 
571   if (compare_set == NULL_RTX)
572     {
573       int i;
574       rtx pat = PATTERN (condgen);
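      /* CONDGEN has multiple SETs (e.g. an ALU op that also sets the flags):
	 pick out the SET whose source is a COMPARE and treat any other SET
	 as the arithmetic part.  */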
575       for (i = 0; i < XVECLEN (pat, 0); i++)
576 	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
577 	  {
578 	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
579 	    if (GET_CODE (set_src) == COMPARE)
580 	      compare_set = XVECEXP (pat, 0, i);
581 	    else
582 	      alu_set = XVECEXP (pat, 0, i);
583 	  }
584     }
585   if (compare_set == NULL_RTX)
586     return false;
587   src = SET_SRC (compare_set);
588   if (GET_CODE (src) != COMPARE)
589     return false;
590 
591   /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
592      supported.  */
593   if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
594       || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
595     return false;
596 
597   /* No fusion for RIP-relative address.  */
598   if (MEM_P (XEXP (src, 0)))
599     addr = XEXP (XEXP (src, 0), 0);
600   else if (MEM_P (XEXP (src, 1)))
601     addr = XEXP (XEXP (src, 1), 0);
602 
603   if (addr)
604     {
605       ix86_address parts;
606       int ok = ix86_decompose_address (addr, &parts);
607       gcc_assert (ok);
608 
609       if (ix86_rip_relative_addr_p (&parts))
610 	return false;
611     }
612 
613  handle_stack_protect_test:
614   test_if = SET_SRC (pc_set (condjmp));
615   cond = XEXP (test_if, 0);
616   ccode = GET_CODE (cond);
617   /* Check whether the conditional jump uses the Sign or Overflow flags.  */
618   if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
619       && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
620     return false;
621 
622   /* Return true for TYPE_TEST and TYPE_ICMP.  */
623   if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
624     return true;
625 
626   /* The following handles the case of macro-fusion for alu + jmp.  */
627   if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
628     return false;
629 
630   /* No fusion for alu op with memory destination operand.  */
631   dest = SET_DEST (alu_set);
632   if (MEM_P (dest))
633     return false;
634 
635   /* Macro-fusion for inc/dec + unsigned conditional jump is not
636      supported.  */
637   if (condgen_type == TYPE_INCDEC
638       && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
639     return false;
640 
641   return true;
642 }
643 
644