/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"

/* Return the maximum number of instructions a cpu can issue.  */
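/* This value is returned through the TARGET_SCHED_ISSUE_RATE hook.  For
   example, the 4-wide cores below (Haswell, Zen) can issue up to four
   instructions per cycle, while the dual-pipeline Pentium returns 2.  */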

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
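/* An illustrative (hypothetical) pair that this predicate matches:

       mov  %ebx, %eax          <- SET_INSN writes %eax
       mov  8(%eax), %ecx       <- USE_INSN forms its address from %eax

   On in-order cores such as the original Pentium this is the classic
   Address Generation Interlock (AGI) situation.  */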

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
	rtx addr = XEXP (recog_data.operand[i], 0);
	if (modified_in_p (addr, set_insn) != 0)
	  {
	    /* No AGI stall if SET_INSN is a push or pop and USE_INSN
	       has SP based memory (unless index reg is modified in a pop).  */
	    rtx set = single_set (set_insn);
	    if (set
		&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
		    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
	      {
		struct ix86_address parts;
		if (ix86_decompose_address (addr, &parts)
		    && parts.base == stack_pointer_rtx
		    && (parts.index == NULL_RTX
			|| MEM_P (SET_DEST (set))
			|| !modified_in_p (parts.index, set_insn)))
		  return false;
	      }
	    return true;
	  }
	return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */
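/* An illustrative (hypothetical) pair:

       cmpl $0, %eax            <- DEP_INSN sets the flags register
       sete %al                 <- INSN (TYPE_SETCC) reads only the flags

   For such pairs the Pentium case in ix86_adjust_cost drops the
   dependence cost to zero, since compares pair with setcc/jumps.  */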

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn,
		      enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
	   && XVECLEN (PATTERN (dep_insn), 0) == 2
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
	return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
	{
	case 'e':
	  if (exact_dependency_1 (addr, XEXP (insn, i)))
	    return true;
	  break;
	case 'E':
	  for (j = 0; j < XVECLEN (insn, i); j++)
	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
	      return true;
	  break;
	}
    }
  return false;
}

/* Return true if there exists an exact dependency between STORE and LOAD,
   i.e. the same memory address is used in both.  */
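/* An illustrative (hypothetical) pair:

       movw %ax, 4(%rsp)        <- STORE
       movw 4(%rsp), %bx        <- LOAD from the identical address

   Such a load completes through store forwarding, which the
   Silvermont-class cost adjustment below penalizes for narrow
   (QImode/HImode) accesses.  */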
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
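/* A sketch of the effect (latencies are per-core and only illustrative):
   when DEP_INSN is a load feeding INSN as plain data, the out-of-order
   core can overlap the load with earlier work, so several cases below
   subtract the load latency from COST; when INSN instead needs the
   loaded value to form an address (ix86_agi_dependent), the full COST
   stands.  */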

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
		  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* An Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
	{
	  rtx addr = PATTERN (insn);

	  if (GET_CODE (addr) == PARALLEL)
	    addr = XVECEXP (addr, 0, 0);

	  gcc_assert (GET_CODE (addr) == SET);

	  addr = SET_SRC (addr);
	  if (modified_in_p (addr, dep_insn))
	    cost += 1;
	}
      else if (ix86_agi_dependent (dep_insn, insn))
	cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
	cost = 0;

      /* Floating point stores require the value to be ready one cycle
	 earlier.  */
      if (insn_type == TYPE_FMOV
	  && get_attr_memory (insn) == MEMORY_STORE
	  && !ix86_agi_dependent (dep_insn, insn))
	cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
	  && (set = single_set (dep_insn)) != NULL_RTX
	  && (set2 = single_set (insn)) != NULL_RTX
	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
	  && MEM_P (SET_DEST (set2)))
	cost += 1;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 provided the previous instruction is not needed to compute the
	 address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 1)
	    cost--;
	}
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
	 the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 provided the previous instruction is not needed to compute the
	 address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 2)
	    cost -= 2;
	  else
	    cost = 1;
	}
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 provided the previous instruction is not needed to compute the
	 address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost = 3;

	  /* Because of the difference between the length of integer and
	     floating unit pipeline preparation stages, the memory operands
	     for floating point are cheaper.

	     ??? For Athlon the difference is most probably 2.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 3;
	  else
	    loadcost = TARGET_ATHLON ? 2 : 0;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_ZNVER1:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 provided the previous instruction is not needed to compute the
	 address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost;

	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 4;
	  else
	    loadcost = 7;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Model the ability of the reorder buffer to hide the latency of a
	 load by executing it in parallel with the previous instruction,
	 provided the previous instruction is not needed to compute the
	 address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  if (cost >= 4)
	    cost -= 4;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase the cost of ld/st for short int types only
		     because of the store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase the cost of the store/load insn if an
			 exact dependence exists and it is a load insn.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as
     the number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */
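/* Illustrative (hypothetical) pairs:

       fusible:      cmpl $5, %eax    ; jne .L2
       not fusible:  cmpl $5, (%rax)  ; jne .L2   (MEM-IMM compare)

   The precise rules differ between microarchitectures; see the checks
   below.  */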

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  if (get_attr_type (condgen) != TYPE_TEST
      && get_attr_type (condgen) != TYPE_ICMP
      && get_attr_type (condgen) != TYPE_INCDEC
      && get_attr_type (condgen) != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
	  && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
	  || ccode == GT
	  || ccode == LE
	  || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (get_attr_type (condgen) == TYPE_TEST
      || get_attr_type (condgen) == TYPE_ICMP)
    return true;

  /* What follows handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an alu op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (get_attr_type (condgen) == TYPE_INCDEC
      && (ccode == GEU
	  || ccode == GTU
	  || ccode == LEU
	  || ccode == LTU))
    return false;

  return true;
}