/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"
/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}
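
/* Note: a minimal sketch of how this hook is assumed to be wired into the
   target vector (the actual registration lives in i386.c, not in this file):

     #undef TARGET_SCHED_ISSUE_RATE
     #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate

   The haifa scheduler then issues at most this many instructions per
   simulated cycle.  */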

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}
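
/* Illustrative example (AT&T syntax, hypothetical register choice) of the
   address-generation dependence detected above:

     movl %ebx, %eax        # SET_INSN writes %eax
     movl (%eax), %ecx      # USE_INSN needs %eax to form the address

   On in-order Pentium/Lakemont this pairing incurs an AGI stall; a push or
   pop adjusting %esp followed by an %esp-based access is the exempted
   case.  */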

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
   set by DEP_INSN and nothing else that DEP_INSN sets.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}
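
/* Example of the pattern the check above is after (assumed insns):

     cmpl %eax, %ebx        # DEP_INSN sets the flags register
     jne  .L2               # INSN reads only the flags

   When INSN depends on DEP_INSN solely through the flags, the Pentium
   case in ix86_adjust_cost below treats the pair as issuing together and
   drops the latency to zero.  */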

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}
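
/* The walk above recurses through every 'e' (single rtx) and 'E' (rtx
   vector) slot of INSN, so for a store destination such as

     (set (mem:HI (plus:SI (reg:SI sp) (const_int 4))) (reg:HI ax))

   it reports a match whenever a structurally identical MEM appears anywhere
   inside the load's SET_SRC.  (RTL shown only as an illustration.)  */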

/* Return true if there is an exact dependency between STORE and LOAD, i.e.
   the same memory address is used in both of them.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}
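
/* Illustrative pair (assumed AT&T syntax) that this predicate is meant to
   catch for the short-int store-forwarding heuristic further below:

     addw   $1, 4(%esp)       # dep insn reads and writes a HImode slot
     movzwl 4(%esp), %ecx     # later insn loads exactly the same address

   The Silvermont/KNL cost adjustment in ix86_adjust_cost raises the load
   latency in this situation to account for the store-forwarding stall.  */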


/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves to take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of integer and
             floating unit pipeline preparation stages, the memory operands
             for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* The reorder buffer can hide the latency of a load by executing it
         in parallel with the previous instruction when the previous
         instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of ld/st for short int types only
                     because of the store-forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost if an exact store/load dependence
                         exists and INSN is the load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}
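
/* Worked example of the CORE2..GENERIC branch above (values assumed for
   illustration): if DEP_INSN only produces data that a dependent load INSN
   consumes, and does not feed the load's address, a recorded latency of 5
   is reduced to 1, modelling the out-of-order core overlapping the load
   with DEP_INSN.  If DEP_INSN feeds the address, the full latency is
   kept.  */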

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as
     the number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}
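
/* Macro-fusion in one example (assumed AT&T syntax): on cores with
   TARGET_FUSE_CMP_AND_BRANCH, a flag-setting instruction and the
   conditional jump consuming the flags, e.g.

     testl %eax, %eax
     je    .L3

   can decode as a single macro-op.  The scheduler therefore tries to keep
   such pairs adjacent; ix86_macro_fusion_pair_p below decides whether a
   particular pair qualifies.  */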

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles the case of macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for an alu op with a memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}
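
/* Usage note: these two predicates are assumed to back the
   TARGET_SCHED_MACRO_FUSION_P and TARGET_SCHED_MACRO_FUSION_PAIR_P hooks;
   when both return true for a CONDGEN/CONDJMP pair, the scheduler keeps
   the two insns together so later passes do not separate them.  */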