1/* Definitions of x86 tunable features. 2 Copyright (C) 2013-2016 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 3, or (at your option) 9any later version. 10 11GCC is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License and 17a copy of the GCC Runtime Library Exception along with this program; 18see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 19<http://www.gnu.org/licenses/>. */ 20 21/* Tuning for a given CPU XXXX consists of: 22 - adding new CPU into: 23 - adding PROCESSOR_XXX to processor_type (in i386.h) 24 - possibly adding XXX into CPU attribute in i386.md 25 - adding XXX to processor_alias_table (in i386.c) 26 - introducing ix86_XXX_cost in i386.c 27 - Stringop generation table can be build based on test_stringop 28 - script (once rest of tuning is complete) 29 - designing a scheduler model in 30 - XXXX.md file 31 - Updating ix86_issue_rate and ix86_adjust_cost in i386.md 32 - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder 33 and ix86_sched_init_global if those tricks are needed. 34 - Tunning the flags bellow. Those are split into sections and each 35 section is very roughly ordered by importance. */ 36 37/*****************************************************************************/ 38/* Scheduling flags. */ 39/*****************************************************************************/ 40 41/* X86_TUNE_SCHEDULE: Enable scheduling. */ 42DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", 43 m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT 44 | m_INTEL | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) 45 46/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming 47 on modern chips. Preffer stores affecting whole integer register 48 over partial stores. For example preffer MOVZBL or MOVQ to load 8bit 49 value over movb. */ 50DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", 51 m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 52 | m_KNL | m_AMD_MULTIPLE | m_GENERIC) 53 54/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store 55 destinations to be 128bit to allow register renaming on 128bit SSE units, 56 but usually results in one extra microop on 64bit SSE units. 57 Experimental results shows that disabling this option on P4 brings over 20% 58 SPECfp regression, while enabling it on K8 brings roughly 2.4% regression 59 that can be partly masked by careful scheduling of moves. */ 60DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", 61 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 62 | m_BDVER | m_ZNVER1 | m_GENERIC) 63 64/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies 65 are resolved on SSE register parts instead of whole registers, so we may 66 maintain just lower part of scalar values in proper format leaving the 67 upper part undefined. */ 68DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8) 69 70/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of of flags 71 set by instructions affecting just some flags (in particular shifts). 72 This is because Core2 resolves dependencies on whole flags register 73 and such sequences introduce false dependency on previous instruction 74 setting full flags. 75 76 The flags does not affect generation of INC and DEC that is controlled 77 by X86_TUNE_USE_INCDEC. 78 79 This flag may be dropped from generic once core2-corei5 machines are 80 rare enough. */ 81DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", 82 m_CORE2 | m_GENERIC) 83 84/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid 85 partial dependencies. */ 86DEF_TUNE (X86_TUNE_MOVX, "movx", 87 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 88 | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC) 89 90/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by 91 full sized loads. */ 92DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", 93 m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 94 | m_KNL | m_AMD_MULTIPLE | m_GENERIC) 95 96/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent 97 conditional jump instruction for 32 bit TARGET. 98 FIXME: revisit for generic. */ 99DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", 100 m_CORE_ALL | m_BDVER | m_ZNVER1) 101 102/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent 103 conditional jump instruction for TARGET_64BIT. 104 FIXME: revisit for generic. */ 105DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", 106 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1) 107 108/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a 109 subsequent conditional jump instruction when the condition jump 110 check sign flag (SF) or overflow flag (OF). */ 111DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", 112 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_ZNVER1) 113 114/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional 115 jump instruction when the alu instruction produces the CCFLAG consumed by 116 the conditional jump instruction. */ 117DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", 118 m_SANDYBRIDGE | m_HASWELL) 119 120/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations 121 during reassociation of integer computation. */ 122DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel", 123 m_BONNELL) 124 125/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations 126 during reassociation of fp computation. */ 127DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel", 128 m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL |m_INTEL | m_BDVER1 129 | m_BDVER2 | m_ZNVER1 | m_GENERIC) 130 131/*****************************************************************************/ 132/* Function prologue, epilogue and function calling sequences. */ 133/*****************************************************************************/ 134 135/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing 136 arguments in prologue/epilogue instead of separately for each call 137 by push/pop instructions. 138 This increase code size by about 5% in 32bit mode, less so in 64bit mode 139 because parameters are passed in registers. It is considerable 140 win for targets without stack engine that prevents multple push operations 141 to happen in parallel. 142 143 FIXME: the flags is incorrectly enabled for amdfam10, Bulldozer, 144 Bobcat and Generic. This is because disabling it causes large 145 regression on mgrid due to IRA limitation leading to unecessary 146 use of the frame pointer in 32bit mode. */ 147DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", 148 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL 149 | m_ATHLON_K8) 150 151/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are 152 considered on critical path. */ 153DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", 154 m_PPRO | m_ATHLON_K8) 155 156/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in epilogues that are 157 considered on critical path. */ 158DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", 159 m_PPRO | m_ATHLON_K8) 160 161/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ 162DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", 163 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) 164 165/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. 166 Some chips, like 486 and Pentium works faster with separate load 167 and push instructions. */ 168DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", 169 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE 170 | m_GENERIC) 171 172/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred 173 over esp subtraction. */ 174DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT 175 | m_LAKEMONT | m_K6_GEODE) 176 177/* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred 178 over esp subtraction. */ 179DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_LAKEMONT 180 | m_K6_GEODE) 181 182/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred 183 over esp addition. */ 184DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT 185 | m_LAKEMONT | m_PPRO) 186 187/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred 188 over esp addition. */ 189DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT | m_LAKEMONT) 190 191/*****************************************************************************/ 192/* Branch predictor tuning */ 193/*****************************************************************************/ 194 195/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4 196 instructions long. */ 197DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL) 198 199/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination 200 of conditional jump or directly preceded by other jump instruction. 201 This is important for AND K8-AMDFAM10 because the branch prediction 202 architecture expect at most one jump per 2 byte window. Failing to 203 pad returns leads to misaligned return stack. */ 204DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", 205 m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC) 206 207/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more 208 than 4 branch instructions in the 16 byte window. */ 209DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", 210 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL |m_INTEL | 211 m_ATHLON_K8 | m_AMDFAM10) 212 213/*****************************************************************************/ 214/* Integer instruction selection tuning */ 215/*****************************************************************************/ 216 217/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching 218 at -O3. For the moment, the prefetching seems badly tuned for Intel 219 chips. */ 220DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial", 221 m_K6_GEODE | m_ATHLON_K8 | m_AMDFAM10 | m_BDVER | m_BTVER) 222 223/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall 224 on 16-bit immediate moves into memory on Core2 and Corei7. */ 225DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) 226 227/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such 228 as "add mem, reg". */ 229DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_LAKEMONT | m_PPRO)) 230 231/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */ 232DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", 233 ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 234 | m_KNL | m_GENERIC)) 235 236/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred 237 for DFmode copies */ 238DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", 239 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 240 | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)) 241 242/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag 243 will impact LEA instruction selection. */ 244DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL 245 | m_INTEL) 246 247/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ 248DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", 249 m_BONNELL | m_SILVERMONT | m_KNL) 250 251/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is 252 vector path on AMD machines. 253 FIXME: Do we need to enable this for core? */ 254DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem", 255 m_K8 | m_AMDFAM10) 256 257/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD 258 machines. 259 FIXME: Do we need to enable this for core? */ 260DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", 261 m_K8 | m_AMDFAM10) 262 263/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for 264 a conditional move. */ 265DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", 266 m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL) 267 268/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such 269 as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ 270DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) 271 272/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of 273 compact prologues and epilogues by issuing a misaligned moves. This 274 requires target to handle misaligned moves and partial memory stalls 275 reasonably well. 276 FIXME: This may actualy be a win on more targets than listed here. */ 277DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, 278 "misaligned_move_string_pro_epilogues", 279 m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC) 280 281/* X86_TUNE_USE_SAHF: Controls use of SAHF. */ 282DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", 283 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 284 | m_KNL | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER 285 | m_BTVER | m_ZNVER1 | m_GENERIC) 286 287/* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ 288DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", 289 ~(m_PENT | m_LAKEMONT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL 290 | m_K6)) 291 292/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ 293DEF_TUNE (X86_TUNE_USE_BT, "use_bt", 294 m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL 295 | m_LAKEMONT | m_AMD_MULTIPLE | m_GENERIC) 296 297/*****************************************************************************/ 298/* 387 instruction selection tuning */ 299/*****************************************************************************/ 300 301/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit 302 integer operand. 303 FIXME: Why this is disabled for modern chips? */ 304DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", 305 m_386 | m_486 | m_K6_GEODE) 306 307/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit 308 integer operand. */ 309DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", 310 ~(m_PENT | m_LAKEMONT | m_PPRO | m_CORE_ALL | m_BONNELL 311 | m_SILVERMONT | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC)) 312 313/* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ 314DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) 315 316/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ 317DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", 318 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 319 | m_KNL | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC) 320 321/*****************************************************************************/ 322/* SSE instruction selection tuning */ 323/*****************************************************************************/ 324 325/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector 326 instructions. */ 327DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_BONNELL) 328 329/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE 330 regs instead of memory. */ 331DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", 332 m_CORE_ALL) 333 334/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead 335 of a sequence loading registers by parts. */ 336DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", 337 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL 338 | m_INTEL | m_AMDFAM10 | m_BDVER | m_BTVER | m_ZNVER1 | m_GENERIC) 339 340/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead 341 of a sequence loading registers by parts. */ 342DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", 343 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_SILVERMONT | m_KNL 344 | m_INTEL | m_BDVER | m_ZNVER1 | m_GENERIC) 345 346/* Use packed single precision instructions where posisble. I.e. movups instead 347 of movupd. */ 348DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal", 349 m_BDVER | m_ZNVER1) 350 351/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ 352DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", 353 m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC) 354 355/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to 356 xorps/xorpd and other variants. */ 357DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", 358 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_ZNVER1 359 | m_GENERIC) 360 361/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer 362 to SSE registers. If disabled, the moves will be done by storing 363 the value to memory and reloading. */ 364DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec", 365 ~(m_AMD_MULTIPLE | m_GENERIC)) 366 367/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE 368 to integer registers. If disabled, the moves will be done by storing 369 the value to memory and reloading. */ 370DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec", 371 ~m_ATHLON_K8) 372 373/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions 374 to use both SSE and integer registers at a same time. 375 FIXME: revisit importance of this for generic. */ 376DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions", 377 ~(m_AMDFAM10 | m_BDVER)) 378 379/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for 380 fp converts to destination register. */ 381DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", 382 m_SILVERMONT | m_KNL | m_INTEL) 383 384/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion 385 from FP to FP. This form of instructions avoids partial write to the 386 destination. */ 387DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", 388 m_AMDFAM10) 389 390/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion 391 from integer to FP. */ 392DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) 393 394/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ 395DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", 396 m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL) 397 398/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to 399 execute 2 or more vector instructions in parallel. */ 400DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", 401 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) 402 403/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ 404DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", 405 m_SILVERMONT | m_INTEL) 406 407/*****************************************************************************/ 408/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ 409/*****************************************************************************/ 410 411/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are 412 split. */ 413DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", 414 ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC)) 415 416/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are 417 split. */ 418DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", 419 ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_ZNVER1 | m_GENERIC)) 420 421/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for 422 the auto-vectorizer. */ 423DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2 424 | m_ZNVER1) 425 426/*****************************************************************************/ 427/* Historical relics: tuning flags that helps a specific old CPU designs */ 428/*****************************************************************************/ 429 430/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in 431 an integer register. */ 432DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386) 433 434/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations, 435 such as fsqrt, fprem, fsin, fcos, fsincos etc. 436 Should be enabled for all targets that always has coprocesor. */ 437DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", 438 ~(m_386 | m_486 | m_LAKEMONT)) 439 440/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for 441 inline strlen. This affects only -minline-all-stringops mode. By 442 default we always dispatch to a library since our internal strlen 443 is bad. */ 444DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386) 445 446/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of 447 longer "sal $1, reg". */ 448DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486) 449 450/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead 451 of mozbl/movwl. */ 452DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", 453 m_486 | m_PENT) 454 455/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode 456 and SImode multiply, but 386 and 486 do HImode multiply faster. */ 457DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul", 458 ~(m_386 | m_486)) 459 460/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic 461 into 16bit/8bit when resulting sequence is shorter. For example 462 for "and $-65536, reg" to 16bit store of 0. */ 463DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", 464 ~(m_386 | m_486 | m_PENT | m_LAKEMONT)) 465 466/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions 467 such as "add $1, mem". */ 468DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", 469 ~(m_PENT | m_LAKEMONT)) 470 471/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR 472 than a MOV. */ 473DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT | m_LAKEMONT) 474 475/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is, 476 but one byte longer. */ 477DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT | m_LAKEMONT) 478 479/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled 480 use of partial registers by renaming. This improved performance of 16bit 481 code where upper halves of registers are not used. It also leads to 482 an penalty whenever a 16bit store is followed by 32bit use. This flag 483 disables production of such sequences in common cases. 484 See also X86_TUNE_HIMODE_MATH. 485 486 In current implementation the partial register stalls are not eliminated 487 very well - they can be introduced via subregs synthesized by combine 488 and can happen in caller/callee saving sequences. */ 489DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO) 490 491/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to 492 corresponding 32bit arithmetic. */ 493DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode", 494 ~m_PPRO) 495 496/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit artihmetic. Again we avoid 497 partial register stalls on PentiumPro targets. */ 498DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO) 499 500/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic. 501 On PPro this flag is meant to avoid partial register stalls. */ 502DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO) 503 504/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates 505 directly to memory. */ 506DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO) 507 508/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ 509DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4) 510 511/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear 512 integer register. */ 513DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6) 514 515/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory 516 operand that cannot be represented using a modRM byte. The XOR 517 replacement is long decoded, so this split helps here as well. */ 518DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6) 519 520/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded 521 forms of instructions on K8 targets. */ 522DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", 523 m_K8) 524 525/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency 526 for bit-manipulation instructions. */ 527DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", 528 m_SANDYBRIDGE | m_HASWELL | m_GENERIC) 529 530/*****************************************************************************/ 531/* This never worked well before. */ 532/*****************************************************************************/ 533 534/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based 535 on simulation result. But after P4 was made, no performance benefit 536 was observed with branch hints. It also increases the code size. 537 As a result, icc never generates branch hints. */ 538DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0) 539 540/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ 541DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0) 542 543/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit 544 arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme 545 is usually used for RISC targets. */ 546DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0) 547 548/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based 549 on hardware capabilities. Bdver3 hardware has a loop buffer which makes 550 unrolling small loop less important. For, such architectures we adjust 551 the unroll factor so that the unrolled loop fits the loop buffer. */ 552DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) 553 554/* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of set insns to be 555 if-converted to one. */ 556DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", 557 m_SILVERMONT | m_KNL | m_INTEL | m_CORE_ALL | m_GENERIC) 558 559/* X86_TUNE_EMIT_VZEROUPPER: This enables vzeroupper instruction insertion 560 before a transfer of control flow out of the function. */ 561DEF_TUNE (X86_TUNE_EMIT_VZEROUPPER, "emit_vzeroupper", ~m_KNL) 562