1/* Definitions of x86 tunable features. 2 Copyright (C) 2013-2015 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 3, or (at your option) 9any later version. 10 11GCC is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License and 17a copy of the GCC Runtime Library Exception along with this program; 18see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 19<http://www.gnu.org/licenses/>. */ 20 21/* Tuning for a given CPU XXXX consists of: 22 - adding new CPU into: 23 - adding PROCESSOR_XXX to processor_type (in i386.h) 24 - possibly adding XXX into CPU attribute in i386.md 25 - adding XXX to processor_alias_table (in i386.c) 26 - introducing ix86_XXX_cost in i386.c 27 - Stringop generation table can be build based on test_stringop 28 - script (once rest of tuning is complete) 29 - designing a scheduler model in 30 - XXXX.md file 31 - Updating ix86_issue_rate and ix86_adjust_cost in i386.md 32 - possibly updating ia32_multipass_dfa_lookahead, ix86_sched_reorder 33 and ix86_sched_init_global if those tricks are needed. 34 - Tunning the flags bellow. Those are split into sections and each 35 section is very roughly ordered by importance. */ 36 37/*****************************************************************************/ 38/* Scheduling flags. */ 39/*****************************************************************************/ 40 41/* X86_TUNE_SCHEDULE: Enable scheduling. */ 42DEF_TUNE (X86_TUNE_SCHEDULE, "schedule", 43 m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 44 | m_KNL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) 45 46/* X86_TUNE_PARTIAL_REG_DEPENDENCY: Enable more register renaming 47 on modern chips. Preffer stores affecting whole integer register 48 over partial stores. For example preffer MOVZBL or MOVQ to load 8bit 49 value over movb. */ 50DEF_TUNE (X86_TUNE_PARTIAL_REG_DEPENDENCY, "partial_reg_dependency", 51 m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 52 | m_KNL | m_AMD_MULTIPLE | m_GENERIC) 53 54/* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: This knob promotes all store 55 destinations to be 128bit to allow register renaming on 128bit SSE units, 56 but usually results in one extra microop on 64bit SSE units. 57 Experimental results shows that disabling this option on P4 brings over 20% 58 SPECfp regression, while enabling it on K8 brings roughly 2.4% regression 59 that can be partly masked by careful scheduling of moves. */ 60DEF_TUNE (X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY, "sse_partial_reg_dependency", 61 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_AMDFAM10 62 | m_BDVER | m_GENERIC) 63 64/* X86_TUNE_SSE_SPLIT_REGS: Set for machines where the type and dependencies 65 are resolved on SSE register parts instead of whole registers, so we may 66 maintain just lower part of scalar values in proper format leaving the 67 upper part undefined. */ 68DEF_TUNE (X86_TUNE_SSE_SPLIT_REGS, "sse_split_regs", m_ATHLON_K8) 69 70/* X86_TUNE_PARTIAL_FLAG_REG_STALL: this flag disables use of of flags 71 set by instructions affecting just some flags (in particular shifts). 72 This is because Core2 resolves dependencies on whole flags register 73 and such sequences introduce false dependency on previous instruction 74 setting full flags. 75 76 The flags does not affect generation of INC and DEC that is controlled 77 by X86_TUNE_USE_INCDEC. 78 79 This flag may be dropped from generic once core2-corei5 machines are 80 rare enough. */ 81DEF_TUNE (X86_TUNE_PARTIAL_FLAG_REG_STALL, "partial_flag_reg_stall", 82 m_CORE2 | m_GENERIC) 83 84/* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid 85 partial dependencies. */ 86DEF_TUNE (X86_TUNE_MOVX, "movx", 87 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 88 | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC) 89 90/* X86_TUNE_MEMORY_MISMATCH_STALL: Avoid partial stores that are followed by 91 full sized loads. */ 92DEF_TUNE (X86_TUNE_MEMORY_MISMATCH_STALL, "memory_mismatch_stall", 93 m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 94 | m_KNL | m_AMD_MULTIPLE | m_GENERIC) 95 96/* X86_TUNE_FUSE_CMP_AND_BRANCH_32: Fuse compare with a subsequent 97 conditional jump instruction for 32 bit TARGET. 98 FIXME: revisit for generic. */ 99DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_32, "fuse_cmp_and_branch_32", 100 m_CORE_ALL | m_BDVER) 101 102/* X86_TUNE_FUSE_CMP_AND_BRANCH_64: Fuse compare with a subsequent 103 conditional jump instruction for TARGET_64BIT. 104 FIXME: revisit for generic. */ 105DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_64, "fuse_cmp_and_branch_64", 106 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER) 107 108/* X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS: Fuse compare with a 109 subsequent conditional jump instruction when the condition jump 110 check sign flag (SF) or overflow flag (OF). */ 111DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", 112 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER) 113 114/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional 115 jump instruction when the alu instruction produces the CCFLAG consumed by 116 the conditional jump instruction. */ 117DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", 118 m_SANDYBRIDGE | m_HASWELL) 119 120/* X86_TUNE_REASSOC_INT_TO_PARALLEL: Try to produce parallel computations 121 during reassociation of integer computation. */ 122DEF_TUNE (X86_TUNE_REASSOC_INT_TO_PARALLEL, "reassoc_int_to_parallel", 123 m_BONNELL) 124 125/* X86_TUNE_REASSOC_FP_TO_PARALLEL: Try to produce parallel computations 126 during reassociation of fp computation. */ 127DEF_TUNE (X86_TUNE_REASSOC_FP_TO_PARALLEL, "reassoc_fp_to_parallel", 128 m_BONNELL | m_SILVERMONT | m_HASWELL | m_KNL |m_INTEL | m_BDVER1 129 | m_BDVER2 | m_GENERIC) 130 131/*****************************************************************************/ 132/* Function prologue, epilogue and function calling sequences. */ 133/*****************************************************************************/ 134 135/* X86_TUNE_ACCUMULATE_OUTGOING_ARGS: Allocate stack space for outgoing 136 arguments in prologue/epilogue instead of separately for each call 137 by push/pop instructions. 138 This increase code size by about 5% in 32bit mode, less so in 64bit mode 139 because parameters are passed in registers. It is considerable 140 win for targets without stack engine that prevents multple push operations 141 to happen in parallel. 142 143 FIXME: the flags is incorrectly enabled for amdfam10, Bulldozer, 144 Bobcat and Generic. This is because disabling it causes large 145 regression on mgrid due to IRA limitation leading to unecessary 146 use of the frame pointer in 32bit mode. */ 147DEF_TUNE (X86_TUNE_ACCUMULATE_OUTGOING_ARGS, "accumulate_outgoing_args", 148 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL 149 | m_ATHLON_K8) 150 151/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in prologues that are 152 considered on critical path. */ 153DEF_TUNE (X86_TUNE_PROLOGUE_USING_MOVE, "prologue_using_move", 154 m_PPRO | m_ATHLON_K8) 155 156/* X86_TUNE_PROLOGUE_USING_MOVE: Do not use push/pop in epilogues that are 157 considered on critical path. */ 158DEF_TUNE (X86_TUNE_EPILOGUE_USING_MOVE, "epilogue_using_move", 159 m_PPRO | m_ATHLON_K8) 160 161/* X86_TUNE_USE_LEAVE: Use "leave" instruction in epilogues where it fits. */ 162DEF_TUNE (X86_TUNE_USE_LEAVE, "use_leave", 163 m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC) 164 165/* X86_TUNE_PUSH_MEMORY: Enable generation of "push mem" instructions. 166 Some chips, like 486 and Pentium works faster with separate load 167 and push instructions. */ 168DEF_TUNE (X86_TUNE_PUSH_MEMORY, "push_memory", 169 m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE 170 | m_GENERIC) 171 172/* X86_TUNE_SINGLE_PUSH: Enable if single push insn is preferred 173 over esp subtraction. */ 174DEF_TUNE (X86_TUNE_SINGLE_PUSH, "single_push", m_386 | m_486 | m_PENT 175 | m_K6_GEODE) 176 177/* X86_TUNE_DOUBLE_PUSH. Enable if double push insn is preferred 178 over esp subtraction. */ 179DEF_TUNE (X86_TUNE_DOUBLE_PUSH, "double_push", m_PENT | m_K6_GEODE) 180 181/* X86_TUNE_SINGLE_POP: Enable if single pop insn is preferred 182 over esp addition. */ 183DEF_TUNE (X86_TUNE_SINGLE_POP, "single_pop", m_386 | m_486 | m_PENT | m_PPRO) 184 185/* X86_TUNE_DOUBLE_POP: Enable if double pop insn is preferred 186 over esp addition. */ 187DEF_TUNE (X86_TUNE_DOUBLE_POP, "double_pop", m_PENT) 188 189/*****************************************************************************/ 190/* Branch predictor tuning */ 191/*****************************************************************************/ 192 193/* X86_TUNE_PAD_SHORT_FUNCTION: Make every function to be at least 4 194 instructions long. */ 195DEF_TUNE (X86_TUNE_PAD_SHORT_FUNCTION, "pad_short_function", m_BONNELL) 196 197/* X86_TUNE_PAD_RETURNS: Place NOP before every RET that is a destination 198 of conditional jump or directly preceded by other jump instruction. 199 This is important for AND K8-AMDFAM10 because the branch prediction 200 architecture expect at most one jump per 2 byte window. Failing to 201 pad returns leads to misaligned return stack. */ 202DEF_TUNE (X86_TUNE_PAD_RETURNS, "pad_returns", 203 m_ATHLON_K8 | m_AMDFAM10 | m_GENERIC) 204 205/* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more 206 than 4 branch instructions in the 16 byte window. */ 207DEF_TUNE (X86_TUNE_FOUR_JUMP_LIMIT, "four_jump_limit", 208 m_PPRO | m_P4_NOCONA | m_BONNELL | m_SILVERMONT | m_KNL |m_INTEL | 209 m_ATHLON_K8 | m_AMDFAM10) 210 211/*****************************************************************************/ 212/* Integer instruction selection tuning */ 213/*****************************************************************************/ 214 215/* X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL: Enable software prefetching 216 at -O3. For the moment, the prefetching seems badly tuned for Intel 217 chips. */ 218DEF_TUNE (X86_TUNE_SOFTWARE_PREFETCHING_BENEFICIAL, "software_prefetching_beneficial", 219 m_K6_GEODE | m_AMD_MULTIPLE) 220 221/* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall 222 on 16-bit immediate moves into memory on Core2 and Corei7. */ 223DEF_TUNE (X86_TUNE_LCP_STALL, "lcp_stall", m_CORE_ALL | m_GENERIC) 224 225/* X86_TUNE_READ_MODIFY: Enable use of read-modify instructions such 226 as "add mem, reg". */ 227DEF_TUNE (X86_TUNE_READ_MODIFY, "read_modify", ~(m_PENT | m_PPRO)) 228 229/* X86_TUNE_USE_INCDEC: Enable use of inc/dec instructions. */ 230DEF_TUNE (X86_TUNE_USE_INCDEC, "use_incdec", 231 ~(m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_INTEL 232 | m_KNL | m_GENERIC)) 233 234/* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred 235 for DFmode copies */ 236DEF_TUNE (X86_TUNE_INTEGER_DFMODE_MOVES, "integer_dfmode_moves", 237 ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 238 | m_KNL | m_INTEL | m_GEODE | m_AMD_MULTIPLE | m_GENERIC)) 239 240/* X86_TUNE_OPT_AGU: Optimize for Address Generation Unit. This flag 241 will impact LEA instruction selection. */ 242DEF_TUNE (X86_TUNE_OPT_AGU, "opt_agu", m_BONNELL | m_SILVERMONT | m_KNL 243 | m_INTEL) 244 245/* X86_TUNE_AVOID_LEA_FOR_ADDR: Avoid lea for address computation. */ 246DEF_TUNE (X86_TUNE_AVOID_LEA_FOR_ADDR, "avoid_lea_for_addr", 247 m_BONNELL | m_SILVERMONT | m_KNL) 248 249/* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is 250 vector path on AMD machines. 251 FIXME: Do we need to enable this for core? */ 252DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM32_MEM, "slow_imul_imm32_mem", 253 m_K8 | m_AMDFAM10) 254 255/* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD 256 machines. 257 FIXME: Do we need to enable this for core? */ 258DEF_TUNE (X86_TUNE_SLOW_IMUL_IMM8, "slow_imul_imm8", 259 m_K8 | m_AMDFAM10) 260 261/* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for 262 a conditional move. */ 263DEF_TUNE (X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE, "avoid_mem_opnd_for_cmove", 264 m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL) 265 266/* X86_TUNE_SINGLE_STRINGOP: Enable use of single string operations, such 267 as MOVS and STOS (without a REP prefix) to move/set sequences of bytes. */ 268DEF_TUNE (X86_TUNE_SINGLE_STRINGOP, "single_stringop", m_386 | m_P4_NOCONA) 269 270/* X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES: Enable generation of 271 compact prologues and epilogues by issuing a misaligned moves. This 272 requires target to handle misaligned moves and partial memory stalls 273 reasonably well. 274 FIXME: This may actualy be a win on more targets than listed here. */ 275DEF_TUNE (X86_TUNE_MISALIGNED_MOVE_STRING_PRO_EPILOGUES, 276 "misaligned_move_string_pro_epilogues", 277 m_386 | m_486 | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC) 278 279/* X86_TUNE_USE_SAHF: Controls use of SAHF. */ 280DEF_TUNE (X86_TUNE_USE_SAHF, "use_sahf", 281 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 282 | m_KNL | m_INTEL | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER 283 | m_BTVER | m_GENERIC) 284 285/* X86_TUNE_USE_CLTD: Controls use of CLTD and CTQO instructions. */ 286DEF_TUNE (X86_TUNE_USE_CLTD, "use_cltd", 287 ~(m_PENT | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL | m_K6)) 288 289/* X86_TUNE_USE_BT: Enable use of BT (bit test) instructions. */ 290DEF_TUNE (X86_TUNE_USE_BT, "use_bt", 291 m_CORE_ALL | m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL 292 | m_AMD_MULTIPLE | m_GENERIC) 293 294/*****************************************************************************/ 295/* 387 instruction selection tuning */ 296/*****************************************************************************/ 297 298/* X86_TUNE_USE_HIMODE_FIOP: Enables use of x87 instructions with 16bit 299 integer operand. 300 FIXME: Why this is disabled for modern chips? */ 301DEF_TUNE (X86_TUNE_USE_HIMODE_FIOP, "use_himode_fiop", 302 m_386 | m_486 | m_K6_GEODE) 303 304/* X86_TUNE_USE_SIMODE_FIOP: Enables use of x87 instructions with 32bit 305 integer operand. */ 306DEF_TUNE (X86_TUNE_USE_SIMODE_FIOP, "use_simode_fiop", 307 ~(m_PENT | m_PPRO | m_CORE_ALL | m_BONNELL | m_SILVERMONT 308 | m_KNL | m_INTEL | m_AMD_MULTIPLE | m_GENERIC)) 309 310/* X86_TUNE_USE_FFREEP: Use freep instruction instead of fstp. */ 311DEF_TUNE (X86_TUNE_USE_FFREEP, "use_ffreep", m_AMD_MULTIPLE) 312 313/* X86_TUNE_EXT_80387_CONSTANTS: Use fancy 80387 constants, such as PI. */ 314DEF_TUNE (X86_TUNE_EXT_80387_CONSTANTS, "ext_80387_constants", 315 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BONNELL | m_SILVERMONT 316 | m_KNL | m_INTEL | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC) 317 318/*****************************************************************************/ 319/* SSE instruction selection tuning */ 320/*****************************************************************************/ 321 322/* X86_TUNE_VECTORIZE_DOUBLE: Enable double precision vector 323 instructions. */ 324DEF_TUNE (X86_TUNE_VECTORIZE_DOUBLE, "vectorize_double", ~m_BONNELL) 325 326/* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE 327 regs instead of memory. */ 328DEF_TUNE (X86_TUNE_GENERAL_REGS_SSE_SPILL, "general_regs_sse_spill", 329 m_CORE_ALL) 330 331/* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL: Use movups for misaligned loads instead 332 of a sequence loading registers by parts. */ 333DEF_TUNE (X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL, "sse_unaligned_load_optimal", 334 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_AMDFAM10 | m_BDVER 335 | m_BTVER | m_SILVERMONT | m_KNL | m_INTEL | m_GENERIC) 336 337/* X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL: Use movups for misaligned stores instead 338 of a sequence loading registers by parts. */ 339DEF_TUNE (X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL, "sse_unaligned_store_optimal", 340 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL | m_BDVER | m_SILVERMONT 341 | m_KNL | m_INTEL | m_GENERIC) 342 343/* Use packed single precision instructions where posisble. I.e. movups instead 344 of movupd. */ 345DEF_TUNE (X86_TUNE_SSE_PACKED_SINGLE_INSN_OPTIMAL, "sse_packed_single_insn_optimal", 346 m_BDVER) 347 348/* X86_TUNE_SSE_TYPELESS_STORES: Always movaps/movups for 128bit stores. */ 349DEF_TUNE (X86_TUNE_SSE_TYPELESS_STORES, "sse_typeless_stores", 350 m_AMD_MULTIPLE | m_CORE_ALL | m_GENERIC) 351 352/* X86_TUNE_SSE_LOAD0_BY_PXOR: Always use pxor to load0 as opposed to 353 xorps/xorpd and other variants. */ 354DEF_TUNE (X86_TUNE_SSE_LOAD0_BY_PXOR, "sse_load0_by_pxor", 355 m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_BDVER | m_BTVER | m_GENERIC) 356 357/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from integer 358 to SSE registers. If disabled, the moves will be done by storing 359 the value to memory and reloading. */ 360DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_TO_VEC, "inter_unit_moves_to_vec", 361 ~(m_AMD_MULTIPLE | m_GENERIC)) 362 363/* X86_TUNE_INTER_UNIT_MOVES_TO_VEC: Enable moves in from SSE 364 to integer registers. If disabled, the moves will be done by storing 365 the value to memory and reloading. */ 366DEF_TUNE (X86_TUNE_INTER_UNIT_MOVES_FROM_VEC, "inter_unit_moves_from_vec", 367 ~m_ATHLON_K8) 368 369/* X86_TUNE_INTER_UNIT_CONVERSIONS: Enable float<->integer conversions 370 to use both SSE and integer registers at a same time. 371 FIXME: revisit importance of this for generic. */ 372DEF_TUNE (X86_TUNE_INTER_UNIT_CONVERSIONS, "inter_unit_conversions", 373 ~(m_AMDFAM10 | m_BDVER)) 374 375/* X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS: Try to split memory operand for 376 fp converts to destination register. */ 377DEF_TUNE (X86_TUNE_SPLIT_MEM_OPND_FOR_FP_CONVERTS, "split_mem_opnd_for_fp_converts", 378 m_SILVERMONT | m_KNL | m_INTEL) 379 380/* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion 381 from FP to FP. This form of instructions avoids partial write to the 382 destination. */ 383DEF_TUNE (X86_TUNE_USE_VECTOR_FP_CONVERTS, "use_vector_fp_converts", 384 m_AMDFAM10) 385 386/* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion 387 from integer to FP. */ 388DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10) 389 390/* X86_TUNE_SLOW_SHUFB: Indicates tunings with slow pshufb instruction. */ 391DEF_TUNE (X86_TUNE_SLOW_PSHUFB, "slow_pshufb", 392 m_BONNELL | m_SILVERMONT | m_KNL | m_INTEL) 393 394/* X86_TUNE_VECTOR_PARALLEL_EXECUTION: Indicates tunings with ability to 395 execute 2 or more vector instructions in parallel. */ 396DEF_TUNE (X86_TUNE_VECTOR_PARALLEL_EXECUTION, "vec_parallel", 397 m_NEHALEM | m_SANDYBRIDGE | m_HASWELL) 398 399/* X86_TUNE_AVOID_4BYTE_PREFIXES: Avoid instructions requiring 4+ bytes of prefixes. */ 400DEF_TUNE (X86_TUNE_AVOID_4BYTE_PREFIXES, "avoid_4byte_prefixes", 401 m_SILVERMONT | m_INTEL) 402 403/*****************************************************************************/ 404/* AVX instruction selection tuning (some of SSE flags affects AVX, too) */ 405/*****************************************************************************/ 406 407/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are 408 split. */ 409DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal", 410 ~(m_NEHALEM | m_SANDYBRIDGE | m_GENERIC)) 411 412/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are 413 split. */ 414DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal", 415 ~(m_NEHALEM | m_SANDYBRIDGE | m_BDVER | m_GENERIC)) 416 417/* X86_TUNE_AVX128_OPTIMAL: Enable 128-bit AVX instruction generation for 418 the auto-vectorizer. */ 419DEF_TUNE (X86_TUNE_AVX128_OPTIMAL, "avx128_optimal", m_BDVER | m_BTVER2) 420 421/*****************************************************************************/ 422/* Historical relics: tuning flags that helps a specific old CPU designs */ 423/*****************************************************************************/ 424 425/* X86_TUNE_DOUBLE_WITH_ADD: Use add instead of sal to double value in 426 an integer register. */ 427DEF_TUNE (X86_TUNE_DOUBLE_WITH_ADD, "double_with_add", ~m_386) 428 429/* X86_TUNE_ALWAYS_FANCY_MATH_387: controls use of fancy 387 operations, 430 such as fsqrt, fprem, fsin, fcos, fsincos etc. 431 Should be enabled for all targets that always has coprocesor. */ 432DEF_TUNE (X86_TUNE_ALWAYS_FANCY_MATH_387, "always_fancy_math_387", 433 ~(m_386 | m_486)) 434 435/* X86_TUNE_UNROLL_STRLEN: Produce (quite lame) unrolled sequence for 436 inline strlen. This affects only -minline-all-stringops mode. By 437 default we always dispatch to a library since our internal strlen 438 is bad. */ 439DEF_TUNE (X86_TUNE_UNROLL_STRLEN, "unroll_strlen", ~m_386) 440 441/* X86_TUNE_SHIFT1: Enables use of short encoding of "sal reg" instead of 442 longer "sal $1, reg". */ 443DEF_TUNE (X86_TUNE_SHIFT1, "shift1", ~m_486) 444 445/* X86_TUNE_ZERO_EXTEND_WITH_AND: Use AND instruction instead 446 of mozbl/movwl. */ 447DEF_TUNE (X86_TUNE_ZERO_EXTEND_WITH_AND, "zero_extend_with_and", m_486 | m_PENT) 448 449/* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode 450 and SImode multiply, but 386 and 486 do HImode multiply faster. */ 451DEF_TUNE (X86_TUNE_PROMOTE_HIMODE_IMUL, "promote_himode_imul", 452 ~(m_386 | m_486)) 453 454/* X86_TUNE_FAST_PREFIX: Enable demoting some 32bit or 64bit arithmetic 455 into 16bit/8bit when resulting sequence is shorter. For example 456 for "and $-65536, reg" to 16bit store of 0. */ 457DEF_TUNE (X86_TUNE_FAST_PREFIX, "fast_prefix", ~(m_386 | m_486 | m_PENT)) 458 459/* X86_TUNE_READ_MODIFY_WRITE: Enable use of read modify write instructions 460 such as "add $1, mem". */ 461DEF_TUNE (X86_TUNE_READ_MODIFY_WRITE, "read_modify_write", ~m_PENT) 462 463/* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR 464 than a MOV. */ 465DEF_TUNE (X86_TUNE_MOVE_M1_VIA_OR, "move_m1_via_or", m_PENT) 466 467/* X86_TUNE_NOT_UNPAIRABLE: NOT is not pairable on Pentium, while XOR is, 468 but one byte longer. */ 469DEF_TUNE (X86_TUNE_NOT_UNPAIRABLE, "not_unpairable", m_PENT) 470 471/* X86_TUNE_PARTIAL_REG_STALL: Pentium pro, unlike later chips, handled 472 use of partial registers by renaming. This improved performance of 16bit 473 code where upper halves of registers are not used. It also leads to 474 an penalty whenever a 16bit store is followed by 32bit use. This flag 475 disables production of such sequences in common cases. 476 See also X86_TUNE_HIMODE_MATH. 477 478 In current implementation the partial register stalls are not eliminated 479 very well - they can be introduced via subregs synthesized by combine 480 and can happen in caller/callee saving sequences. */ 481DEF_TUNE (X86_TUNE_PARTIAL_REG_STALL, "partial_reg_stall", m_PPRO) 482 483/* X86_TUNE_PROMOTE_QIMODE: When it is cheap, turn 8bit arithmetic to 484 corresponding 32bit arithmetic. */ 485DEF_TUNE (X86_TUNE_PROMOTE_QIMODE, "promote_qimode", 486 ~m_PPRO) 487 488/* X86_TUNE_PROMOTE_HI_REGS: Same, but for 16bit artihmetic. Again we avoid 489 partial register stalls on PentiumPro targets. */ 490DEF_TUNE (X86_TUNE_PROMOTE_HI_REGS, "promote_hi_regs", m_PPRO) 491 492/* X86_TUNE_HIMODE_MATH: Enable use of 16bit arithmetic. 493 On PPro this flag is meant to avoid partial register stalls. */ 494DEF_TUNE (X86_TUNE_HIMODE_MATH, "himode_math", ~m_PPRO) 495 496/* X86_TUNE_SPLIT_LONG_MOVES: Avoid instructions moving immediates 497 directly to memory. */ 498DEF_TUNE (X86_TUNE_SPLIT_LONG_MOVES, "split_long_moves", m_PPRO) 499 500/* X86_TUNE_USE_XCHGB: Use xchgb %rh,%rl instead of rolw/rorw $8,rx. */ 501DEF_TUNE (X86_TUNE_USE_XCHGB, "use_xchgb", m_PENT4) 502 503/* X86_TUNE_USE_MOV0: Use "mov $0, reg" instead of "xor reg, reg" to clear 504 integer register. */ 505DEF_TUNE (X86_TUNE_USE_MOV0, "use_mov0", m_K6) 506 507/* X86_TUNE_NOT_VECTORMODE: On AMD K6, NOT is vector decoded with memory 508 operand that cannot be represented using a modRM byte. The XOR 509 replacement is long decoded, so this split helps here as well. */ 510DEF_TUNE (X86_TUNE_NOT_VECTORMODE, "not_vectormode", m_K6) 511 512/* X86_TUNE_AVOID_VECTOR_DECODE: Enable splitters that avoid vector decoded 513 forms of instructions on K8 targets. */ 514DEF_TUNE (X86_TUNE_AVOID_VECTOR_DECODE, "avoid_vector_decode", 515 m_K8) 516 517/* X86_TUNE_AVOID_FALSE_DEP_FOR_BMI: Avoid false dependency 518 for bit-manipulation instructions. */ 519DEF_TUNE (X86_TUNE_AVOID_FALSE_DEP_FOR_BMI, "avoid_false_dep_for_bmi", 520 m_SANDYBRIDGE | m_HASWELL | m_GENERIC) 521 522/*****************************************************************************/ 523/* This never worked well before. */ 524/*****************************************************************************/ 525 526/* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based 527 on simulation result. But after P4 was made, no performance benefit 528 was observed with branch hints. It also increases the code size. 529 As a result, icc never generates branch hints. */ 530DEF_TUNE (X86_TUNE_BRANCH_PREDICTION_HINTS, "branch_prediction_hints", 0) 531 532/* X86_TUNE_QIMODE_MATH: Enable use of 8bit arithmetic. */ 533DEF_TUNE (X86_TUNE_QIMODE_MATH, "qimode_math", ~0) 534 535/* X86_TUNE_PROMOTE_QI_REGS: This enables generic code that promotes all 8bit 536 arithmetic to 32bit via PROMOTE_MODE macro. This code generation scheme 537 is usually used for RISC targets. */ 538DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0) 539 540/* X86_TUNE_ADJUST_UNROLL: This enables adjusting the unroll factor based 541 on hardware capabilities. Bdver3 hardware has a loop buffer which makes 542 unrolling small loop less important. For, such architectures we adjust 543 the unroll factor so that the unrolled loop fits the loop buffer. */ 544DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) 545