1/* Scheduler hooks for IA-32 which implement CPU specific logic. 2 Copyright (C) 1988-2020 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 3, or (at your option) 9any later version. 10 11GCC is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with GCC; see the file COPYING3. If not see 18<http://www.gnu.org/licenses/>. */ 19 20#define IN_TARGET_CODE 1 21 22#include "config.h" 23#include "system.h" 24#include "coretypes.h" 25#include "backend.h" 26#include "rtl.h" 27#include "tree.h" 28#include "cfghooks.h" 29#include "tm_p.h" 30#include "target.h" 31#include "insn-config.h" 32#include "insn-attr.h" 33#include "insn-opinit.h" 34#include "recog.h" 35 36/* Return the maximum number of instructions a cpu can issue. 
*/ 37 38int 39ix86_issue_rate (void) 40{ 41 switch (ix86_tune) 42 { 43 case PROCESSOR_PENTIUM: 44 case PROCESSOR_LAKEMONT: 45 case PROCESSOR_BONNELL: 46 case PROCESSOR_SILVERMONT: 47 case PROCESSOR_KNL: 48 case PROCESSOR_KNM: 49 case PROCESSOR_INTEL: 50 case PROCESSOR_K6: 51 case PROCESSOR_BTVER2: 52 case PROCESSOR_PENTIUM4: 53 case PROCESSOR_NOCONA: 54 return 2; 55 56 case PROCESSOR_PENTIUMPRO: 57 case PROCESSOR_ATHLON: 58 case PROCESSOR_K8: 59 case PROCESSOR_AMDFAM10: 60 case PROCESSOR_BTVER1: 61 return 3; 62 63 case PROCESSOR_BDVER1: 64 case PROCESSOR_BDVER2: 65 case PROCESSOR_BDVER3: 66 case PROCESSOR_BDVER4: 67 case PROCESSOR_ZNVER1: 68 case PROCESSOR_ZNVER2: 69 case PROCESSOR_ZNVER3: 70 case PROCESSOR_CORE2: 71 case PROCESSOR_NEHALEM: 72 case PROCESSOR_SANDYBRIDGE: 73 case PROCESSOR_HASWELL: 74 case PROCESSOR_GENERIC: 75 return 4; 76 77 default: 78 return 1; 79 } 80} 81 82/* Return true iff USE_INSN has a memory address with operands set by 83 SET_INSN. */ 84 85bool 86ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn) 87{ 88 int i; 89 extract_insn_cached (use_insn); 90 for (i = recog_data.n_operands - 1; i >= 0; --i) 91 if (MEM_P (recog_data.operand[i])) 92 { 93 rtx addr = XEXP (recog_data.operand[i], 0); 94 if (modified_in_p (addr, set_insn) != 0) 95 { 96 /* No AGI stall if SET_INSN is a push or pop and USE_INSN 97 has SP based memory (unless index reg is modified in a pop). 
*/ 98 rtx set = single_set (set_insn); 99 if (set 100 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))) 101 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set))))) 102 { 103 struct ix86_address parts; 104 if (ix86_decompose_address (addr, &parts) 105 && parts.base == stack_pointer_rtx 106 && (parts.index == NULL_RTX 107 || MEM_P (SET_DEST (set)) 108 || !modified_in_p (parts.index, set_insn))) 109 return false; 110 } 111 return true; 112 } 113 return false; 114 } 115 return false; 116} 117 118/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set 119 by DEP_INSN and nothing set by DEP_INSN. */ 120 121static bool 122ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type) 123{ 124 rtx set, set2; 125 126 /* Simplify the test for uninteresting insns. */ 127 if (insn_type != TYPE_SETCC 128 && insn_type != TYPE_ICMOV 129 && insn_type != TYPE_FCMOV 130 && insn_type != TYPE_IBR) 131 return false; 132 133 if ((set = single_set (dep_insn)) != 0) 134 { 135 set = SET_DEST (set); 136 set2 = NULL_RTX; 137 } 138 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL 139 && XVECLEN (PATTERN (dep_insn), 0) == 2 140 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET 141 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET) 142 { 143 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 144 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 145 } 146 else 147 return false; 148 149 if (!REG_P (set) || REGNO (set) != FLAGS_REG) 150 return false; 151 152 /* This test is true if the dependent insn reads the flags but 153 not any other potentially set register. */ 154 if (!reg_overlap_mentioned_p (set, PATTERN (insn))) 155 return false; 156 157 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn))) 158 return false; 159 160 return true; 161} 162 163/* Helper function for exact_store_load_dependency. 164 Return true if addr is found in insn. 
*/ 165static bool 166exact_dependency_1 (rtx addr, rtx insn) 167{ 168 enum rtx_code code; 169 const char *format_ptr; 170 int i, j; 171 172 code = GET_CODE (insn); 173 switch (code) 174 { 175 case MEM: 176 if (rtx_equal_p (addr, insn)) 177 return true; 178 break; 179 case REG: 180 CASE_CONST_ANY: 181 case SYMBOL_REF: 182 case CODE_LABEL: 183 case PC: 184 case CC0: 185 case EXPR_LIST: 186 return false; 187 default: 188 break; 189 } 190 191 format_ptr = GET_RTX_FORMAT (code); 192 for (i = 0; i < GET_RTX_LENGTH (code); i++) 193 { 194 switch (*format_ptr++) 195 { 196 case 'e': 197 if (exact_dependency_1 (addr, XEXP (insn, i))) 198 return true; 199 break; 200 case 'E': 201 for (j = 0; j < XVECLEN (insn, i); j++) 202 if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) 203 return true; 204 break; 205 } 206 } 207 return false; 208} 209 210/* Return true if there exists exact dependency for store & load, i.e. 211 the same memory address is used in them. */ 212static bool 213exact_store_load_dependency (rtx_insn *store, rtx_insn *load) 214{ 215 rtx set1, set2; 216 217 set1 = single_set (store); 218 if (!set1) 219 return false; 220 if (!MEM_P (SET_DEST (set1))) 221 return false; 222 set2 = single_set (load); 223 if (!set2) 224 return false; 225 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) 226 return true; 227 return false; 228} 229 230 231/* This function corrects the value of COST (latency) based on the relationship 232 between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength 233 DW. It should return the new value. 234 235 On x86 CPUs this is most commonly used to model the fact that valus of 236 registers used to compute address of memory operand needs to be ready 237 earlier than values of registers used in the actual operation. 
*/ 238 239int 240ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, 241 unsigned int) 242{ 243 enum attr_type insn_type, dep_insn_type; 244 enum attr_memory memory; 245 rtx set, set2; 246 int dep_insn_code_number; 247 248 /* Anti and output dependencies have zero cost on all CPUs. */ 249 if (dep_type != 0) 250 return 0; 251 252 dep_insn_code_number = recog_memoized (dep_insn); 253 254 /* If we can't recognize the insns, we can't really do anything. */ 255 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0) 256 return cost; 257 258 insn_type = get_attr_type (insn); 259 dep_insn_type = get_attr_type (dep_insn); 260 261 switch (ix86_tune) 262 { 263 case PROCESSOR_PENTIUM: 264 case PROCESSOR_LAKEMONT: 265 /* Address Generation Interlock adds a cycle of latency. */ 266 if (insn_type == TYPE_LEA) 267 { 268 rtx addr = PATTERN (insn); 269 270 if (GET_CODE (addr) == PARALLEL) 271 addr = XVECEXP (addr, 0, 0); 272 273 gcc_assert (GET_CODE (addr) == SET); 274 275 addr = SET_SRC (addr); 276 if (modified_in_p (addr, dep_insn)) 277 cost += 1; 278 } 279 else if (ix86_agi_dependent (dep_insn, insn)) 280 cost += 1; 281 282 /* ??? Compares pair with jump/setcc. */ 283 if (ix86_flags_dependent (insn, dep_insn, insn_type)) 284 cost = 0; 285 286 /* Floating point stores require value to be ready one cycle earlier. */ 287 if (insn_type == TYPE_FMOV 288 && get_attr_memory (insn) == MEMORY_STORE 289 && !ix86_agi_dependent (dep_insn, insn)) 290 cost += 1; 291 break; 292 293 case PROCESSOR_PENTIUMPRO: 294 /* INT->FP conversion is expensive. */ 295 if (get_attr_fp_int_src (dep_insn)) 296 cost += 5; 297 298 /* There is one cycle extra latency between an FP op and a store. 
*/ 299 if (insn_type == TYPE_FMOV 300 && (set = single_set (dep_insn)) != NULL_RTX 301 && (set2 = single_set (insn)) != NULL_RTX 302 && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) 303 && MEM_P (SET_DEST (set2))) 304 cost += 1; 305 306 memory = get_attr_memory (insn); 307 308 /* Show ability of reorder buffer to hide latency of load by executing 309 in parallel with previous instruction in case 310 previous instruction is not needed to compute the address. */ 311 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 312 && !ix86_agi_dependent (dep_insn, insn)) 313 { 314 /* Claim moves to take one cycle, as core can issue one load 315 at time and the next load can start cycle later. */ 316 if (dep_insn_type == TYPE_IMOV 317 || dep_insn_type == TYPE_FMOV) 318 cost = 1; 319 else if (cost > 1) 320 cost--; 321 } 322 break; 323 324 case PROCESSOR_K6: 325 /* The esp dependency is resolved before 326 the instruction is really finished. */ 327 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 328 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 329 return 1; 330 331 /* INT->FP conversion is expensive. */ 332 if (get_attr_fp_int_src (dep_insn)) 333 cost += 5; 334 335 memory = get_attr_memory (insn); 336 337 /* Show ability of reorder buffer to hide latency of load by executing 338 in parallel with previous instruction in case 339 previous instruction is not needed to compute the address. */ 340 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 341 && !ix86_agi_dependent (dep_insn, insn)) 342 { 343 /* Claim moves to take one cycle, as core can issue one load 344 at time and the next load can start cycle later. 
*/ 345 if (dep_insn_type == TYPE_IMOV 346 || dep_insn_type == TYPE_FMOV) 347 cost = 1; 348 else if (cost > 2) 349 cost -= 2; 350 else 351 cost = 1; 352 } 353 break; 354 355 case PROCESSOR_AMDFAM10: 356 case PROCESSOR_BDVER1: 357 case PROCESSOR_BDVER2: 358 case PROCESSOR_BDVER3: 359 case PROCESSOR_BDVER4: 360 case PROCESSOR_BTVER1: 361 case PROCESSOR_BTVER2: 362 /* Stack engine allows to execute push&pop instructions in parall. */ 363 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 364 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 365 return 0; 366 /* FALLTHRU */ 367 368 case PROCESSOR_ATHLON: 369 case PROCESSOR_K8: 370 memory = get_attr_memory (insn); 371 372 /* Show ability of reorder buffer to hide latency of load by executing 373 in parallel with previous instruction in case 374 previous instruction is not needed to compute the address. */ 375 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 376 && !ix86_agi_dependent (dep_insn, insn)) 377 { 378 enum attr_unit unit = get_attr_unit (insn); 379 int loadcost = 3; 380 381 /* Because of the difference between the length of integer and 382 floating unit pipeline preparation stages, the memory operands 383 for floating point are cheaper. 384 385 ??? For Athlon it the difference is most probably 2. */ 386 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) 387 loadcost = 3; 388 else 389 loadcost = TARGET_ATHLON ? 2 : 0; 390 391 if (cost >= loadcost) 392 cost -= loadcost; 393 else 394 cost = 0; 395 } 396 break; 397 398 case PROCESSOR_ZNVER1: 399 case PROCESSOR_ZNVER2: 400 case PROCESSOR_ZNVER3: 401 /* Stack engine allows to execute push&pop instructions in parall. 
*/ 402 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 403 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 404 return 0; 405 406 memory = get_attr_memory (insn); 407 408 /* Show ability of reorder buffer to hide latency of load by executing 409 in parallel with previous instruction in case 410 previous instruction is not needed to compute the address. */ 411 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 412 && !ix86_agi_dependent (dep_insn, insn)) 413 { 414 enum attr_unit unit = get_attr_unit (insn); 415 int loadcost; 416 417 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) 418 loadcost = 4; 419 else 420 loadcost = 7; 421 422 if (cost >= loadcost) 423 cost -= loadcost; 424 else 425 cost = 0; 426 } 427 break; 428 429 case PROCESSOR_CORE2: 430 case PROCESSOR_NEHALEM: 431 case PROCESSOR_SANDYBRIDGE: 432 case PROCESSOR_HASWELL: 433 case PROCESSOR_GENERIC: 434 /* Stack engine allows to execute push&pop instructions in parall. */ 435 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 436 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 437 return 0; 438 439 memory = get_attr_memory (insn); 440 441 /* Show ability of reorder buffer to hide latency of load by executing 442 in parallel with previous instruction in case 443 previous instruction is not needed to compute the address. */ 444 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 445 && !ix86_agi_dependent (dep_insn, insn)) 446 { 447 if (cost >= 4) 448 cost -= 4; 449 else 450 cost = 0; 451 } 452 break; 453 454 case PROCESSOR_SILVERMONT: 455 case PROCESSOR_KNL: 456 case PROCESSOR_KNM: 457 case PROCESSOR_INTEL: 458 if (!reload_completed) 459 return cost; 460 461 /* Increase cost of integer loads. 
*/ 462 memory = get_attr_memory (dep_insn); 463 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) 464 { 465 enum attr_unit unit = get_attr_unit (dep_insn); 466 if (unit == UNIT_INTEGER && cost == 1) 467 { 468 if (memory == MEMORY_LOAD) 469 cost = 3; 470 else 471 { 472 /* Increase cost of ld/st for short int types only 473 because of store forwarding issue. */ 474 rtx set = single_set (dep_insn); 475 if (set && (GET_MODE (SET_DEST (set)) == QImode 476 || GET_MODE (SET_DEST (set)) == HImode)) 477 { 478 /* Increase cost of store/load insn if exact 479 dependence exists and it is load insn. */ 480 enum attr_memory insn_memory = get_attr_memory (insn); 481 if (insn_memory == MEMORY_LOAD 482 && exact_store_load_dependency (dep_insn, insn)) 483 cost = 3; 484 } 485 } 486 } 487 } 488 489 default: 490 break; 491 } 492 493 return cost; 494} 495 496/* How many alternative schedules to try. This should be as wide as the 497 scheduling freedom in the DFA, but no wider. Making this value too 498 large results extra work for the scheduler. */ 499 500int 501ia32_multipass_dfa_lookahead (void) 502{ 503 /* Generally, we want haifa-sched:max_issue() to look ahead as far 504 as many instructions can be executed on a cycle, i.e., 505 issue_rate. */ 506 if (reload_completed) 507 return ix86_issue_rate (); 508 /* Don't use lookahead for pre-reload schedule to save compile time. */ 509 return 0; 510} 511 512/* Return true if target platform supports macro-fusion. */ 513 514bool 515ix86_macro_fusion_p () 516{ 517 return TARGET_FUSE_CMP_AND_BRANCH; 518} 519 520/* Check whether current microarchitecture support macro fusion 521 for insn pair "CONDGEN + CONDJMP". Refer to 522 "Intel Architectures Optimization Reference Manual". 
*/

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  /* Only simple conditional jumps can be fused.  */
  if (!any_condjump_p (condjmp))
    return false;

  /* CONDJMP must read the flags register that CONDGEN writes, otherwise
     the pair cannot fuse.  */
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
	 a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      /* Skip the compare-operand checks below; this pattern has no
	 COMPARE to inspect.  */
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
	   && condgen_type != TYPE_ICMP
	   && condgen_type != TYPE_INCDEC
	   && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  /* For alu+jmp fusion the flag-setting insn is a PARALLEL: locate the
     COMPARE set and the ALU set inside it.  */
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether conditional jump use Sign or Overflow Flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following is the case that macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}
