1/* Scheduler hooks for IA-32 which implement CPU specific logic. 2 Copyright (C) 1988-2022 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify 7it under the terms of the GNU General Public License as published by 8the Free Software Foundation; either version 3, or (at your option) 9any later version. 10 11GCC is distributed in the hope that it will be useful, 12but WITHOUT ANY WARRANTY; without even the implied warranty of 13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14GNU General Public License for more details. 15 16You should have received a copy of the GNU General Public License 17along with GCC; see the file COPYING3. If not see 18<http://www.gnu.org/licenses/>. */ 19 20#define IN_TARGET_CODE 1 21 22#include "config.h" 23#include "system.h" 24#include "coretypes.h" 25#include "backend.h" 26#include "rtl.h" 27#include "tree.h" 28#include "cfghooks.h" 29#include "tm_p.h" 30#include "target.h" 31#include "insn-config.h" 32#include "insn-attr.h" 33#include "insn-opinit.h" 34#include "recog.h" 35 36/* Return the maximum number of instructions a cpu can issue. 
*/ 37 38int 39ix86_issue_rate (void) 40{ 41 switch (ix86_tune) 42 { 43 case PROCESSOR_PENTIUM: 44 case PROCESSOR_LAKEMONT: 45 case PROCESSOR_BONNELL: 46 case PROCESSOR_SILVERMONT: 47 case PROCESSOR_KNL: 48 case PROCESSOR_KNM: 49 case PROCESSOR_INTEL: 50 case PROCESSOR_K6: 51 case PROCESSOR_BTVER2: 52 case PROCESSOR_PENTIUM4: 53 case PROCESSOR_NOCONA: 54 return 2; 55 56 case PROCESSOR_PENTIUMPRO: 57 case PROCESSOR_ATHLON: 58 case PROCESSOR_K8: 59 case PROCESSOR_AMDFAM10: 60 case PROCESSOR_BTVER1: 61 return 3; 62 63 case PROCESSOR_BDVER1: 64 case PROCESSOR_BDVER2: 65 case PROCESSOR_BDVER3: 66 case PROCESSOR_BDVER4: 67 case PROCESSOR_ZNVER1: 68 case PROCESSOR_ZNVER2: 69 case PROCESSOR_ZNVER3: 70 case PROCESSOR_ZNVER4: 71 case PROCESSOR_CORE2: 72 case PROCESSOR_NEHALEM: 73 case PROCESSOR_SANDYBRIDGE: 74 case PROCESSOR_HASWELL: 75 case PROCESSOR_TREMONT: 76 case PROCESSOR_ALDERLAKE: 77 case PROCESSOR_GENERIC: 78 return 4; 79 80 default: 81 return 1; 82 } 83} 84 85/* Return true iff USE_INSN has a memory address with operands set by 86 SET_INSN. */ 87 88bool 89ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn) 90{ 91 int i; 92 extract_insn_cached (use_insn); 93 for (i = recog_data.n_operands - 1; i >= 0; --i) 94 if (MEM_P (recog_data.operand[i])) 95 { 96 rtx addr = XEXP (recog_data.operand[i], 0); 97 if (modified_in_p (addr, set_insn) != 0) 98 { 99 /* No AGI stall if SET_INSN is a push or pop and USE_INSN 100 has SP based memory (unless index reg is modified in a pop). 
*/ 101 rtx set = single_set (set_insn); 102 if (set 103 && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set))) 104 || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set))))) 105 { 106 struct ix86_address parts; 107 if (ix86_decompose_address (addr, &parts) 108 && parts.base == stack_pointer_rtx 109 && (parts.index == NULL_RTX 110 || MEM_P (SET_DEST (set)) 111 || !modified_in_p (parts.index, set_insn))) 112 return false; 113 } 114 return true; 115 } 116 return false; 117 } 118 return false; 119} 120 121/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set 122 by DEP_INSN and nothing set by DEP_INSN. */ 123 124static bool 125ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type) 126{ 127 rtx set, set2; 128 129 /* Simplify the test for uninteresting insns. */ 130 if (insn_type != TYPE_SETCC 131 && insn_type != TYPE_ICMOV 132 && insn_type != TYPE_FCMOV 133 && insn_type != TYPE_IBR) 134 return false; 135 136 if ((set = single_set (dep_insn)) != 0) 137 { 138 set = SET_DEST (set); 139 set2 = NULL_RTX; 140 } 141 else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL 142 && XVECLEN (PATTERN (dep_insn), 0) == 2 143 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET 144 && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET) 145 { 146 set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 147 set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0)); 148 } 149 else 150 return false; 151 152 if (!REG_P (set) || REGNO (set) != FLAGS_REG) 153 return false; 154 155 /* This test is true if the dependent insn reads the flags but 156 not any other potentially set register. */ 157 if (!reg_overlap_mentioned_p (set, PATTERN (insn))) 158 return false; 159 160 if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn))) 161 return false; 162 163 return true; 164} 165 166/* Helper function for exact_store_load_dependency. 167 Return true if addr is found in insn. 
*/ 168static bool 169exact_dependency_1 (rtx addr, rtx insn) 170{ 171 enum rtx_code code; 172 const char *format_ptr; 173 int i, j; 174 175 code = GET_CODE (insn); 176 switch (code) 177 { 178 case MEM: 179 if (rtx_equal_p (addr, insn)) 180 return true; 181 break; 182 case REG: 183 CASE_CONST_ANY: 184 case SYMBOL_REF: 185 case CODE_LABEL: 186 case PC: 187 case EXPR_LIST: 188 return false; 189 default: 190 break; 191 } 192 193 format_ptr = GET_RTX_FORMAT (code); 194 for (i = 0; i < GET_RTX_LENGTH (code); i++) 195 { 196 switch (*format_ptr++) 197 { 198 case 'e': 199 if (exact_dependency_1 (addr, XEXP (insn, i))) 200 return true; 201 break; 202 case 'E': 203 for (j = 0; j < XVECLEN (insn, i); j++) 204 if (exact_dependency_1 (addr, XVECEXP (insn, i, j))) 205 return true; 206 break; 207 } 208 } 209 return false; 210} 211 212/* Return true if there exists exact dependency for store & load, i.e. 213 the same memory address is used in them. */ 214static bool 215exact_store_load_dependency (rtx_insn *store, rtx_insn *load) 216{ 217 rtx set1, set2; 218 219 set1 = single_set (store); 220 if (!set1) 221 return false; 222 if (!MEM_P (SET_DEST (set1))) 223 return false; 224 set2 = single_set (load); 225 if (!set2) 226 return false; 227 if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2))) 228 return true; 229 return false; 230} 231 232 233/* This function corrects the value of COST (latency) based on the relationship 234 between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength 235 DW. It should return the new value. 236 237 On x86 CPUs this is most commonly used to model the fact that valus of 238 registers used to compute address of memory operand needs to be ready 239 earlier than values of registers used in the actual operation. 
*/ 240 241int 242ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, 243 unsigned int) 244{ 245 enum attr_type insn_type, dep_insn_type; 246 enum attr_memory memory; 247 rtx set, set2; 248 int dep_insn_code_number; 249 250 /* Anti and output dependencies have zero cost on all CPUs. */ 251 if (dep_type != 0) 252 return 0; 253 254 dep_insn_code_number = recog_memoized (dep_insn); 255 256 /* If we can't recognize the insns, we can't really do anything. */ 257 if (dep_insn_code_number < 0 || recog_memoized (insn) < 0) 258 return cost; 259 260 insn_type = get_attr_type (insn); 261 dep_insn_type = get_attr_type (dep_insn); 262 263 switch (ix86_tune) 264 { 265 case PROCESSOR_PENTIUM: 266 case PROCESSOR_LAKEMONT: 267 /* Address Generation Interlock adds a cycle of latency. */ 268 if (insn_type == TYPE_LEA) 269 { 270 rtx addr = PATTERN (insn); 271 272 if (GET_CODE (addr) == PARALLEL) 273 addr = XVECEXP (addr, 0, 0); 274 275 gcc_assert (GET_CODE (addr) == SET); 276 277 addr = SET_SRC (addr); 278 if (modified_in_p (addr, dep_insn)) 279 cost += 1; 280 } 281 else if (ix86_agi_dependent (dep_insn, insn)) 282 cost += 1; 283 284 /* ??? Compares pair with jump/setcc. */ 285 if (ix86_flags_dependent (insn, dep_insn, insn_type)) 286 cost = 0; 287 288 /* Floating point stores require value to be ready one cycle earlier. */ 289 if (insn_type == TYPE_FMOV 290 && get_attr_memory (insn) == MEMORY_STORE 291 && !ix86_agi_dependent (dep_insn, insn)) 292 cost += 1; 293 break; 294 295 case PROCESSOR_PENTIUMPRO: 296 /* INT->FP conversion is expensive. */ 297 if (get_attr_fp_int_src (dep_insn)) 298 cost += 5; 299 300 /* There is one cycle extra latency between an FP op and a store. 
*/ 301 if (insn_type == TYPE_FMOV 302 && (set = single_set (dep_insn)) != NULL_RTX 303 && (set2 = single_set (insn)) != NULL_RTX 304 && rtx_equal_p (SET_DEST (set), SET_SRC (set2)) 305 && MEM_P (SET_DEST (set2))) 306 cost += 1; 307 308 memory = get_attr_memory (insn); 309 310 /* Show ability of reorder buffer to hide latency of load by executing 311 in parallel with previous instruction in case 312 previous instruction is not needed to compute the address. */ 313 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 314 && !ix86_agi_dependent (dep_insn, insn)) 315 { 316 /* Claim moves to take one cycle, as core can issue one load 317 at time and the next load can start cycle later. */ 318 if (dep_insn_type == TYPE_IMOV 319 || dep_insn_type == TYPE_FMOV) 320 cost = 1; 321 else if (cost > 1) 322 cost--; 323 } 324 break; 325 326 case PROCESSOR_K6: 327 /* The esp dependency is resolved before 328 the instruction is really finished. */ 329 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 330 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 331 return 1; 332 333 /* INT->FP conversion is expensive. */ 334 if (get_attr_fp_int_src (dep_insn)) 335 cost += 5; 336 337 memory = get_attr_memory (insn); 338 339 /* Show ability of reorder buffer to hide latency of load by executing 340 in parallel with previous instruction in case 341 previous instruction is not needed to compute the address. */ 342 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 343 && !ix86_agi_dependent (dep_insn, insn)) 344 { 345 /* Claim moves to take one cycle, as core can issue one load 346 at time and the next load can start cycle later. 
*/ 347 if (dep_insn_type == TYPE_IMOV 348 || dep_insn_type == TYPE_FMOV) 349 cost = 1; 350 else if (cost > 2) 351 cost -= 2; 352 else 353 cost = 1; 354 } 355 break; 356 357 case PROCESSOR_AMDFAM10: 358 case PROCESSOR_BDVER1: 359 case PROCESSOR_BDVER2: 360 case PROCESSOR_BDVER3: 361 case PROCESSOR_BDVER4: 362 case PROCESSOR_BTVER1: 363 case PROCESSOR_BTVER2: 364 /* Stack engine allows to execute push&pop instructions in parall. */ 365 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 366 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 367 return 0; 368 /* FALLTHRU */ 369 370 case PROCESSOR_ATHLON: 371 case PROCESSOR_K8: 372 memory = get_attr_memory (insn); 373 374 /* Show ability of reorder buffer to hide latency of load by executing 375 in parallel with previous instruction in case 376 previous instruction is not needed to compute the address. */ 377 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 378 && !ix86_agi_dependent (dep_insn, insn)) 379 { 380 enum attr_unit unit = get_attr_unit (insn); 381 int loadcost = 3; 382 383 /* Because of the difference between the length of integer and 384 floating unit pipeline preparation stages, the memory operands 385 for floating point are cheaper. 386 387 ??? For Athlon it the difference is most probably 2. */ 388 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) 389 loadcost = 3; 390 else 391 loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0; 392 393 if (cost >= loadcost) 394 cost -= loadcost; 395 else 396 cost = 0; 397 } 398 break; 399 400 case PROCESSOR_ZNVER1: 401 case PROCESSOR_ZNVER2: 402 case PROCESSOR_ZNVER3: 403 case PROCESSOR_ZNVER4: 404 /* Stack engine allows to execute push&pop instructions in parall. 
*/ 405 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 406 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 407 return 0; 408 409 memory = get_attr_memory (insn); 410 411 /* Show ability of reorder buffer to hide latency of load by executing 412 in parallel with previous instruction in case 413 previous instruction is not needed to compute the address. */ 414 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 415 && !ix86_agi_dependent (dep_insn, insn)) 416 { 417 enum attr_unit unit = get_attr_unit (insn); 418 int loadcost; 419 420 if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) 421 loadcost = 4; 422 else 423 loadcost = 7; 424 425 if (cost >= loadcost) 426 cost -= loadcost; 427 else 428 cost = 0; 429 } 430 break; 431 432 case PROCESSOR_CORE2: 433 case PROCESSOR_NEHALEM: 434 case PROCESSOR_SANDYBRIDGE: 435 case PROCESSOR_HASWELL: 436 case PROCESSOR_TREMONT: 437 case PROCESSOR_ALDERLAKE: 438 case PROCESSOR_GENERIC: 439 /* Stack engine allows to execute push&pop instructions in parall. */ 440 if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP) 441 && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP)) 442 return 0; 443 444 memory = get_attr_memory (insn); 445 446 /* Show ability of reorder buffer to hide latency of load by executing 447 in parallel with previous instruction in case 448 previous instruction is not needed to compute the address. */ 449 if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH) 450 && !ix86_agi_dependent (dep_insn, insn)) 451 { 452 if (cost >= 4) 453 cost -= 4; 454 else 455 cost = 0; 456 } 457 break; 458 459 case PROCESSOR_SILVERMONT: 460 case PROCESSOR_KNL: 461 case PROCESSOR_KNM: 462 case PROCESSOR_INTEL: 463 if (!reload_completed) 464 return cost; 465 466 /* Increase cost of integer loads. 
*/ 467 memory = get_attr_memory (dep_insn); 468 if (memory == MEMORY_LOAD || memory == MEMORY_BOTH) 469 { 470 enum attr_unit unit = get_attr_unit (dep_insn); 471 if (unit == UNIT_INTEGER && cost == 1) 472 { 473 if (memory == MEMORY_LOAD) 474 cost = 3; 475 else 476 { 477 /* Increase cost of ld/st for short int types only 478 because of store forwarding issue. */ 479 rtx set = single_set (dep_insn); 480 if (set && (GET_MODE (SET_DEST (set)) == QImode 481 || GET_MODE (SET_DEST (set)) == HImode)) 482 { 483 /* Increase cost of store/load insn if exact 484 dependence exists and it is load insn. */ 485 enum attr_memory insn_memory = get_attr_memory (insn); 486 if (insn_memory == MEMORY_LOAD 487 && exact_store_load_dependency (dep_insn, insn)) 488 cost = 3; 489 } 490 } 491 } 492 } 493 494 default: 495 break; 496 } 497 498 return cost; 499} 500 501/* How many alternative schedules to try. This should be as wide as the 502 scheduling freedom in the DFA, but no wider. Making this value too 503 large results extra work for the scheduler. */ 504 505int 506ia32_multipass_dfa_lookahead (void) 507{ 508 /* Generally, we want haifa-sched:max_issue() to look ahead as far 509 as many instructions can be executed on a cycle, i.e., 510 issue_rate. */ 511 if (reload_completed) 512 return ix86_issue_rate (); 513 /* Don't use lookahead for pre-reload schedule to save compile time. */ 514 return 0; 515} 516 517/* Return true if target platform supports macro-fusion. */ 518 519bool 520ix86_macro_fusion_p () 521{ 522 return TARGET_FUSE_CMP_AND_BRANCH; 523} 524 525/* Check whether current microarchitecture support macro fusion 526 for insn pair "CONDGEN + CONDJMP". Refer to 527 "Intel Architectures Optimization Reference Manual". 
*/

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  /* CONDJMP must be a conditional jump at all.  */
  if (!any_condjump_p (condjmp))
    return false;

  /* CONDJMP must read, and CONDGEN must write, the first fixed
     condition-code register.  */
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
	 a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      /* Skip the COMPARE/address checks below; they do not apply to the
	 stack-protector pattern.  */
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
	   && condgen_type != TYPE_ICMP
	   && condgen_type != TYPE_INCDEC
	   && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  /* For alu+jmp fusion the flag-setting insn is a PARALLEL: pick out the
     COMPARE SET and the ALU SET from its elements.  */
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether conditional jump use Sign or Overflow Flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following is the case that macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}