;; Scheduling for Core 2 and derived processors.
;; Copyright (C) 2004-2020 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; The scheduling description in this file is based on the one in ppro.md,
;; with additional information obtained from
;;
;;    "How to optimize for the Pentium family of microprocessors",
;;    by Agner Fog, PhD.
;;
;; The major difference from the P6 pipeline is one extra decoder, and
;; one extra execute unit.  Due to micro-op fusion, many insns no longer
;; need to be decoded in decoder 0, but can be handled by all of them.

;; The core2_idiv, core2_fdiv and core2_ssediv automata are used to
;; model issue latencies of idiv, fdiv and ssediv type insns.
(define_automaton "core2_decoder,core2_core,core2_idiv,core2_fdiv,core2_ssediv,core2_load,core2_store")

;; The CPU domain of an insn, used for Core i7 bypass latencies.
;; x87 types are "float"; SSE types are classified by their mode
;; (floating-point modes -> "float", SImode -> "int", anything else
;; -> "simd"); MMX types are "simd"; every other type is "int".
(define_attr "i7_domain" "int,float,simd"
  (cond [(eq_attr "type" "fmov,fop,fsgn,fmul,fdiv,fpspc,fcmov,fcmp,fxch,fistp,fisttp,frndint")
	   (const_string "float")
	 (eq_attr "type" "sselog,sselog1,sseiadd,sseiadd1,sseishft,sseishft1,sseimul,
			  sse,ssemov,sseadd,sseadd1,ssemul,ssecmp,ssecomi,ssecvt,
			  ssecvt1,sseicvt,ssediv,sseins,ssemuladd,sse4arg")
	   (cond [(eq_attr "mode" "V4DF,V8SF,V2DF,V4SF,SF,DF")
		    (const_string "float")
		  (eq_attr "mode" "SI")
		    (const_string "int")]
		 (const_string "simd"))
	 (eq_attr "type" "mmx,mmxmov,mmxadd,mmxmul,mmxcmp,mmxcvt,mmxshft")
	   (const_string "simd")]
	(const_string "int")))

;; Decoding follows the same rules as for the Pentium Pro:
;; - an instruction with 1 uop can be decoded by any of the decoders
;;   in one cycle.
;; - an instruction with 1 to 4 uops can be decoded only by decoder 0
;;   but still in only one cycle.
;; - a complex (microcode) instruction can also only be decoded by
;;   decoder 0, and this takes an unspecified number of cycles.
;;
;; The goal is to schedule such that each cycle starts with a multi-uop
;; insn on decoder 0 followed by single-uop insns on the remaining
;; decoders, to decode as many instructions per cycle as possible.
;; Four decoder units are described here.
(define_cpu_unit "c2_decoder0" "core2_decoder")
(define_cpu_unit "c2_decoder1" "core2_decoder")
(define_cpu_unit "c2_decoder2" "core2_decoder")
(define_cpu_unit "c2_decoder3" "core2_decoder")

;; We first wish to find an instruction for c2_decoder0, so exclude
;; c2_decoder1, c2_decoder2 and c2_decoder3 from being reserved until
;; c2_decoder0 is reserved.
(presence_set "c2_decoder1" "c2_decoder0")
(presence_set "c2_decoder2" "c2_decoder0")
(presence_set "c2_decoder3" "c2_decoder0")

;; Most instructions can be decoded on any of the four decoders.
(define_reservation "c2_decodern" "(c2_decoder0|c2_decoder1|c2_decoder2|c2_decoder3)")

;; The out-of-order core has six pipelines.  These are similar to the
;; Pentium Pro's five pipelines.  Port 2 is responsible for memory loads,
;; port 3 for store address calculations, port 4 for memory stores, and
;; ports 0, 1 and 5 for everything else.

;; Execution ports, plus one unit per divider.  Each divider unit lives
;; in an automaton of its own.
(define_cpu_unit "c2_p0,c2_p1,c2_p5" "core2_core")
(define_cpu_unit "c2_p2" "core2_load")
(define_cpu_unit "c2_p3,c2_p4" "core2_store")
(define_cpu_unit "c2_idiv" "core2_idiv")
(define_cpu_unit "c2_fdiv" "core2_fdiv")
(define_cpu_unit "c2_ssediv" "core2_ssediv")

;; Only the irregular instructions have to be modeled here.  A load
;; increases the latency by 2 or 3, or by nothing if the manual gives
;; a latency already.  Store latencies are not accounted for.
;;
;; The simple instructions follow a very regular pattern of 1 uop per
;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
;; on port 4 and port 3.  These instructions are modelled at the bottom
;; of this file.
;;
;; For microcoded instructions we don't know how many uops are produced.
;; These instructions are the "complex" ones in the Intel manuals.  All
;; we _do_ know is that they typically produce four or more uops, so
;; they can only be decoded on c2_decoder0.  Modelling their latencies
;; doesn't make sense because we don't know how these instructions are
;; executed in the core.  So we just model that they can only be decoded
;; on decoder 0, and say that it takes a little while before the result
;; is available.
(define_insn_reservation "c2_complex_insn" 6
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "other,multi,str"))
  "c2_decoder0")

;; Calls likewise only reserve decoder 0.
(define_insn_reservation "c2_call" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "call,callv"))
  "c2_decoder0")

;; imov with memory operands does not use the integer units.
;; imovx always decodes to one uop, and also doesn't use the integer
;; units if it has memory operands.
(define_insn_reservation "c2_imov" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "imov,imovx")))
  "c2_decodern,(c2_p0|c2_p1|c2_p5)")

;; Loading moves only occupy the load port.
(define_insn_reservation "c2_imov_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "imov,imovx")))
  "c2_decodern,c2_p2")

;; Storing moves occupy both store ports in the same cycle.
(define_insn_reservation "c2_imov_store" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (eq_attr "type" "imov")))
  "c2_decodern,c2_p4+c2_p3")

;; Conditional moves must go through decoder 0 and hold one of the
;; general ports for two cycles.
(define_insn_reservation "c2_icmov" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "icmov")))
  "c2_decoder0,(c2_p0|c2_p1|c2_p5)*2")

(define_insn_reservation "c2_icmov_load" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "icmov")))
  "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5)*2")

;; Pushing a register is a plain store; pushing from memory needs a
;; load first and has to be decoded on decoder 0.
(define_insn_reservation "c2_push_reg" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (eq_attr "type" "push")))
  "c2_decodern,c2_p4+c2_p3")

(define_insn_reservation "c2_push_mem" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "both")
	    (eq_attr "type" "push")))
  "c2_decoder0,c2_p2,c2_p4+c2_p3")

;; lea executes on port 0 with latency one and throughput 1.
(define_insn_reservation "c2_lea" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "lea")))
  "c2_decodern,c2_p0")

;; Shift and rotate decode as two uops which can go to port 0 or 5.
;; The load and store units need to be reserved when memory operands
;; are involved.
(define_insn_reservation "c2_shift_rotate" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "c2_decodern,(c2_p0|c2_p5)")

;; Any memory operand ("!none") reserves the load port first and the
;; store ports last, whatever the kind of memory access.
(define_insn_reservation "c2_shift_rotate_mem" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "c2_decoder0,c2_p2,(c2_p0|c2_p5),c2_p4+c2_p3")

;; See comments in ppro.md for the corresponding reservation.
(define_insn_reservation "c2_branch" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "ibr")))
  "c2_decodern,c2_p5")

;; ??? Indirect branches probably have worse latency than this.
(define_insn_reservation "c2_indirect_branch" 6
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (eq_attr "type" "ibr")))
  "c2_decoder0,c2_p2+c2_p5")

;; leave: a load together with an ALU op on port 0 or 1 in the first
;; cycle, followed by a second ALU op on port 0 or 1.
(define_insn_reservation "c2_leave" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "leave"))
  "c2_decoder0,c2_p2+(c2_p0|c2_p1),(c2_p0|c2_p1)")

;; mul and imul with two/three operands only execute on port 1 for HImode
;; and SImode, port 0 for DImode.
;; HImode and SImode multiplies: port 1.
(define_insn_reservation "c2_imul_hisi" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "HI,SI")
		 (eq_attr "type" "imul"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_imul_hisi_mem" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (and (eq_attr "mode" "HI,SI")
		 (eq_attr "type" "imul"))))
  "c2_decoder0,c2_p2+c2_p1")

;; DImode multiplies: port 0.
(define_insn_reservation "c2_imul_di" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "DI")
		 (eq_attr "type" "imul"))))
  "c2_decodern,c2_p0")

(define_insn_reservation "c2_imul_di_mem" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (and (eq_attr "mode" "DI")
		 (eq_attr "type" "imul"))))
  "c2_decoder0,c2_p2+c2_p0")

;; div and idiv are very similar, so we model them the same.
;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
;; These issue latencies are modelled via the core2_idiv automaton.
;; QImode: c2_idiv stays reserved for 12 cycles (2 + 1 + 9).
(define_insn_reservation "c2_idiv_QI" 19
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "QI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,(c2_p0+c2_idiv)*2,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")

;; Same divider occupancy (1 + 1 + 1 + 9); the load port is taken in the
;; first execution cycle.
(define_insn_reservation "c2_idiv_QI_load" 19
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "QI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*9")

;; HImode: c2_idiv stays reserved for 21 cycles (3 + 1 + 17).
(define_insn_reservation "c2_idiv_HI" 23
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "HI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*17")

;; 1 + 1 + 1 + 18 = 21 cycles on c2_idiv.
(define_insn_reservation "c2_idiv_HI_load" 23
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "HI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*18")

;; SImode: c2_idiv stays reserved for 37 cycles (3 + 1 + 33).
(define_insn_reservation "c2_idiv_SI" 39
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,(c2_p0+c2_idiv)*3,(c2_p0|c2_p1)+c2_idiv,c2_idiv*33")

;; 1 + 1 + 1 + 34 = 37 cycles on c2_idiv.
(define_insn_reservation "c2_idiv_SI_load" 39
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "SI")
		 (eq_attr "type" "idiv"))))
  "c2_decoder0,c2_p2+c2_p0+c2_idiv,c2_p0+c2_idiv,(c2_p0|c2_p1)+c2_idiv,c2_idiv*34")

;; x87 floating point operations.
;; fxch only needs a decoder slot.
(define_insn_reservation "c2_fxch" 0
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "fxch"))
  "c2_decodern")

;; Generic x87 arithmetic, split by the kind of memory access.
(define_insn_reservation "c2_fop" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none,unknown")
	    (eq_attr "type" "fop")))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_fop_load" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "fop")))
  "c2_decoder0,c2_p2+c2_p1,c2_p1")

(define_insn_reservation "c2_fop_store" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (eq_attr "type" "fop")))
  "c2_decoder0,c2_p0,c2_p0,c2_p0+c2_p4+c2_p3")

(define_insn_reservation "c2_fop_both" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "both")
	    (eq_attr "type" "fop")))
  "c2_decoder0,c2_p2+c2_p0,c2_p0+c2_p4+c2_p3")

(define_insn_reservation "c2_fsgn" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "fsgn"))
  "c2_decodern,c2_p0")

;; fistp: two cycles on port 0, then the store ports.
(define_insn_reservation "c2_fistp" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "fistp"))
  "c2_decoder0,c2_p0*2,c2_p4+c2_p3")

(define_insn_reservation "c2_fcmov" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (eq_attr "type" "fcmov"))
  "c2_decoder0,c2_p0*2")

(define_insn_reservation "c2_fcmp" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "fcmp")))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_fcmp_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "fcmp")))
  "c2_decoder0,c2_p2+c2_p1")

;; x87 moves.  XFmode loads and stores get reservations of their own:
;; they are restricted to decoder 0 and occupy the memory ports for
;; two cycles.
(define_insn_reservation "c2_fmov" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "fmov")))
  "c2_decodern,c2_p0")

(define_insn_reservation "c2_fmov_load" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "!XF")
		 (eq_attr "type" "fmov"))))
  "c2_decodern,c2_p2")

(define_insn_reservation "c2_fmov_XF_load" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "XF")
		 (eq_attr "type" "fmov"))))
  "c2_decoder0,(c2_p2+c2_p0)*2")

(define_insn_reservation "c2_fmov_store" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (and (eq_attr "mode" "!XF")
		 (eq_attr "type" "fmov"))))
  "c2_decodern,c2_p3+c2_p4")

(define_insn_reservation "c2_fmov_XF_store" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (and (eq_attr "mode" "XF")
		 (eq_attr "type" "fmov"))))
  "c2_decoder0,(c2_p3+c2_p4),(c2_p3+c2_p4)")

;; fmul executes on port 0 with latency 5.  The reservation holds
;; port 0 for two cycles.
(define_insn_reservation "c2_fmul" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "fmul")))
  "c2_decoder0,c2_p0*2")

(define_insn_reservation "c2_fmul_load" 6
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "fmul")))
  "c2_decoder0,c2_p2+c2_p0,c2_p0")

;; fdiv latencies depend on the mode of the operands.  XFmode gives
;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
;; Division by a power of 2 takes only 9 cycles, but we cannot model
;; that.  Throughput is equal to latency - 1, which we model using the
;; core2_fdiv automaton.
;; SFmode: c2_fdiv stays reserved for 17 cycles (1 + 16).
(define_insn_reservation "c2_fdiv_SF" 18
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*16")

(define_insn_reservation "c2_fdiv_SF_load" 19
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "SF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*16")

;; DFmode: c2_fdiv stays reserved for 31 cycles (1 + 30).
(define_insn_reservation "c2_fdiv_DF" 32
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "DF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*30")

(define_insn_reservation "c2_fdiv_DF_load" 33
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "DF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*30")

;; XFmode: c2_fdiv stays reserved for 37 cycles (1 + 36).
(define_insn_reservation "c2_fdiv_XF" 38
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "XF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decodern,c2_p0+c2_fdiv,c2_fdiv*36")

(define_insn_reservation "c2_fdiv_XF_load" 39
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "XF")
		 (eq_attr "type" "fdiv,fpspc"))))
  "c2_decoder0,c2_p2+c2_p0+c2_fdiv,c2_fdiv*36")

;; MMX instructions.
;; MMX/SSE integer add: port 0 or port 5.
(define_insn_reservation "c2_mmx_add" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "mmxadd,sseiadd")))
  "c2_decodern,c2_p0|c2_p5")

;; In a reservation string `+' binds tighter than `|', so the port
;; alternative is parenthesized to keep the load port c2_p2 reserved
;; whichever execution port is chosen.
(define_insn_reservation "c2_mmx_add_load" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "mmxadd,sseiadd")))
  "c2_decodern,c2_p2+(c2_p0|c2_p5)")

(define_insn_reservation "c2_mmx_shft" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "mmxshft")))
  "c2_decodern,c2_p0|c2_p5")

(define_insn_reservation "c2_mmx_shft_load" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "mmxshft")))
  "c2_decoder0,c2_p2+c2_p1")

;; SSE integer shifts, split on whether an immediate is encoded
;; (length_immediate "!0") or not (length_immediate "0").
;; NOTE(review): the two *_load variants below do not reserve the load
;; port c2_p2, unlike the other load reservations in this file --
;; confirm whether that is intended.
(define_insn_reservation "c2_mmx_sse_shft" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "type" "sseishft")
		 (eq_attr "length_immediate" "!0"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_mmx_sse_shft_load" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "type" "sseishft")
		 (eq_attr "length_immediate" "!0"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_mmx_sse_shft1" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "type" "sseishft")
		 (eq_attr "length_immediate" "0"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_mmx_sse_shft1_load" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "type" "sseishft")
		 (eq_attr "length_immediate" "0"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_mmx_mul" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "mmxmul,sseimul")))
  "c2_decodern,c2_p1")

;; The load form must test memory "load"; with "none" its condition
;; would duplicate c2_mmx_mul above and never apply to a load.
(define_insn_reservation "c2_mmx_mul_load" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "mmxmul,sseimul")))
  "c2_decoder0,c2_p2+c2_p1")

(define_insn_reservation "c2_sse_mmxcvt" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "mode" "DI")
	    (eq_attr "type" "mmxcvt")))
  "c2_decodern,c2_p1")

;; FIXME: These are Pentium III only, but we cannot tell here if
;; we're generating code for PentiumPro/Pentium II or Pentium III
;; (define_insn_reservation "c2_sse_mmxshft" 2
;;			 (and (eq_attr "cpu" "core2,nehalem")
;;			      (and (eq_attr "mode" "TI")
;;				   (eq_attr "type" "mmxshft")))
;;			 "c2_decodern,c2_p0")

;; The sfence instruction.
(define_insn_reservation "c2_sse_sfence" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "unknown")
	    (eq_attr "type" "sse")))
  "c2_decoder0,c2_p4+c2_p3")

;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
(define_insn_reservation "c2_sse_SFDF" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "mode" "SF,DF")
	    (eq_attr "type" "sse")))
  "c2_decodern,c2_p0")

(define_insn_reservation "c2_sse_V4SF" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "mode" "V4SF")
	    (eq_attr "type" "sse")))
  "c2_decoder0,c2_p1*2")

;; SSE add and compare: port 1.
(define_insn_reservation "c2_sse_addcmp" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_sse_addcmp_load" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "sseadd,sseadd1,ssecmp,ssecomi")))
  "c2_decodern,c2_p2+c2_p1")

;; SSE multiply: port 0, latency 4 for the SF modes and 5 for the DF
;; modes.
(define_insn_reservation "c2_sse_mul_SF" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SF,V4SF")
		 (eq_attr "type" "ssemul"))))
  "c2_decodern,c2_p0")

(define_insn_reservation "c2_sse_mul_SF_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "SF,V4SF")
		 (eq_attr "type" "ssemul"))))
  "c2_decodern,c2_p2+c2_p0")

(define_insn_reservation "c2_sse_mul_DF" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "DF,V2DF")
		 (eq_attr "type" "ssemul"))))
  "c2_decodern,c2_p0")

(define_insn_reservation "c2_sse_mul_DF_load" 5
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "DF,V2DF")
		 (eq_attr "type" "ssemul"))))
  "c2_decodern,c2_p2+c2_p0")

;; SSE divide: one cycle on port 0, then c2_ssediv stays reserved for
;; 17 cycles (SF modes) or 31 cycles (DF modes).  The load forms test
;; memory "load" so that they are distinct from the register forms.
(define_insn_reservation "c2_sse_div_SF" 18
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SF,V4SF")
		 (eq_attr "type" "ssediv"))))
  "c2_decodern,c2_p0,c2_ssediv*17")

(define_insn_reservation "c2_sse_div_SF_load" 18
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "SF,V4SF")
		 (eq_attr "type" "ssediv"))))
  "c2_decodern,(c2_p2+c2_p0),c2_ssediv*17")

(define_insn_reservation "c2_sse_div_DF" 32
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "DF,V2DF")
		 (eq_attr "type" "ssediv"))))
  "c2_decodern,c2_p0,c2_ssediv*31")

(define_insn_reservation "c2_sse_div_DF_load" 32
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (and (eq_attr "mode" "DF,V2DF")
		 (eq_attr "type" "ssediv"))))
  "c2_decodern,(c2_p2+c2_p0),c2_ssediv*31")

;; FIXME: these have limited throughput
(define_insn_reservation "c2_sse_icvt_SF" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SF")
		 (eq_attr "type" "sseicvt"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_sse_icvt_SF_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (and (eq_attr "mode" "SF")
		 (eq_attr "type" "sseicvt"))))
  "c2_decodern,c2_p2+c2_p1")

(define_insn_reservation "c2_sse_icvt_DF" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "DF")
		 (eq_attr "type" "sseicvt"))))
  "c2_decoder0,c2_p0+c2_p1")

(define_insn_reservation "c2_sse_icvt_DF_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (and (eq_attr "mode" "DF")
		 (eq_attr "type" "sseicvt"))))
  "c2_decoder0,(c2_p2+c2_p1)")

(define_insn_reservation "c2_sse_icvt_SI" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (and (eq_attr "mode" "SI")
		 (eq_attr "type" "sseicvt"))))
  "c2_decodern,c2_p1")

(define_insn_reservation "c2_sse_icvt_SI_load" 3
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "!none")
	    (and (eq_attr "mode" "SI")
		 (eq_attr "type" "sseicvt"))))
  "c2_decodern,(c2_p2+c2_p1)")

;; SSE moves: any of ports 0, 1 and 5 for register moves, the load
;; port for loads and both store ports for stores.
(define_insn_reservation "c2_sse_mov" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none")
	    (eq_attr "type" "ssemov")))
  "c2_decodern,(c2_p0|c2_p1|c2_p5)")

(define_insn_reservation "c2_sse_mov_load" 2
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "ssemov")))
  "c2_decodern,c2_p2")

(define_insn_reservation "c2_sse_mov_store" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (eq_attr "type" "ssemov")))
  "c2_decodern,c2_p4+c2_p3")

;; All other instructions are modelled as simple instructions.
;; We have already modelled all i387 floating point instructions, so all
;; other instructions execute on either port 0, 1 or 5.  This includes
;; the ALU units, and the MMX units.
;;
;; reg-reg instructions produce 1 uop so they can be decoded on any of
;; the four decoders.  Loads benefit from micro-op fusion and can be
;; treated in the same way.
;; Register-only form: any decoder, any of ports 0, 1 and 5.
(define_insn_reservation "c2_insn" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "none,unknown")
	    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
  "c2_decodern,(c2_p0|c2_p1|c2_p5)")

;; Load form: the load port first, then the execution port.  This
;; reservation also covers pop.
(define_insn_reservation "c2_insn_load" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "load")
	    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
  "c2_decodern,c2_p2,(c2_p0|c2_p1|c2_p5)")

;; register-memory instructions have three uops, so they have to be
;; decoded on c2_decoder0.
(define_insn_reservation "c2_insn_store" 1
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "store")
	    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,sseishft1,mmx,mmxcmp")))
  "c2_decoder0,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")

;; read-modify-store instructions produce 4 uops so they have to be
;; decoded on c2_decoder0 as well.
(define_insn_reservation "c2_insn_both" 4
  (and (eq_attr "cpu" "core2,nehalem")
       (and (eq_attr "memory" "both")
	    (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,pop,sseishft1,mmx,mmxcmp")))
  "c2_decoder0,c2_p2,(c2_p0|c2_p1|c2_p5),c2_p4+c2_p3")