ppro.md revision 256281
;; Scheduling for the Intel P6 family of processors
;; Copyright (C) 2004, 2005 Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.

;; The P6 family includes the Pentium Pro, Pentium II, Pentium III, Celeron
;; and Xeon lines of CPUs.  The DFA scheduler description in this file is
;; based on information that can be found in the following three documents:
;;
;;    "P6 Family of Processors Hardware Developer's Manual",
;;    Intel, September 1999.
;;
;;    "Intel Architecture Optimization Manual",
;;    Intel, 1999 (Order Number: 245127-001).
;;
;;    "How to optimize for the Pentium family of microprocessors",
;;    by Agner Fog, PhD.
;;
;; The P6 pipeline has three major components:
;;   1) the FETCH/DECODE unit, an in-order issue front-end
;;   2) the DISPATCH/EXECUTE unit, which is the out-of-order core
;;   3) the RETIRE unit, an in-order retirement unit
;;
;; So, the P6 CPUs have out-of-order cores, but the instruction decoder and
;; retirement unit are naturally in-order.
;;
;;                       BUS INTERFACE UNIT
;;                     /                    \
;;                L1 ICACHE              L1 DCACHE
;;              /     |     \               |     \
;;       DECODER0  DECODER1  DECODER2  DISP/EXEC  RETIRE
;;              \     |     /               |        |
;;             INSTRUCTION POOL   __________|_______/
;;           (inc. reorder buffer)
;;
;; Since the P6 CPUs execute instructions out-of-order, the most important
;; consideration in performance tuning is making sure enough micro-ops are
;; ready for execution in the out-of-order core, while not stalling the
;; decoder.
;;
;; TODO:
;; - Find a less crude way to model complex instructions, in
;;   particular how many cycles they take to be decoded.
;; - Include decoder latencies in the total reservation latencies.
;;   This isn't necessary right now because we assume for every
;;   instruction that it never blocks a decoder.
;; - Figure out where the p0 and p1 reservations come from.  These
;;   appear not to be in the manual (e.g. why is cld "(p0+p1)*2"
;;   better than "(p0|p1)*4" ???)
;; - Lots more because I'm sure this is still far from optimal :-)

;; Six automata are declared.  The ppro_idiv and ppro_fdiv automata are
;; used to model issue latencies of idiv and fdiv type insns.
(define_automaton
  "ppro_decoder,ppro_core,ppro_idiv,ppro_fdiv,ppro_load,ppro_store")

;; Simple instructions of the register-register form have only one uop.
;; Load instructions are also only one uop.  Store instructions decode to
;; two uops, and simple read-modify instructions also take two uops.
;; Simple instructions of the register-memory form have two to three uops.
;; Simple read-modify-write instructions have four uops.  The rules for
;; the decoder are simple:
;;  - an instruction with 1 uop can be decoded by any of the three
;;    decoders in one cycle.
;;  - an instruction with 1 to 4 uops can be decoded only by decoder 0
;;    but still in only one cycle.
;;  - a complex (microcode) instruction can also only be decoded by
;;    decoder 0, and this takes an unspecified number of cycles.
;;
;; The goal is to schedule such that we have a few-one-one uops sequence
;; in each cycle, to decode as many instructions per cycle as possible.
(define_cpu_unit "decoder0,decoder1,decoder2" "ppro_decoder")

;; We first wish to find an instruction for decoder0, so exclude
;; decoder1 and decoder2 from being reserved until decoder 0 is
;; reserved.
(presence_set "decoder1,decoder2" "decoder0")

;; Most instructions can be decoded on any of the three decoders.
(define_reservation "decodern" "(decoder0|decoder1|decoder2)")

;; The out-of-order core has five pipelines.  During each cycle, the core
;; may dispatch zero or one uop on the port of any of the five pipelines
;; so the maximum number of dispatched uops per cycle is 5.  In practice,
;; 3 uops per cycle is more realistic.
;;
;; Two of the five pipelines contain several execution units:
;;
;;   Port 0     Port 1     Port 2     Port 3     Port 4
;;   ALU        ALU        LOAD       SAC        SDA
;;   FPU        JUE
;;   AGU        MMX
;;   MMX        P3FPU
;;   P3FPU
;;
;; (SAC=Store Address Calculation, SDA=Store Data Unit, P3FPU = SSE unit,
;;  JUE = Jump Execution Unit, AGU = Address Generation Unit)
;;
(define_cpu_unit "p0" "ppro_core")
(define_cpu_unit "p1" "ppro_core")
(define_cpu_unit "p2" "ppro_load")
(define_cpu_unit "p3" "ppro_store")
(define_cpu_unit "p4" "ppro_store")
(define_cpu_unit "idiv" "ppro_idiv")
(define_cpu_unit "fdiv" "ppro_fdiv")

;; Only the irregular instructions have to be modeled here.  A load
;; increases the latency by 2 or 3, or by nothing if the manual gives
;; a latency already.  Store latencies are not accounted for.
;;
;; The simple instructions follow a very regular pattern of 1 uop per
;; reg-reg operation, 1 uop per load on port 2, and 2 uops per store
;; on port 4 and port 3.  These instructions are modelled at the bottom
;; of this file.
;;
;; For microcoded instructions we don't know how many uops are produced.
;; These instructions are the "complex" ones in the Intel manuals.  All
;; we _do_ know is that they typically produce four or more uops, so
;; they can only be decoded on decoder0.  Modelling their latencies
;; doesn't make sense because we don't know how these instructions are
;; executed in the core.  So we just model that they can only be decoded
;; on decoder 0, and say that it takes a little while before the result
;; is available.
(define_insn_reservation "ppro_complex_insn" 6
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "other,multi,call,callv,str"))
  "decoder0")

;; imov with memory operands does not use the integer units.
(define_insn_reservation "ppro_imov" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imov")))
  "decodern,(p0|p1)")

(define_insn_reservation "ppro_imov_load" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imov")))
  "decodern,p2")

(define_insn_reservation "ppro_imov_store" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "imov")))
  "decoder0,p4+p3")

;; imovx always decodes to one uop, and also doesn't use the integer
;; units if it has memory operands.
(define_insn_reservation "ppro_imovx" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imovx")))
  "decodern,(p0|p1)")

(define_insn_reservation "ppro_imovx_load" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "imovx")))
  "decodern,p2")

;; lea executes on port 0 with latency one and throughput 1.
(define_insn_reservation "ppro_lea" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "lea")))
  "decodern,p0")

;; Shift and rotate execute on port 0 with latency and throughput 1.
;; The load and store units need to be reserved when memory operands
;; are involved.
(define_insn_reservation "ppro_shift_rotate" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decodern,p0")

(define_insn_reservation "ppro_shift_rotate_mem" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ishift,ishift1,rotate,rotate1")))
  "decoder0,p2+p0,p4+p3")

;; cld holds both p0 and p1 for two cycles.
(define_insn_reservation "ppro_cld" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "cld"))
  "decoder0,(p0+p1)*2")

;; The P6 has a sophisticated branch prediction mechanism to minimize
;; latencies due to branching.  In particular, it has a fast way to
;; execute branches that are taken multiple times (such as in loops).
;; Branches not taken suffer no penalty, and correctly predicted
;; branches cost only one fetch cycle.  Mispredicted branches are very
;; costly: typically 15 cycles and possibly as many as 26 cycles.
;;
;; Unfortunately all this makes it quite difficult to properly model
;; the latencies for the compiler.  Here I've made the choice to be
;; optimistic and assume branches are often predicted correctly, so
;; they have latency 1, and the decoders are not blocked.
;;
;; In addition, the model assumes a branch always decodes to only 1 uop,
;; which is not exactly true because there are a few instructions that
;; decode to 2 uops or microcode.  But this probably gives the best
;; results because we can assume these instructions can decode on all
;; decoders.
(define_insn_reservation "ppro_branch" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "ibr")))
  "decodern,p1")

;; ??? Indirect branches probably have worse latency than this.
(define_insn_reservation "ppro_indirect_branch" 6
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "ibr")))
  "decoder0,p2+p1")

(define_insn_reservation "ppro_leave" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "leave"))
  "decoder0,p2+(p0|p1),(p0|p1)")

;; imul has throughput one, but latency 4, and can only execute on port 0.
(define_insn_reservation "ppro_imul" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "imul")))
  "decodern,p0")

(define_insn_reservation "ppro_imul_mem" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none")
            (eq_attr "type" "imul")))
  "decoder0,p2+p0")

;; div and idiv are very similar, so we model them the same.
;; QI, HI, and SI have issue latency 12, 21, and 37, respectively.
;; These issue latencies are modelled via the ppro_idiv automaton.
;; Each idiv reservation keeps the "idiv" unit busy for the whole issue
;; latency: 12 cycles for QI, 21 for HI and 37 for SI, counting every
;; cycle after the decode step.
(define_insn_reservation "ppro_idiv_QI" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*2,(p0|p1)+idiv,idiv*9")

(define_insn_reservation "ppro_idiv_QI_load" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "QI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*9")

(define_insn_reservation "ppro_idiv_HI" 23
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*17")

(define_insn_reservation "ppro_idiv_HI_load" 23
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "HI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*18")

(define_insn_reservation "ppro_idiv_SI" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,(p0+idiv)*3,(p0|p1)+idiv,idiv*33")

(define_insn_reservation "ppro_idiv_SI_load" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SI")
                 (eq_attr "type" "idiv"))))
  "decoder0,p2+p0+idiv,p0+idiv,(p0|p1)+idiv,idiv*34")

;; Floating point operations always execute on port 0.
;; ??? where do these latencies come from? fadd has latency 3 and
;;     has throughput "1/cycle (align with FADD)".  What do they
;;     mean and how can we model that?
(define_insn_reservation "ppro_fop" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "fop")))
  "decodern,p0")

(define_insn_reservation "ppro_fop_load" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0")

(define_insn_reservation "ppro_fop_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "fop")))
  "decoder0,p0,p0,p0+p4+p3")

(define_insn_reservation "ppro_fop_both" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "fop")))
  "decoder0,p2+p0,p0+p4+p3")

(define_insn_reservation "ppro_fsgn" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "fsgn"))
  "decodern,p0")

(define_insn_reservation "ppro_fistp" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "fistp"))
  "decoder0,p0*2,p4+p3")

(define_insn_reservation "ppro_fcmov" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (eq_attr "type" "fcmov"))
  "decoder0,p0*2")

(define_insn_reservation "ppro_fcmp" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fcmp")))
  "decodern,p0")

(define_insn_reservation "ppro_fcmp_load" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fcmp")))
  "decoder0,p2+p0")

(define_insn_reservation "ppro_fmov" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmov")))
  "decodern,p0")

;; fmov loads and stores are split on mode: XFmode gets its own,
;; longer reservation; every other mode uses the short one.
(define_insn_reservation "ppro_fmov_load" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p2")

(define_insn_reservation "ppro_fmov_XF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p2+p0)*2")

(define_insn_reservation "ppro_fmov_store" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "!XF")
                 (eq_attr "type" "fmov"))))
  "decodern,p0")

(define_insn_reservation "ppro_fmov_XF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fmov"))))
  "decoder0,(p0+p4),(p0+p3)")

;; fmul executes on port 0 with latency 5.  It has issue latency 2,
;; but we don't model this.
(define_insn_reservation "ppro_fmul" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "fmul")))
  "decoder0,p0*2")

(define_insn_reservation "ppro_fmul_load" 6
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "fmul")))
  "decoder0,p2+p0,p0")

;; fdiv latencies depend on the mode of the operands.  XFmode gives
;; a latency of 38 cycles, DFmode gives 32, and SFmode gives latency 18.
;; Division by a power of 2 takes only 9 cycles, but we cannot model
;; that.  Throughput is equal to latency - 1, which we model using the
;; ppro_fdiv automaton.
;; In every fdiv reservation the "fdiv" unit is held for latency - 1
;; cycles of the register form (17 for SF, 31 for DF, 37 for XF).
(define_insn_reservation "ppro_fdiv_SF" 18
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*16")

(define_insn_reservation "ppro_fdiv_SF_load" 19
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*16")

(define_insn_reservation "ppro_fdiv_DF" 32
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*30")

(define_insn_reservation "ppro_fdiv_DF_load" 33
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "DF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*30")

(define_insn_reservation "ppro_fdiv_XF" 38
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decodern,p0+fdiv,fdiv*36")

(define_insn_reservation "ppro_fdiv_XF_load" 39
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "XF")
                 (eq_attr "type" "fdiv,fpspc"))))
  "decoder0,p2+p0+fdiv,fdiv*36")

;; MMX instructions can execute on either port 0 or port 1 with a
;; throughput of 1/cycle.
;;   on port 0:	- ALU (latency 1)
;;		- Multiplier Unit (latency 3)
;;   on port 1:	- ALU (latency 1)
;;		- Shift Unit (latency 1)
;;
;; MMX instructions are either of the type reg-reg, or read-modify, and
;; except for mmxshft and mmxmul they can execute on port 0 or port 1,
;; so they behave as "simple" instructions that need no special modelling.
;; We only have to model mmxshft and mmxmul.
;; mmxshft executes on port 1; the register form is a single uop that
;; any decoder can handle.
(define_insn_reservation "ppro_mmx_shft" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxshft")))
  "decodern,p1")

;; Load form of mmxshft: must test memory "load".  Testing "none" here
;; would merely duplicate the condition of ppro_mmx_shft above, leaving
;; the load form without any reservation of its own.
(define_insn_reservation "ppro_mmx_shft_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxshft")))
  "decoder0,p2+p1")

;; mmxmul executes on port 0 with latency 3.
(define_insn_reservation "ppro_mmx_mul" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (eq_attr "type" "mmxmul")))
  "decodern,p0")

;; Load form of mmxmul: must test memory "load" for the same reason as
;; ppro_mmx_shft_load; it additionally reserves the load port p2.
(define_insn_reservation "ppro_mmx_mul_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "mmxmul")))
  "decoder0,p2+p0")

(define_insn_reservation "ppro_sse_mmxcvt" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "DI")
            (eq_attr "type" "mmxcvt")))
  "decodern,p1")

;; FIXME: These are Pentium III only, but we cannot tell here if
;; we're generating code for PentiumPro/Pentium II or Pentium III
;; (define_insn_reservation "ppro_sse_mmxshft" 2
;;			 (and (eq_attr "cpu" "pentiumpro,generic32")
;;			      (and (eq_attr "mode" "DI")
;;				   (eq_attr "type" "mmxshft")))
;;			 "decodern,p0")

;; SSE is very complicated, and takes a bit more effort.
;; ??? I assumed that all SSE instructions decode on decoder0,
;;     but is this correct?

;; The sfence instruction.
(define_insn_reservation "ppro_sse_sfence" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "unknown")
            (eq_attr "type" "sse")))
  "decoder0,p4+p3")

;; FIXME: This reservation is all wrong when we're scheduling sqrtss.
(define_insn_reservation "ppro_sse_SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sse")))
  "decodern,p0")

;; Scalar (SFmode) SSE reservations.  Register forms test memory "none";
;; the matching _load forms test memory "load" and also reserve p2.
(define_insn_reservation "ppro_sse_add_SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd"))))
  "decodern,p1")

(define_insn_reservation "ppro_sse_add_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,p2+p1")

(define_insn_reservation "ppro_sse_cmp_SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1")

(define_insn_reservation "ppro_sse_cmp_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p2+p1")

(define_insn_reservation "ppro_sse_comi_SF" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decodern,p0")

(define_insn_reservation "ppro_sse_comi_SF_load" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssecomi"))))
  "decoder0,p2+p0")

(define_insn_reservation "ppro_sse_mul_SF" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decodern,p0")

(define_insn_reservation "ppro_sse_mul_SF_load" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p2+p0")

;; FIXME: ssediv doesn't close p0 for 17 cycles, surely???
(define_insn_reservation "ppro_sse_div_SF" 18
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*17")

;; Load form of the scalar divide: must test memory "load".  Testing
;; "none" would duplicate the condition of ppro_sse_div_SF above and the
;; p2 reservation below would never be used.
(define_insn_reservation "ppro_sse_div_SF_load" 18
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0),p0*16")

(define_insn_reservation "ppro_sse_icvt_SF" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "SF")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_icvt_SI" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "SI")
            (eq_attr "type" "sseicvt")))
  "decoder0,(p2+p1)")

(define_insn_reservation "ppro_sse_mov_SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)")

(define_insn_reservation "ppro_sse_mov_SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2+(p0|p1)")

(define_insn_reservation "ppro_sse_mov_SF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p4+p3")

;; Packed (V4SFmode) SSE reservations.  These hold their execution port
;; for two cycles in most cases.
(define_insn_reservation "ppro_sse_V4SF" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "mode" "V4SF")
            (eq_attr "type" "sse")))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_add_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_add_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sseadd"))))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_cmp_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,p1*2")

(define_insn_reservation "ppro_sse_cmp_V4SF_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecmp"))))
  "decoder0,(p2+p1)*2")

(define_insn_reservation "ppro_sse_cvt_V4SF" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1*2")

;; Complement of ppro_sse_cvt_V4SF: every ssecvt whose memory attribute
;; is neither "none" nor "unknown".  The type tested here must be
;; "ssecvt"; testing "ssecmp" would overlap ppro_sse_cmp_V4SF_load and
;; leave memory-form conversions unmodelled.
(define_insn_reservation "ppro_sse_cvt_V4SF_other" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "!none,unknown")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssecvt"))))
  "decoder0,p1,p4+p3")

(define_insn_reservation "ppro_sse_mul_V4SF" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,p0*2")

(define_insn_reservation "ppro_sse_mul_V4SF_load" 5
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemul"))))
  "decoder0,(p2+p0)*2")

;; FIXME: p0 really closed this long???
;; The packed divide holds p0 for 34 cycles in both forms; the load
;; form shares the first two of them with p2.
(define_insn_reservation "ppro_sse_div_V4SF" 48
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,p0*34")

(define_insn_reservation "ppro_sse_div_V4SF_load" 48
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssediv"))))
  "decoder0,(p2+p0)*2,p0*32")

(define_insn_reservation "ppro_sse_log_V4SF" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decodern,p1")

(define_insn_reservation "ppro_sse_log_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "sselog,sselog1"))))
  "decoder0,(p2+p1)")

(define_insn_reservation "ppro_sse_mov_V4SF" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p0|p1)*2")

(define_insn_reservation "ppro_sse_mov_V4SF_load" 2
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,p2*2")

(define_insn_reservation "ppro_sse_mov_V4SF_store" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (and (eq_attr "mode" "V4SF")
                 (eq_attr "type" "ssemov"))))
  "decoder0,(p4+p3)*2")

;; All other instructions are modelled as simple instructions.
;; We have already modelled all i387 floating point instructions, so all
;; other instructions execute on either port 0 or port 1.  This includes
;; the ALU units, and the MMX units.
;;
;; reg-reg instructions produce 1 uop so they can be decoded on any of
;; the three decoders.
;; Register form (memory "none" or "unknown"): any decoder, then
;; either p0 or p1.
(define_insn_reservation "ppro_insn" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "none,unknown")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decodern,(p0|p1)")

;; read-modify and register-memory instructions have 2 or three uops,
;; so they have to be decoded on decoder0.
(define_insn_reservation "ppro_insn_load" 3
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "load")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1)")

(define_insn_reservation "ppro_insn_store" 1
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "store")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,(p0|p1),p4+p3")

;; read-modify-store instructions produce 4 uops so they have to be
;; decoded on decoder0 as well.
(define_insn_reservation "ppro_insn_both" 4
  (and (eq_attr "cpu" "pentiumpro,generic32")
       (and (eq_attr "memory" "both")
            (eq_attr "type" "alu,alu1,negnot,incdec,icmp,test,setcc,icmov,push,pop,fxch,sseiadd,sseishft,sseimul,mmx,mmxadd,mmxcmp")))
  "decoder0,p2+(p0|p1),p4+p3")