;; Scheduling description for cell processor.
;; Copyright (C) 2001-2020 Free Software Foundation, Inc.
;; Contributed by Sony Computer Entertainment, Inc.,


;; This file is free software; you can redistribute it and/or modify it under
;; the terms of the GNU General Public License as published by the Free
;; Software Foundation; either version 3 of the License, or (at your option)
;; any later version.

;; This file is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
;; for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; Sources: BE BOOK4 (/sfs/enc/doc/PPU_BookIV_DD3.0_latest.pdf)

;; BE Architecture *DD3.0 and DD3.1*
;; This file simulates the PPU processor unit backend of the pipeline,
;; manual P24.
;; manual P27, stall and flush points
;; IU, XU, VSU, dispatcher decodes and dispatches 2 insns per cycle in program
;; order, the grouped addresses are aligned by 8
;; This file only simulates the single-thread situation
;; XU executes all fixed point insns (3 units, a simple alu, a complex unit,
;; and load/store unit)
;; VSU executes all scalar floating point insns (a float unit),
;; VMX insns (VMX unit, 4 sub units, simple, permute, complex, floating point)

;; Dual issue combination
;; NOTE(review): the column alignment of this table has been lost, so the
;; column each X belongs to cannot be read from the rows below -- restore the
;; alignment from the manual before relying on it.
;; Columns, in order:
;;   FXU | LSU | BR | VMX (sx,cx,vsu_fp,fp_arith) | VMX (perm,vsu_ls,fp_ls)
;; Rows:
;;   FXU                          X
;;   LSU                          X X X
;;   BR                           X
;;   VMX(sx,cx,vsu_fp,fp_arith)   X
;;   VMX(perm,vsu_ls,fp_ls)       X
;; X are illegal combinations.
;; Dual issue exceptions:
;; (1) non-pipelined FXU instr in slot 0
;; (2) non-pipelined FPU instr in slot 0
;;     CSI instr (context-synchronizing insn)
;;     Microcode insn

;; BRU unit: bru (no register stall), bru_cr (cr register stall)
;; VSU unit: vus (vmx simple), vup (vmx permute), vuc (vmx complex),
;; vuf (vmx float), fpu (floats).  fpu_div is hypothetical, it is for
;; nonpipelined simulation.
;; Microcoded insns stall at least 7 cycles to get the first instr from ROM,
;; and micro instructions are not dual issued.

;; slot0 is older than slot1.
;; A non-pipelined insn needs to be in slot1 to avoid a 1 cycle stall.

;; There are different stall points:
;; IB2, only stalls one thread if the stall happens here, so try to stall
;; here as much as we can
;;   condition(1) insert nop, OR and ORI instruction form
;;   condition(2) flush happens, in case of: RAW, WAW, D-ERAT miss, or
;;   CR0-access while stdcx, or stwcx
;; IS2 stall ;; Page91 for details
;; VQ8 stall
;; An IS2 stall can be activated by a VQ8 stall and by trying to issue a vsu
;; instr to the vsu issue queue.

;; Commented-out alternative that places every unit in a single automaton:
;;(define_automaton "cellxu")

;;(define_cpu_unit "fxu_cell,lsu_cell,bru_cell,vsu1_cell,vsu2_cell" "cellxu")

;; ndfa
;; Four automata are declared; each cpu unit below is assigned to one of them.
(define_automaton "cellxu,cellvsu,cellbru,cell_mis")

;; Execution units, grouped by the automaton that tracks them.
(define_cpu_unit "fxu_cell,lsu_cell" "cellxu")
(define_cpu_unit "bru_cell" "cellbru")
(define_cpu_unit "vsu1_cell,vsu2_cell" "cellvsu")

;; The two dispatch slots of an issue group.
(define_cpu_unit "slot0,slot1" "cell_mis")

;; slot0 may only be reserved while slot1 is not reserved.
(absence_set "slot0" "slot1")

;; Shorthands used by the reservations in this file:
;;   nonpipeline - fxu, lsu and both vsu units reserved in the same cycle.
;;   slot01      - either dispatch slot.
(define_reservation "nonpipeline" "fxu_cell+lsu_cell+vsu1_cell+vsu2_cell")
(define_reservation "slot01" "slot0|slot1")


;; Load/store
;; lmw, lswi, lswx are only generated when optimizing for space, MC,
;; these instrs are not simulated.
;; In the reservation strings "," starts the next cycle and "+" reserves
;; units in the same cycle.

;; Plain loads: no sign extension, no update form.
(define_insn_reservation "cell-load" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "no"))
  "slot01,lsu_cell")

;; ldux, ldu, lbzux, lbzu, hardware breaks it down to two instrs,
;; if with 32bytes alignment, CMC
(define_insn_reservation "cell-load-ux" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load")
       (eq_attr "sign_extend" "no")
       (eq_attr "update" "yes"))
  "slot01,fxu_cell+lsu_cell")

;; lha, lhax, lhau, lhaux, lwa, lwax, lwaux, MC, latency unknown
;; 11/7, 11/8, 11/12
(define_insn_reservation "cell-load-ext" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load")
       (eq_attr "sign_extend" "yes"))
  "slot01,fxu_cell+lsu_cell")

;; lfs, lfsx, lfd, lfdx, 1 cycle
(define_insn_reservation "cell-fpload" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpload")
       (eq_attr "update" "no"))
  "vsu2_cell+lsu_cell+slot01")

;; lfsu, lfsux, lfdu, lfdux: 1 cycle (fpr), 2 cycles (gpr)
(define_insn_reservation "cell-fpload-update" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpload")
       (eq_attr "update" "yes"))
  "fxu_cell+vsu2_cell+lsu_cell+slot01")

;; Vector loads.
(define_insn_reservation "cell-vecload" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecload"))
  "slot01,vsu2_cell+lsu_cell")

;; st? stw(MC)
(define_insn_reservation "cell-store" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store")
       (eq_attr "update" "no"))
  "lsu_cell+slot01")

;; stdux, stdu (hardware breaks into store and add), 2 for update reg
(define_insn_reservation "cell-store-update" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store")
       (eq_attr "update" "yes"))
  "fxu_cell+lsu_cell+slot01")

;; Floating point stores, non-update form.
(define_insn_reservation "cell-fpstore" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpstore")
       (eq_attr "update" "no"))
  "vsu2_cell+lsu_cell+slot01")

;; Floating point stores, update form; additionally reserves fxu_cell.
(define_insn_reservation "cell-fpstore-update" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpstore")
       (eq_attr "update" "yes"))
  "vsu2_cell+fxu_cell+lsu_cell+slot01")

;; Vector stores.
(define_insn_reservation "cell-vecstore" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecstore"))
  "vsu2_cell+lsu_cell+slot01")

;; Integer latency is 2 cycles.
;; Covers the non-recording (dot "no") add/logical/shift/exts forms and the
;; 64-bit insert.
(define_insn_reservation "cell-integer" 2
  (and (eq_attr "cpu" "cell")
       (ior (eq_attr "type" "integer,trap,cntlz,isel")
            (and (eq_attr "type" "add,logical,shift,exts")
                 (eq_attr "dot" "no"))
            (and (eq_attr "type" "insert")
                 (eq_attr "size" "64"))))
  "slot01,fxu_cell")

;; Two integer latency is 4 cycles.
(define_insn_reservation "cell-two" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "two"))
  "slot01,fxu_cell,fxu_cell*2")

;; Three integer latency is 6 cycles.
(define_insn_reservation "cell-three" 6
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "three"))
  "slot01,fxu_cell,fxu_cell*4")

;; rlwimi, alter cr0
(define_insn_reservation "cell-insert" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "insert")
       (eq_attr "size" "32"))
  "slot01,fxu_cell")

;; cmpi, cmpli, cmpla, add, addo, sub, subo, alter cr0
(define_insn_reservation "cell-cmp" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "cmp"))
  "fxu_cell+slot01")

;;
;; add, addo, sub, subo, alter cr0, rldcli, rlwinm
;; Recording (dot "yes") forms that are not microcoded.
(define_insn_reservation "cell-fast-cmp" 2
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cell_micro" "not"))
  "slot01,fxu_cell")

;; Recording forms that are always microcoded.  Both dispatch slots are
;; reserved in the issue cycle.
(define_insn_reservation "cell-cmp-microcoded" 9
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "add,logical,shift,exts")
       (eq_attr "dot" "yes")
       (eq_attr "cell_micro" "always"))
  "slot0+slot1,fxu_cell,fxu_cell*7")

;; mulld
;; Issued from slot1; holds the "nonpipeline" reservation for the rest of
;; its latency.
(define_insn_reservation "cell-lmul" 15
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "64"))
  "slot1,nonpipeline,nonpipeline*13")

;; mulld. is microcoded
(define_insn_reservation "cell-lmul-cmp" 22
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mul")
       (eq_attr "dot" "yes")
       (eq_attr "size" "64"))
  "slot0+slot1,nonpipeline,nonpipeline*20")

;; mulli, 6 cycles
(define_insn_reservation "cell-imul23" 6
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mul")
       (eq_attr "size" "8,16"))
  "slot1,nonpipeline,nonpipeline*4")

;; mullw, 9
(define_insn_reservation "cell-imul" 9
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mul")
       (eq_attr "dot" "no")
       (eq_attr "size" "32"))
  "slot1,nonpipeline,nonpipeline*7")

;; divide
(define_insn_reservation "cell-idiv" 32
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "div")
       (eq_attr "size" "32"))
  "slot1,nonpipeline,nonpipeline*30")

(define_insn_reservation "cell-ldiv" 64
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "div")
       (eq_attr "size" "64"))
  "slot1,nonpipeline,nonpipeline*62")

;; mflr and mfctr are pipelined
(define_insn_reservation "cell-mfjmpr" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mfjmpr"))
  "slot01+bru_cell")

;; mtlr and mtctr,
;; mtspr fully pipelined
(define_insn_reservation "cell-mtjmpr" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mtjmpr"))
  "bru_cell+slot01")

;; Branches
;; b, ba, bl, bla, unconditional branch always predicts correctly n/a latency
;; bcctr, bcctrl, latency 2, actually adjusted by be to 4
;; Both branch reservations only accept slot1.
(define_insn_reservation "cell-branch" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "branch"))
  "bru_cell+slot1")

(define_insn_reservation "cell-branchreg" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "jmpreg"))
  "bru_cell+slot1")

;; cr hazard
;; page 90, special cases for CR hazard, only one instr can access cr per cycle
;; if an insn reads CR following a stwcx, the pipeline stalls till the stwcx
;; finishes
(define_insn_reservation "cell-crlogical" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "cr_logical"))
  "bru_cell+slot01")

;; mfcrf and mfcr is about 34 cycles and nonpipelined
(define_insn_reservation "cell-mfcr" 34
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mfcrf,mfcr"))
  "slot1,nonpipeline,nonpipeline*32")

;; mtcrf (1 field)
(define_insn_reservation "cell-mtcrf" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "mtcr"))
  "fxu_cell+slot01")

;; Basic FP latency is 10 cycles, throughput is 1/cycle
(define_insn_reservation "cell-fp" 10
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fp,fpsimple,dmul"))
  "slot01,vsu1_cell,vsu1_cell*8")

(define_insn_reservation "cell-fpcompare" 1
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "fpcompare"))
  "vsu1_cell+slot01")

;; sdiv throughput 1/74, not pipelined but only in the FPU
(define_insn_reservation "cell-sdiv" 74
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "sdiv,ddiv"))
  "slot1,nonpipeline,nonpipeline*72")

;; fsqrt throughput 1/84, not pipelined but only in the FPU
(define_insn_reservation "cell-sqrt" 84
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "ssqrt,dsqrt"))
  "slot1,nonpipeline,nonpipeline*82")

;; VMX
(define_insn_reservation "cell-vecsimple" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecsimple,veclogical,vecmove"))
  "slot01,vsu1_cell,vsu1_cell*2")

;; mult, div, madd
(define_insn_reservation "cell-veccomplex" 10
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "veccomplex"))
  "slot01,vsu1_cell,vsu1_cell*8")

;; TODO: add support for recording instructions
(define_insn_reservation "cell-veccmp" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "veccmp,veccmpfx"))
  "slot01,vsu1_cell,vsu1_cell*2")

(define_insn_reservation "cell-vecfloat" 12
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecfloat"))
  "slot01,vsu1_cell,vsu1_cell*10")

;; Permutes are the only VMX arithmetic class here reserved on vsu2_cell.
(define_insn_reservation "cell-vecperm" 4
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "vecperm"))
  "slot01,vsu2_cell,vsu2_cell*2")

;; New for 4.2, syncs
;; The four reservations below share the same latency and lsu_cell usage.

(define_insn_reservation "cell-sync" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "sync"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-isync" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "isync"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-load_l" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "load_l"))
  "slot01,lsu_cell,lsu_cell*9")

(define_insn_reservation "cell-store_c" 11
  (and (eq_attr "cpu" "cell")
       (eq_attr "type" "store_c"))
  "slot01,lsu_cell,lsu_cell*9")

;; RAW register dependency

;;   addi r3, r3, 1
;;   lw r4,offset(r3)
;; there is a 5 cycle delay for r3 bypassing
;; there is a 5 cycle delay for a dependent load after a load
(define_bypass 5 "cell-integer" "cell-load")
(define_bypass 5 "cell-integer" "cell-load-ext")
(define_bypass 5 "cell-load,cell-load-ext" "cell-load,cell-load-ext")

;; there is a 6 cycle delay after a fp compare until you can use the cr.
;; CR consumers fed by a floating point compare use a latency of 6.
(define_bypass 6 "cell-fpcompare"
  "cell-branch,cell-branchreg,cell-mfcr,cell-crlogical")

;; VXU float RAW
(define_bypass 11 "cell-vecfloat" "cell-vecfloat")

;; VXU and FPU
(define_bypass 6 "cell-veccomplex" "cell-vecsimple")
;;(define_bypass 6 "cell-veccompare" "cell-branch,cell-branchreg")
(define_bypass 3 "cell-vecfloat" "cell-veccomplex")
;; this is not correct,
;; this is a stall in general and not dependent on result
(define_bypass 13 "cell-vecstore" "cell-fpstore")
;; this is not correct, this can never be true, not dependent on result
(define_bypass 7 "cell-fp" "cell-fpload")
;; vsu1 should avoid writing to the same target register as a vsu2 insn
;; within 12 cycles.

;; WAW hazard

;; the target of VSU estimate should not be reused within 10 dispatch groups
;; the target of VSU float should not be reused within 8 dispatch groups
;; the target of VSU complex should not be reused within 5 dispatch groups
;; FP LOAD should not reuse an FPU Arithmetic target within 6 dispatch groups

;; mtctr-bcctr/bcctrl, branch target ctr register shadow update at
;; ex4 stage (10 cycles)
(define_bypass 10 "cell-mtjmpr" "cell-branchreg")

;; Things that are not simulated:
;; update instructions: the update of the address gpr is not simulated
;; vrefp, vrsqrtefp have latency (14), currently simulated as 12 cycle float
;; insns
