arm1020e.md revision 169689
;; ARM 1020E & ARM 1022E Pipeline Description
;; Copyright (C) 2005 Free Software Foundation, Inc.
;;     Contributed by Richard Earnshaw (richard.earnshaw@arm.com)
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.

;; These descriptions are based on the information contained in the
;; ARM1020E Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1020E core.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

;; Scheduling automaton shared by every unit and reservation in this
;; description.
(define_automaton "arm1020e")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are two pipelines:
;;
;; - An Arithmetic Logic Unit (ALU) pipeline.
;;
;;   The ALU pipeline has fetch, issue, decode, execute, memory, and
;;   write stages.  Only the execute, memory and write stages need to
;;   be modeled.
;;
;; - A Load-Store Unit (LSU) pipeline.
;;
;;   The LSU pipeline has decode, execute, memory, and write stages.
;;   Only the execute, memory and write stages are modeled.

;; ALU pipeline: execute, memory and write stages.
(define_cpu_unit "1020a_e,1020a_m,1020a_w" "arm1020e")

;; LSU pipeline: execute, memory and write stages.
(define_cpu_unit "1020l_e,1020l_m,1020l_w" "arm1020e")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require three cycles to execute, and use the ALU
;; pipeline in each of the three stages.  The results are available
;; after the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.

;; ALU operations with no shifted operand.
(define_insn_reservation "1020alu_op" 1
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu"))
  "1020a_e,1020a_m,1020a_w")

;; ALU operations with a shift-by-constant operand.
(define_insn_reservation "1020alu_shift_op" 1
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu_shift"))
  "1020a_e,1020a_m,1020a_w")

;; ALU operations with a shift-by-register operand.
;; In hardware these stall in the decoder so that the shift value can
;; be read in a second cycle.  The model approximates that by holding
;; the execute stage for two cycles.
(define_insn_reservation "1020alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu_shift_reg"))
  "1020a_e*2,1020a_m,1020a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.

;; The result of the "smul" and "smulw" instructions is not available
;; until after the memory stage.
(define_insn_reservation "1020mult1" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smulxy,smulwy"))
  "1020a_e,1020a_m,1020a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;; Two execute iterations, latency 2: "smlaxy" and "smlawx" only.
;; "smlalxy" is intentionally NOT listed here.  It belongs to
;; "1020mult3" below, whose result is not available until after the
;; memory stage (latency 3).  It used to appear in both lists, which
;; gave it this reservation's shorter latency.
;; NOTE(review): this relies on the earlier of two matching
;; reservations taking precedence -- confirm against genattrtab.
(define_insn_reservation "1020mult2" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smlaxy,smlawx"))
  "1020a_e*2,1020a_m,1020a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "1020mult3" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smlalxy,mul,mla"))
  "1020a_e*2,1020a_m,1020a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "1020mult4" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "muls,mlas"))
  "1020a_e*4,1020a_m,1020a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
;; Long multiplies that do not set the flags: the execute stage is
;; held for three cycles and the latency is four.
(define_insn_reservation "1020mult5" 4
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "umull,umlal,smull,smlal"))
  "1020a_e*3,1020a_m,1020a_w")

;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
;; the execute stage for five iterations in order to set the flags.
;; The value result is available after four iterations.
(define_insn_reservation "1020mult6" 4
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "umulls,umlals,smulls,smlals"))
  "1020a_e*5,1020a_m,1020a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!").  These models assume that all memory references
;; hit in dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores which use a scaled register offset or scaled
;; register pre-indexed addressing mode take three cycles EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4byte loads there is a bypass from the load stage.

;; Single-register loads (byte and word): the first cycle occupies the
;; execute stages of both pipelines, after which only the LSU memory
;; and write stages are used.
(define_insn_reservation "1020load1_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load_byte,load1"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; Single-register stores use the same units as the loads above, with
;; a latency of zero.
(define_insn_reservation "1020store1_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store1"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; A load's result can be stored by an immediately following store.
(define_bypass 1 "1020load1_op" "1020store1_op" "arm_no_early_store_addr_dep")

;; On a LDM/STM operation, the LSU pipeline iterates until all of the
;; registers have been processed.
;;
;; The time it takes to load the data depends on whether or not the
;; base address is 64-bit aligned; if it is not, an additional cycle
;; is required.  This model assumes that the address is always 64-bit
;; aligned.  Because the processor can load two registers per cycle,
;; that assumption means that we use the same instruction reservations
;; for loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is decoupled after the first cycle unless there is
;; a register dependency; the dependency is cleared as soon as the LDM/STM
;; has dealt with the corresponding register.  So for example,
;;  stmia sp, {r0-r3}
;;  add r0, r0, #4
;; will have one fewer stalls than
;;  stmia sp, {r0-r3}
;;  add r3, r3, #4
;;
;; As with ALU operations, if one of the destination registers is the
;; PC, there are additional stalls; that is not modeled.

;; Two-register transfers make a single pass through the LSU stages.
(define_insn_reservation "1020load2_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load2"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

(define_insn_reservation "1020store2_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store2"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; Three- and four-register transfers occupy the LSU execute and
;; memory stages for one additional cycle each.
(define_insn_reservation "1020load34_op" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load3,load4"))
  "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")

(define_insn_reservation "1020store34_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store3,store4"))
  "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

(define_insn_reservation "1020branch_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "branch"))
  "1020a_e")

;; The latency for a call is not predictable.
;; Therefore, we use 32 as roughly equivalent to positive infinity.

(define_insn_reservation "1020call_op" 32
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "call"))
  "1020a_e*32")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Unit reserved by the VFP arithmetic reservations.
(define_cpu_unit "v10_fmac" "arm1020e")

;; Unit reserved by the fdivs/fdivd reservations.
(define_cpu_unit "v10_ds" "arm1020e")

;; Unit reserved by the fmstat (f_flag) reservation.
(define_cpu_unit "v10_fmstat" "arm1020e")

;; VFP load/store stages.
(define_cpu_unit "v10_ls1,v10_ls2,v10_ls3" "arm1020e")

;; fmstat is a serializing instruction.  It will stall the core until
;; the mac and ds units have completed.
(exclusion_set "v10_fmac,v10_ds" "v10_fmstat")

;; "yes" when tuning for arm1020e/arm1022e with the VFP floating point
;; unit selected, "no" otherwise.  Every VFP reservation is guarded by
;; this attribute.
(define_attr "vfp10" "yes,no"
  (const (if_then_else (and (eq_attr "tune" "arm1020e,arm1022e")
                            (eq_attr "fpu" "vfp"))
                       (const_string "yes")
                       (const_string "no"))))

;; The VFP "type" attributes differ from those used in the FPA model.
;; ffarith             Fast floating point insns, e.g. abs, neg, cpy, cmp.
;; farith              Most arithmetic insns.
;; fmul                Double precision multiply.
;; fdivs               Single precision sqrt or division.
;; fdivd               Double precision sqrt or division.
;; f_flag              fmstat operation.
;; f_loads, f_loadd    Floating point load from memory.
;; f_stores, f_stored  Floating point store to memory.
;; f_2_r               Transfer vfp to arm reg.
;; r_2_f               Transfer arm to vfp reg.
;; f_cvt               Matched by the v10_cvt reservation (presumably
;;                     conversions -- confirm against the attribute
;;                     definition).

;; Note, no instruction can issue to the VFP if the core is stalled in the
;; first execute state.
;; We model this by using 1020a_e in the first cycle.
(define_insn_reservation "v10_ffarith" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "ffarith"))
  "1020a_e+v10_fmac")

(define_insn_reservation "v10_farith" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "farith"))
  "1020a_e+v10_fmac")

(define_insn_reservation "v10_cvt" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_cvt"))
  "1020a_e+v10_fmac")

;; Multiplies reserve the fmac unit for two cycles.
(define_insn_reservation "v10_fmul" 6
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fmul"))
  "1020a_e+v10_fmac*2")

;; Single precision divide/sqrt: the ds unit is reserved for 14 cycles.
(define_insn_reservation "v10_fdivs" 18
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fdivs"))
  "1020a_e+v10_ds*14")

;; Double precision divide/sqrt: the ds unit is reserved for 28 cycles
;; and the fmac unit is also reserved.
(define_insn_reservation "v10_fdivd" 32
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fdivd"))
  "1020a_e+v10_fmac+v10_ds*28")

(define_insn_reservation "v10_floads" 4
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_loads"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

;; We model a load of a double as needing all the vfp ls* stages in
;; cycle 1.  This gives the correct mix between single and double loads,
;; where a flds followed by an fldd will stall for one cycle, but two
;; back-to-back fldd insns stall for two cycles.
(define_insn_reservation "v10_floadd" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_loadd"))
  "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")

;; Moves to/from arm regs also use the load/store pipeline.

;; ARM register to VFP register transfer.
(define_insn_reservation "v10_c2v" 4
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "r_2_f"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

(define_insn_reservation "v10_fstores" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_stores"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

;; Double stores reserve every VFP ls* stage in the first cycle.
(define_insn_reservation "v10_fstored" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_stored"))
  "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")

;; VFP register to ARM register transfer.
(define_insn_reservation "v10_v2c" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_2_r"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; fmstat: reserves the fmstat unit in the first cycle, then the core
;; load/store stages.
(define_insn_reservation "v10_to_cpsr" 2
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_flag"))
  "1020a_e+v10_fmstat,1020a_e+1020l_e,1020l_m,1020l_w")

;; VFP bypasses

;; There are bypasses for most operations other than store.

(define_bypass 3
  "v10_c2v,v10_floads"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd,v10_cvt")

(define_bypass 4
  "v10_floadd"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

;; Arithmetic to other arithmetic saves a cycle due to forwarding.
(define_bypass 4
  "v10_ffarith,v10_farith"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

(define_bypass 5
  "v10_fmul"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

(define_bypass 17
  "v10_fdivs"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

;; A double precision divide/sqrt result reaches a following VFP
;; arithmetic insn after 31 cycles.
(define_bypass 31
  "v10_fdivd"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

;; VFP anti-dependencies.

;; There is one anti-dependence in the following case (not yet modelled):
;; - After a store: one extra cycle for both fsts and fstd
;; Note, back-to-back fstd instructions will overload the load/store datapath
;; causing a two-cycle stall.