arm1020e.md revision 256281
;; ARM 1020E & ARM 1022E Pipeline Description
;; Copyright (C) 2005 Free Software Foundation, Inc.
;; Contributed by Richard Earnshaw (richard.earnshaw@arm.com)
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING. If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA. */

;; These descriptions are based on the information contained in the
;; ARM1020E Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1020E core.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

;; The single automaton to which every cpu unit in this description
;; is attached.
(define_automaton "arm1020e")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are two pipelines:
;;
;; - An Arithmetic Logic Unit (ALU) pipeline.
;;
;;   The ALU pipeline has fetch, issue, decode, execute, memory, and
;;   write stages.  We only need to model the execute, memory and write
;;   stages.
;;
;; - A Load-Store Unit (LSU) pipeline.
;;
;;   The LSU pipeline has decode, execute, memory, and write stages.
;;   We only model the execute, memory and write stages.

;; Units for the modelled execute (_e), memory (_m) and write (_w)
;; stages: "1020a_*" are used by the ALU reservations and "1020l_*"
;; by the load/store reservations.
(define_cpu_unit "1020a_e,1020a_m,1020a_w" "arm1020e")
(define_cpu_unit "1020l_e,1020l_m,1020l_w" "arm1020e")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions require three cycles to execute, and use the ALU
;; pipeline in each of the three stages.  The results are available
;; after the execute stage has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modeled here.

;; ALU operations with no shifted operand.
(define_insn_reservation "1020alu_op" 1
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu"))
  "1020a_e,1020a_m,1020a_w")

;; ALU operations with a shift-by-constant operand.
(define_insn_reservation "1020alu_shift_op" 1
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu_shift"))
  "1020a_e,1020a_m,1020a_w")

;; ALU operations with a shift-by-register operand.
;; These really stall in the decoder, in order to read the shift value
;; in a second cycle.  Pretend we take two cycles in the execute stage.
(define_insn_reservation "1020alu_shift_reg_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "alu_shift_reg"))
  "1020a_e*2,1020a_m,1020a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the execute stage until the
;; instruction has been passed through the multiplier array enough
;; times.

;; The result of the "smul" and "smulw" instructions is not available
;; until after the memory stage.
;; "smulxy" and "smulwy": a single pass through the execute stage,
;; with a result latency of two cycles.
(define_insn_reservation "1020mult1" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smulxy,smulwy"))
  "1020a_e,1020a_m,1020a_w")

;; The "smlaxy" and "smlawx" instructions require two iterations through
;; the execute stage; the result is available immediately following
;; the execute stage.
;;
;; "smlalxy" must not be listed here: its result is not available until
;; after the memory stage, so it belongs to "1020mult3" only.  Listing
;; it in both reservations makes the selection ambiguous and can give
;; it the shorter latency of this reservation.
(define_insn_reservation "1020mult2" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smlaxy,smlawx"))
  "1020a_e*2,1020a_m,1020a_w")

;; The "smlalxy", "mul", and "mla" instructions require two iterations
;; through the execute stage; the result is not available until after
;; the memory stage.
(define_insn_reservation "1020mult3" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "smlalxy,mul,mla"))
  "1020a_e*2,1020a_m,1020a_w")

;; The "muls" and "mlas" instructions loop in the execute stage for
;; four iterations in order to set the flags.  The value result is
;; available after three iterations.
(define_insn_reservation "1020mult4" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "muls,mlas"))
  "1020a_e*4,1020a_m,1020a_w")

;; Long multiply instructions that produce two registers of
;; output (such as umull) make their results available in two cycles;
;; the least significant word is available before the most significant
;; word.  That fact is not modeled; instead, the instructions are
;; described as if the entire result was available at the end of the
;; cycle in which both words are available.

;; The "umull", "umlal", "smull", and "smlal" instructions all take
;; three iterations through the execute cycle, and make their results
;; available after the memory cycle.
;; Long multiplies that do not set the flags: three execute-stage
;; iterations, result latency of four cycles.
(define_insn_reservation "1020mult5" 4
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "umull,umlal,smull,smlal"))
  "1020a_e*3,1020a_m,1020a_w")

;; The "umulls", "umlals", "smulls", and "smlals" instructions loop in
;; the execute stage for five iterations in order to set the flags.
;; The value result is available after four iterations.
(define_insn_reservation "1020mult6" 4
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "insn" "umulls,umlals,smulls,smlals"))
  "1020a_e*5,1020a_m,1020a_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback
;; (such as "ldm!").  These models assume that all memory references
;; hit in dcache.

;; LSU instructions require six cycles to execute.  They use the ALU
;; pipeline in all but the 5th cycle, and the LSU pipeline in cycles
;; three through six.
;; Loads and stores which use a scaled register offset or scaled
;; register pre-indexed addressing mode take three cycles EXCEPT for
;; those that are base + offset with LSL of 0 or 2, or base - offset
;; with LSL of zero.  The remainder take 1 cycle to execute.
;; For 4byte loads there is a bypass from the load stage

;; Byte and single-word loads: issue through the ALU and LSU execute
;; stages together, then continue down the LSU pipeline.
(define_insn_reservation "1020load1_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load_byte,load1"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; Single-register stores occupy the same units as the loads; they
;; are given a latency of zero.
(define_insn_reservation "1020store1_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store1"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; A load's result can be stored by an immediately following store
(define_bypass 1
  "1020load1_op"
  "1020store1_op"
  "arm_no_early_store_addr_dep")

;; On a LDM/STM operation, the LSU pipeline iterates until all of the
;; registers have been processed.
;;
;; The time it takes to load the data depends on whether or not the
;; base address is 64-bit aligned; if it is not, an additional cycle
;; is required.  This model assumes that the address is always 64-bit
;; aligned.  Because the processor can load two registers per cycle,
;; that assumption means that we use the same instruction reservations
;; for loading 2k and 2k - 1 registers.
;;
;; The ALU pipeline is decoupled after the first cycle unless there is
;; a register dependency; the dependency is cleared as soon as the LDM/STM
;; has dealt with the corresponding register.  So for example,
;;     stmia sp, {r0-r3}
;;     add r0, r0, #4
;; will have one fewer stalls than
;;     stmia sp, {r0-r3}
;;     add r3, r3, #4
;;
;; As with ALU operations, if one of the destination registers is the
;; PC, there are additional stalls; that is not modeled.

;; Two-register loads and stores use the same reservation as the
;; single-register forms.
(define_insn_reservation "1020load2_op" 2
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load2"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

(define_insn_reservation "1020store2_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store2"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; Three- and four-register transfers hold the LSU execute and memory
;; stages for one additional cycle each.
(define_insn_reservation "1020load34_op" 3
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "load3,load4"))
  "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")

(define_insn_reservation "1020store34_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "store3,store4"))
  "1020a_e+1020l_e,1020l_e+1020l_m,1020l_m,1020l_w")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch and Call Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Branch instructions are difficult to model accurately.  The ARM
;; core can predict most branches.  If the branch is predicted
;; correctly, and predicted early enough, the branch can be completely
;; eliminated from the instruction stream.  Some branches can
;; therefore appear to require zero cycles to execute.  We assume that
;; all branches are predicted correctly, and that the latency is
;; therefore the minimum value.

(define_insn_reservation "1020branch_op" 0
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "branch"))
  "1020a_e")

;; The latency for a call is not predictable.  Therefore, we use 32 as
;; roughly equivalent to positive infinity.

;; Calls: latency 32, holding the ALU execute stage for 32 cycles.
(define_insn_reservation "1020call_op" 32
  (and (eq_attr "tune" "arm1020e,arm1022e")
       (eq_attr "type" "call"))
  "1020a_e*32")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Units used by the VFP reservations in this description.
(define_cpu_unit "v10_fmac" "arm1020e")

(define_cpu_unit "v10_ds" "arm1020e")

(define_cpu_unit "v10_fmstat" "arm1020e")

(define_cpu_unit "v10_ls1,v10_ls2,v10_ls3" "arm1020e")

;; fmstat is a serializing instruction.  It will stall the core until
;; the mac and ds units have completed.
(exclusion_set "v10_fmac,v10_ds" "v10_fmstat")

;; "vfp10" is "yes" exactly when tuning for arm1020e/arm1022e and the
;; "fpu" attribute is "vfp"; every VFP reservation tests it.
(define_attr "vfp10" "yes,no"
  (const (if_then_else (and (eq_attr "tune" "arm1020e,arm1022e")
                            (eq_attr "fpu" "vfp"))
                       (const_string "yes")
                       (const_string "no"))))

;; The VFP "type" attributes differ from those used in the FPA model.
;; ffarith   Fast floating point insns, e.g. abs, neg, cpy, cmp.
;; farith    Most arithmetic insns.
;; fmul      Double precision multiply.
;; fdivs     Single precision sqrt or division.
;; fdivd     Double precision sqrt or division.
;; f_flag    fmstat operation
;; f_load    Floating point load from memory.
;; f_store   Floating point store to memory.
;; f_2_r     Transfer vfp to arm reg.
;; r_2_f     Transfer arm to vfp reg.
;;
;; NOTE(review): the reservations in this file test "f_loads",
;; "f_loadd", "f_stores", "f_stored" and "f_cvt" rather than the
;; "f_load"/"f_store" names listed here -- confirm against the "type"
;; attribute definition and update this legend.

;; Note, no instruction can issue to the VFP if the core is stalled in the
;; first execute state.  We model this by using 1020a_e in the first cycle.
;; Every VFP reservation claims 1020a_e in its first cycle.

(define_insn_reservation "v10_ffarith" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "ffarith"))
  "1020a_e+v10_fmac")

(define_insn_reservation "v10_farith" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "farith"))
  "1020a_e+v10_fmac")

(define_insn_reservation "v10_cvt" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_cvt"))
  "1020a_e+v10_fmac")

(define_insn_reservation "v10_fmul" 6
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fmul"))
  "1020a_e+v10_fmac*2")

(define_insn_reservation "v10_fdivs" 18
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fdivs"))
  "1020a_e+v10_ds*14")

(define_insn_reservation "v10_fdivd" 32
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "fdivd"))
  "1020a_e+v10_fmac+v10_ds*28")

(define_insn_reservation "v10_floads" 4
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_loads"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

;; We model a load of a double as needing all the vfp ls* stages in cycle 1.
;; This gives the correct mix between single- and double loads where a flds
;; followed by an fldd will stall for one cycle, but two back-to-back fldd
;; insns stall for two cycles.
(define_insn_reservation "v10_floadd" 5
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_loadd"))
  "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")

;; Moves to/from arm regs also use the load/store pipeline.

;; ARM -> VFP register transfer; same reservation as a single load.
(define_insn_reservation "v10_c2v" 4
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "r_2_f"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

(define_insn_reservation "v10_fstores" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_stores"))
  "1020a_e+1020l_e+v10_ls1,v10_ls2")

(define_insn_reservation "v10_fstored" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_stored"))
  "1020a_e+1020l_e+v10_ls1+v10_ls2+v10_ls3,v10_ls2+v10_ls3,v10_ls3")

;; VFP -> ARM register transfer; continues down the core LSU pipeline.
(define_insn_reservation "v10_v2c" 1
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_2_r"))
  "1020a_e+1020l_e,1020l_m,1020l_w")

;; The "f_flag" (fmstat) operation: claims v10_fmstat in its first
;; cycle, then passes through the core LSU pipeline.
(define_insn_reservation "v10_to_cpsr" 2
  (and (eq_attr "vfp10" "yes")
       (eq_attr "type" "f_flag"))
  "1020a_e+v10_fmstat,1020a_e+1020l_e,1020l_m,1020l_w")

;; VFP bypasses

;; There are bypasses for most operations other than store
;;
;; NOTE(review): "v10_cvt" is a consumer only in the first bypass
;; below and is never a producer -- confirm whether that is intended.

(define_bypass 3
  "v10_c2v,v10_floads"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd,v10_cvt")

(define_bypass 4
  "v10_floadd"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

;; Arithmetic to other arithmetic saves a cycle due to forwarding
(define_bypass 4
  "v10_ffarith,v10_farith"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

(define_bypass 5
  "v10_fmul"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

(define_bypass 17
  "v10_fdivs"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

(define_bypass 31
  "v10_fdivd"
  "v10_ffarith,v10_farith,v10_fmul,v10_fdivs,v10_fdivd")

;; VFP anti-dependencies.

;; There is one anti-dependence in the following case (not yet modelled):
;; - After a store: one extra cycle for both fsts and fstd
;; Note, back-to-back fstd instructions will overload the load/store datapath
;; causing a two-cycle stall.