;; arm1136jfs.md revision 285830
;; ARM 1136J[F]-S Pipeline Description
;; Copyright (C) 2003 Free Software Foundation, Inc.
;; Written by CodeSourcery, LLC.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to the Free
;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
;; 02110-1301, USA.

;; These descriptions are based on the information contained in the
;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
;; Limited.
;;

;; This automaton provides a pipeline description for the ARM
;; 1136J-S and 1136JF-S cores.
;;
;; The model given here assumes that the condition for all conditional
;; instructions is "true", i.e., that all of the instructions are
;; actually executed.

;; Every cpu unit in this file is declared as belonging to this automaton.
(define_automaton "arm1136jfs")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Pipelines
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are three distinct pipelines (page 1-26 and following):
;;
;; - A 4-stage decode pipeline, shared by all three.  It has fetch (1),
;;   fetch (2), decode, and issue stages.  Since this is always involved,
;;   we do not model it in the scheduler.
;;
;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
;;   and saturation stages.  The fourth stage is writeback; see below.
;;
;; - A 4-stage multiply-accumulate pipeline.
;;   It has three stages, called
;;   MAC1 through MAC3, and a fourth writeback stage.
;;
;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
;;   which operate in lockstep.  Results from either pipeline will be
;;   moved into the writeback stage.  Because the two pipelines operate
;;   in lockstep, we schedule them as a single "execute" pipeline.
;;
;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
;;   data cache (2), and writeback stages.  (Note that this pipeline,
;;   including the writeback stage, is independent from the ALU & MAC pipes.)

(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")     ; ALU and MAC
; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; ALU instructions pass through eight pipeline stages in total: the four
;; shared decode stages (not modelled) followed by the four ALU pipeline
;; stages reserved below.  The results are available after the ALU stage
;; has finished.
;;
;; If the destination register is the PC, the pipelines are stalled
;; for several cycles.  That case is not modelled here.

;; ALU operations with no shifted operand
(define_insn_reservation "11_alu_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu"))
 "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-constant operand
(define_insn_reservation "11_alu_shift_op" 2
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift"))
 "e_1,e_2,e_3,e_wb")

;; ALU operations with a shift-by-register operand
;; These really stall in the decoder, in order to read
;; the shift value in a second cycle.  Pretend we take two cycles in
;; the shift stage.
;; Reservation for insns of type "alu_shift_reg": unit e_1 is held for
;; two consecutive cycles before the insn moves on to e_2, e_3 and e_wb.
(define_insn_reservation "11_alu_shift_reg_op" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "alu_shift_reg"))
 "e_1*2,e_2,e_3,e_wb")

;; alu_ops can start sooner, if there is no shifter dependency
;; (A bypass with a guard function only applies when that function,
;; which is defined outside this file, returns nonzero.)
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_op")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_op")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

;; ALU results feeding a multiply, when the guard reports no early
;; multiply dependency.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Multiplication Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Multiplication instructions loop in the first two execute stages until
;; the instruction has been passed through the multiplier array enough
;; times.

;; Multiply and multiply-accumulate results are available after four stages.
(define_insn_reservation "11_mult1" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "mul,mla"))
 "e_1*2,e_2,e_3,e_wb")

;; The *S variants set the condition flags, which requires three more cycles.
;; Reservation for insns whose "insn" attribute is muls or mlas.
;; NOTE(review): the latency and unit reservation here are identical to
;; those of "11_mult1" (mul, mla), so no additional cost for setting the
;; condition flags is modelled -- confirm against the TRM.
(define_insn_reservation "11_mult2" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "muls,mlas"))
 "e_1*2,e_2,e_3,e_wb")

;; Consumers that satisfy the guard (or plain ALU ops, unconditionally)
;; see a latency of 3 instead of 4.
(define_bypass 3 "11_mult1,11_mult2"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_op")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult1,11_mult2"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; Signed and unsigned multiply long results are available across two cycles;
;; the less significant word is available one cycle before the more significant
;; word.  Here we conservatively wait until both are available, which is
;; after three iterations and the memory cycle.  The same is also true of
;; the two multiply-accumulate instructions.
(define_insn_reservation "11_mult3" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smull,umull,smlal,umlal"))
 "e_1*3,e_2,e_3,e_wb*2")

;; The *S variants set the condition flags, which requires three more cycles.
;; Reservation for insns whose "insn" attribute is smulls, umulls, smlals
;; or umlals.
;; NOTE(review): the latency and unit reservation here are identical to
;; those of "11_mult3" (smull, umull, smlal, umlal), so no additional cost
;; for setting the condition flags is modelled -- confirm against the TRM.
(define_insn_reservation "11_mult4" 5
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulls,umulls,smlals,umlals"))
 "e_1*3,e_2,e_3,e_wb*2")

;; Consumers that satisfy the guard (or plain ALU ops, unconditionally)
;; see a latency of 4 instead of 5.
(define_bypass 4 "11_mult3,11_mult4"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_op")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 4 "11_mult3,11_mult4"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
;; of high and low halves of the argument registers.  They take a single
;; pass through the pipeline and make the result available after three
;; cycles.
(define_insn_reservation "11_mult5" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
 "e_1,e_2,e_3,e_wb")

;; Consumers that satisfy the guard (or plain ALU ops, unconditionally)
;; see a latency of 2 instead of 3.
(define_bypass 2 "11_mult5"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 2 "11_mult5"
               "11_alu_op")
(define_bypass 2 "11_mult5"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_mult5"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "11_mult5"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; The same idea, then the 32-bit result is added to a 64-bit quantity.
(define_insn_reservation "11_mult6" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smlalxy"))
 "e_1*2,e_2,e_3,e_wb*2")

;; Signed 32x32 multiply, then the most significant 32 bits are extracted
;; and are available after the memory stage.
;; Reservation for insns whose "insn" attribute is smmul or smmulr;
;; unit e_1 is held for two cycles.
(define_insn_reservation "11_mult7" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "insn" "smmul,smmulr"))
 "e_1*2,e_2,e_3,e_wb")

;; Consumers that satisfy the guard (or plain ALU ops, unconditionally)
;; see a latency of 3 instead of 4.
(define_bypass 3 "11_mult6,11_mult7"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_op")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "11_mult6,11_mult7"
               "11_store1"
               "arm_no_early_store_addr_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branch Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; These vary greatly depending on their arguments and the results of
;; branch prediction.  Cycle count ranges from zero (unconditional branch,
;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
;; assume an optimal case for now, because the cost of a cache miss
;; overwhelms the cost of everything else anyhow.

;; "nothing" means no cpu unit is reserved for the insn.
(define_insn_reservation "11_branches" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "branch"))
 "nothing")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
(define_insn_reservation "11_call" 32
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "call"))
 "nothing")

;; Branches are predicted.  A correctly predicted branch will be no
;; cost, but we're conservative here, and use the timings a
;; late-register would give us.
;; Latency seen by a branch that consumes the result of an ALU op or a load.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_branches")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_branches")
(define_bypass 2 "11_load1,11_load2"
               "11_branches")
(define_bypass 3 "11_load34"
               "11_branches")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/Store Instructions
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The models for load/store instructions do not accurately describe
;; the difference between operations with a base register writeback.
;; These models assume that all memory references hit in dcache.  Also,
;; if the PC is one of the registers involved, there are additional stalls
;; not modelled here.  Addressing modes are also not modelled.

;; Every load/store reservation below occupies e_1 together with l_a in
;; its first cycle ("l_a+e_1").
(define_insn_reservation "11_load1" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load byte results are not available until the writeback stage, where
;; the correct byte is extracted.

(define_insn_reservation "11_loadb" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load_byte"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store1" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store1"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store double words into adjacent registers.  The timing and
;; latencies are different depending on whether the address is 64-bit
;; aligned.  This model assumes that it is.
(define_insn_reservation "11_load2" 3
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

(define_insn_reservation "11_store2" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store2"))
 "l_a+e_1,l_dc1,l_dc2,l_wb")

;; Load/store multiple registers.  Two registers are stored per cycle.
;; Actual timing depends on how many registers are affected, so we
;; optimistically schedule a low latency.
;; (Both reservations below hold l_dc1 for two cycles.)
(define_insn_reservation "11_load34" 4
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "load3,load4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")

(define_insn_reservation "11_store34" 0
 (and (eq_attr "tune" "arm1136js,arm1136jfs")
      (eq_attr "type" "store3,store4"))
 "l_a+e_1,l_dc1*2,l_dc2,l_wb")

;; A store can start immediately after an alu op, if that alu op does
;; not provide part of the address to access.
(define_bypass 1 "11_alu_op,11_alu_shift_op"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 2 "11_alu_shift_reg_op"
               "11_store1"
               "arm_no_early_store_addr_dep")

;; An alu op can start sooner after a load, if that alu op does not
;; have an early register dependency on the load
(define_bypass 2 "11_load1"
               "11_alu_op")
(define_bypass 2 "11_load1"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 2 "11_load1"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

(define_bypass 3 "11_loadb"
               "11_alu_op")
(define_bypass 3 "11_loadb"
               "11_alu_shift_op"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "11_loadb"
               "11_alu_shift_reg_op"
               "arm_no_early_alu_shift_dep")

;; A mul op can start sooner after a load, if that mul op does not
;; have an early multiply dependency
(define_bypass 2 "11_load1"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_load34"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")
(define_bypass 3 "11_loadb"
               "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
               "arm_no_early_mul_dep")

;; A store can start sooner after a load, if that load does not
;; produce part of the address to access
(define_bypass 2 "11_load1"
               "11_store1"
               "arm_no_early_store_addr_dep")
(define_bypass 3 "11_loadb"
               "11_store1"
               "arm_no_early_store_addr_dep")