;; ARM Cortex-R4 scheduling description.
;; Copyright (C) 2007-2015 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; The single automaton used by this description; every issue unit
;; declared in this file is attached to it.
(define_automaton "cortex_r4")

;; We approximate the dual-issue constraints of this core using four
;; "issue units" and a reservation matrix as follows.  The numbers indicate
;; the instruction groups' preferences in order.  Multiple entries for
;; the same numbered preference indicate units that must be reserved
;; together.
;;
;; Issue unit:          A       B       C       ALU
;;
;; ALU w/o reg shift    1st     2nd             1st and 2nd
;; ALU w/ reg shift     1st     2nd     2nd     1st and 2nd
;; Moves                1st     2nd             2nd
;; Multiplication       1st                     1st
;; Division             1st                     1st
;; Load/store single    1st             1st
;; Other load/store     1st     1st
;; Branches                     1st

;; The four issue units (columns A, B, C and ALU of the table above).
(define_cpu_unit "cortex_r4_issue_a" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_b" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_c" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4")

;; Named reservations, one per row of the table.  In the reservation
;; strings "+" reserves units together in the same cycle, "|" separates
;; alternatives in order of preference, "," advances to the next cycle,
;; "*N" repeats a reservation for N consecutive cycles, and "nothing"
;; reserves no unit for a cycle.

;; ALU without register shift: A+ALU, otherwise B+ALU.
(define_reservation "cortex_r4_alu"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_alu)")
;; ALU with register shift: A+ALU, otherwise B+C+ALU.
(define_reservation "cortex_r4_alu_shift_reg"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_c+\
                      cortex_r4_issue_alu)")
;; Moves: A alone, otherwise B+ALU.
(define_reservation "cortex_r4_mov"
                    "cortex_r4_issue_a|(cortex_r4_issue_b+\
                                        cortex_r4_issue_alu)")
;; Multiplication: A+ALU for one cycle (cortex_r4_mul) or for two
;; consecutive cycles (cortex_r4_mul_2).
(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_mul_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)*2")
;; Division instructions execute out-of-order with respect to the
;; rest of the pipeline and only require reservations on their first and
;; final cycles.
;; cortex_r4_div_9 spans nine cycles (1 busy + 7 idle + 1 busy) and
;; cortex_r4_div_10 spans ten (1 busy + 8 idle + 1 busy).
(define_reservation "cortex_r4_div_9"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*7,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_div_10"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*8,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
;; Load/store single: A+C for one cycle.
(define_reservation "cortex_r4_load_store"
                    "cortex_r4_issue_a+cortex_r4_issue_c")
;; Other load/store: A+B for two consecutive cycles.
(define_reservation "cortex_r4_load_store_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_b)*2")
;; Branches: B only.
(define_reservation "cortex_r4_branch" "cortex_r4_issue_b")

;; We assume that all instructions are unconditional.

;; Data processing instructions.  Moves without shifts are kept separate
;; for the purposes of the dual-issue constraints above.
;; Every insn reservation in this group has a default latency of 2 and
;; applies only when the "tune_cortexr4" attribute is "yes".

;; Plain ALU, logical, add-with-carry, shift and mvn operations on
;; immediate or register operands, plus adr, bfm, clz, rbit and rev.
(define_insn_reservation "cortex_r4_alu" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                        alu_sreg,alus_sreg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        adr,bfm,clz,rbit,rev,\
                        shift_imm,shift_reg,mvn_imm,mvn_reg"))
  "cortex_r4_alu")

;; Unshifted moves (immediate or register).
(define_insn_reservation "cortex_r4_mov" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mov_imm,mov_reg"))
  "cortex_r4_mov")

;; Operations with an immediate-shifted operand, plus extend.  These
;; reserve the same units as plain ALU operations ("cortex_r4_alu") but
;; are a distinct insn reservation so that the bypasses below can apply a
;; guard to them.
(define_insn_reservation "cortex_r4_alu_shift" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,\
                        logic_shift_imm,logics_shift_imm,\
                        extend,mov_shift,mvn_shift"))
  "cortex_r4_alu")

;; Operations with a register-specified shift.  The mrs, multiple and
;; no_insn types are also assigned this reservation.
(define_insn_reservation "cortex_r4_alu_shift_reg" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
                        logic_shift_reg,logics_shift_reg,\
                        mov_shift_reg,mvn_shift_reg,\
                        mrs,multiple,no_insn"))
  "cortex_r4_alu_shift_reg")

;; An ALU instruction followed by an ALU instruction with no early dep.
;; These bypasses lower the latency from the default of 2 to 1.  For
;; shifted consumers the bypass is conditional on the named guard.
;; NOTE(review): the guard functions are defined outside this file;
;; presumably they reject consumers that need the value early as a shift
;; operand -- confirm against their definitions.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; In terms of availabilities, a consumer mov could theoretically be
;; issued together with a producer ALU instruction, without stalls.
;; In practice this cannot happen because mov;add (in that order) is not
;; eligible for dual issue and furthermore dual issue is not permitted
;; when a dependency is involved.  We therefore note it as latency one.
;; A mov followed by another of the same is also latency one.
;; Any data-processing producer feeding a mov consumer: latency 1,
;; with no guard.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_mov")

;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are
;; media data processing instructions nor sad instructions.

;; Multiplication instructions.
;; The numeric suffix of each reservation name is its default latency.
;; Latency-4 groups hold A+ALU for two cycles ("cortex_r4_mul_2");
;; latency-3 groups hold them for one cycle ("cortex_r4_mul").

(define_insn_reservation "cortex_r4_mul_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mul,smmul"))
  "cortex_r4_mul_2")

(define_insn_reservation "cortex_r4_mul_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smulxy,smulwy,smuad,smusd"))
  "cortex_r4_mul")

(define_insn_reservation "cortex_r4_mla_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mla,smmla"))
  "cortex_r4_mul_2")

(define_insn_reservation "cortex_r4_mla_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smlaxy,smlawy,smlad,smlsd"))
  "cortex_r4_mul")

;; Latency 3, one cycle of A+ALU.
(define_insn_reservation "cortex_r4_smlald" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smlald,smlsld"))
  "cortex_r4_mul")

;; Latency 4, two cycles of A+ALU.
(define_insn_reservation "cortex_r4_mull" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smull,umull,umlal,umaal"))
  "cortex_r4_mul_2")

;; A multiply or an MLA with a single-register result, followed by an
;; MLA with an accumulator dependency, has its result forwarded.
;; The forwarded latency is one less than the producer's default
;; (3 -> 2 and 4 -> 3), subject to the guard
;; arm_mac_accumulator_is_mul_result, which is defined outside this file.
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

;; A multiply followed by an ALU instruction needing the multiply
;; result only at ALU has lower latency than one needing it at Shift.
;; Latency-3 multiplies feeding an ALU consumer: latency 2.  Shifted
;; consumers qualify only when the named guard accepts the dependency.
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")
;; Latency-4 multiplies feeding an ALU consumer: latency 3, with the
;; same guards for shifted consumers.
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; A multiply followed by a mov has one cycle lower latency again.
;; That is 1 for the latency-3 multiplies and 2 for the latency-4 ones.
(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_mov")
(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_mov")

;; We guess that division of A/B using sdiv or udiv, on average,
;; is performed with B having ten more leading zeros than A.
;; This gives a latency of nine for udiv and ten for sdiv.
;; Each uses the division reservation of matching length.
(define_insn_reservation "cortex_r4_udiv" 9
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "udiv"))
  "cortex_r4_div_9")

(define_insn_reservation "cortex_r4_sdiv" 10
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "sdiv"))
  "cortex_r4_div_10")

;; Branches.  We assume correct prediction.
;; Modelled with latency 0 and a reservation of issue unit B only.

(define_insn_reservation "cortex_r4_branch" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "branch"))
  "cortex_r4_branch")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
;; Calls: latency 32, with no issue unit reserved ("nothing").
(define_insn_reservation "cortex_r4_call" 32
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "call"))
  "nothing")

;; Status register access instructions are not currently emitted.

;; Load instructions.
;; We do not model the "addr_md_3cycle" cases and assume that
;; accesses following are correctly aligned.

;; load1/load2: default latency 3, A+C for one cycle.
(define_insn_reservation "cortex_r4_load_1_2" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load1,load2"))
  "cortex_r4_load_store")

;; load3/load4: default latency 4, A+B for two consecutive cycles.
(define_insn_reservation "cortex_r4_load_3_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load3,load4"))
  "cortex_r4_load_store_2")

;; If a producing load is followed by an instruction consuming only
;; as a Normal Reg, there is one fewer cycle of latency.
;; That is 2 instead of 3 for load1/load2 and 3 instead of 4 for
;; load3/load4.  Shifted consumers qualify only when the named guard
;; (defined outside this file) accepts the dependency.

(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; If a producing load is followed by an instruction consuming only
;; as a Late Reg, there are two fewer cycles of latency.  Such consumer
;; instructions are moves and stores.
;; That is 1 for load1/load2 and 2 for load3/load4.

(define_bypass 1 "cortex_r4_load_1_2"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
(define_bypass 2 "cortex_r4_load_3_4"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")

;; If a producer's result is required as the base or offset of a load,
;; there is an extra cycle latency.

;; Producer-to-load bypasses.  Each latency is the producer's default
;; latency plus one: 2+1 for data-processing instructions, 3+1 for the
;; latency-3 multiplies and 4+1 for the latency-4 multiplies.
;; NOTE(review): these bypasses carry no guard function, so they apply
;; to every dependency from these producers to a load, not only to
;; base/offset operands -- confirm that this approximation is intended.

(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\
                  cortex_r4_alu_shift_reg"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

;; Store instructions.
;; Both groups have latency 0.  store1/store2 reserve A+C for one cycle;
;; store3/store4 reserve A+B for two consecutive cycles.

(define_insn_reservation "cortex_r4_store_1_2" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store1,store2"))
  "cortex_r4_load_store")

(define_insn_reservation "cortex_r4_store_3_4" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store3,store4"))
  "cortex_r4_load_store_2")
