;; ARM Cortex-A7 pipeline description
;; Copyright (C) 2012-2015 Free Software Foundation, Inc.
;;
;; Contributed by ARM Ltd.
;; Based on cortex-a5.md which was originally contributed by CodeSourcery.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>.

;; Coarse classification of NEON instructions for this pipeline model.
;; The "type" values listed below map to "neon_mul" or "neon_mla";
;; every other instruction gets "neon_other".

(define_attr "cortex_a7_neon_type"
  "neon_mul, neon_mla, neon_other"
  (cond [
          (eq_attr "type" "neon_mul_b, neon_mul_b_q, neon_mul_h, neon_mul_h_q,\
                           neon_mul_s, neon_mul_s_q,\
                           neon_mul_b_long, neon_mul_h_long, neon_mul_s_long,\
                           neon_mul_h_scalar, neon_mul_h_scalar_q,\
                           neon_mul_s_scalar, neon_mul_s_scalar_q,\
                           neon_mul_h_scalar_long, neon_mul_s_scalar_long,\
                           neon_sat_mul_b, neon_sat_mul_b_q,\
                           neon_sat_mul_h, neon_sat_mul_h_q,\
                           neon_sat_mul_s, neon_sat_mul_s_q,\
                           neon_sat_mul_b_long, neon_sat_mul_h_long,\
                           neon_sat_mul_s_long,\
                           neon_sat_mul_h_scalar, neon_sat_mul_h_scalar_q,\
                           neon_sat_mul_s_scalar, neon_sat_mul_s_scalar_q,\
                           neon_sat_mul_h_scalar_long,\
                           neon_sat_mul_s_scalar_long,\
                           neon_fp_mul_s, neon_fp_mul_s_q,\
                           neon_fp_mul_s_scalar, neon_fp_mul_s_scalar_q")
            (const_string "neon_mul")
          (eq_attr "type" "neon_mla_b, neon_mla_b_q, neon_mla_h, neon_mla_h_q,\
                           neon_mla_s, neon_mla_s_q,\
                           neon_mla_b_long, neon_mla_h_long, neon_mla_s_long,\
                           neon_mla_h_scalar, neon_mla_h_scalar_q,\
                           neon_mla_s_scalar, neon_mla_s_scalar_q,\
                           neon_mla_h_scalar_long, neon_mla_s_scalar_long,\
                           neon_sat_mla_b_long, neon_sat_mla_h_long,\
                           neon_sat_mla_s_long,\
                           neon_sat_mla_h_scalar_long,\
                           neon_sat_mla_s_scalar_long,\
                           neon_fp_mla_s, neon_fp_mla_s_q,\
                           neon_fp_mla_s_scalar, neon_fp_mla_s_scalar_q")
            (const_string "neon_mla")]
        (const_string "neon_other")))

(define_automaton "cortex_a7")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The Cortex-A7 integer and VFP pipelines.
;; Decode is identical for every instruction, so it is not modeled.
;; Only the first execution stage is modeled, because instructions
;; always advance one stage per cycle, in order.
;; The LS, Branch, ALU, MAC and FPU pipelines are all modeled together.

(define_cpu_unit "cortex_a7_ex1, cortex_a7_ex2" "cortex_a7")

;; Shorthand for occupying both issue slots in the same cycle.

(define_reservation "cortex_a7_both" "cortex_a7_ex1+cortex_a7_ex2")

(define_cpu_unit "cortex_a7_branch" "cortex_a7")

;; Cortex-A7 is in order and can dual-issue under limited circumstances.
;; ex2 can be reserved only after ex1 is reserved.

(final_presence_set "cortex_a7_ex2" "cortex_a7_ex1")

;; Pseudo-unit that blocks the multiply pipeline while a
;; double-precision multiply is in progress.

(define_cpu_unit "cortex_a7_fpmul_pipe" "cortex_a7")

;; The floating-point add pipeline (ex1/f1 stage), used to model the
;; usage of the add pipeline by fmac instructions, etc.

(define_cpu_unit "cortex_a7_fpadd_pipe" "cortex_a7")

;; Floating-point div/sqrt (long latency, out-of-order completion).

(define_cpu_unit "cortex_a7_fp_div_sqrt" "cortex_a7")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; A direct branch may dual-issue as either the younger or the older
;; instruction, but two branches cannot dual-issue with each other.
;; Latency is zero because a branch produces no result.

(define_insn_reservation "cortex_a7_branch" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "branch"))
  "(cortex_a7_ex2|cortex_a7_ex1)+cortex_a7_branch")

;; A call cannot dual-issue as the older instruction.  It can dual-issue
;; as the younger instruction, or issue alone.  A call cannot dual-issue
;; with another branch instruction.  Its result is available in the
;; next cycle.

(define_insn_reservation "cortex_a7_call" 1
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "call"))
  "(cortex_a7_ex2|cortex_a7_both)+cortex_a7_branch")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; An ALU instruction with an immediate operand can dual-issue.
;; Register and shifted moves are also handled here unless their
;; "length" attribute is 8.

(define_insn_reservation "cortex_a7_alu_imm" 2
  (and (eq_attr "tune" "cortexa7")
       (ior (eq_attr "type" "adr,alu_imm,alus_imm,logic_imm,logics_imm,\
                             mov_imm,mvn_imm,extend")
            (and (eq_attr "type" "mov_reg,mov_shift,mov_shift_reg")
                 (not (eq_attr "length" "8")))))
  "cortex_a7_ex2|cortex_a7_ex1")

;; ALU instruction with register operands can dual-issue
;; with a younger immediate-based instruction.
(define_insn_reservation "cortex_a7_alu_sreg" 2
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "alu_sreg,alus_sreg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        bfm,clz,rbit,rev,alu_dsp_reg,\
                        shift_imm,shift_reg,mov_reg,mvn_reg"))
  "cortex_a7_ex1")

;; Shifted-operand ALU forms, plus the "mrs", "multiple" and "no_insn"
;; types; these reserve ex1 only.

(define_insn_reservation "cortex_a7_alu_shift" 2
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,\
                        logic_shift_imm,logics_shift_imm,\
                        alu_shift_reg,alus_shift_reg,\
                        logic_shift_reg,logics_shift_reg,\
                        mov_shift,mov_shift_reg,\
                        mvn_shift,mvn_shift_reg,\
                        mrs,multiple,no_insn"))
  "cortex_a7_ex1")

;; Forwarding path for unshifted operands.

(define_bypass 1 "cortex_a7_alu_imm,cortex_a7_alu_sreg,cortex_a7_alu_shift"
               "cortex_a7_alu_imm,cortex_a7_alu_sreg,cortex_a7_mul")

;; ALU result feeding a store, guarded by arm_no_early_store_addr_dep.

(define_bypass 1 "cortex_a7_alu_imm,cortex_a7_alu_sreg,cortex_a7_alu_shift"
               "cortex_a7_store*"
               "arm_no_early_store_addr_dep")

;; ALU result feeding a shifted ALU operation, guarded by
;; arm_no_early_alu_shift_dep.

(define_bypass 1 "cortex_a7_alu_imm,cortex_a7_alu_sreg,cortex_a7_alu_shift"
               "cortex_a7_alu_shift"
               "arm_no_early_alu_shift_dep")

;; The multiplier pipeline can forward results from wr stage only so
;; there's no need to specify bypasses.
;; Multiply instructions cannot dual-issue.

(define_insn_reservation "cortex_a7_mul" 2
  (and (eq_attr "tune" "cortexa7")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "mul64" "yes")))
  "cortex_a7_both")

;; Forward the result of a multiply operation to the accumulator
;; of the following multiply and accumulate instruction.

(define_bypass 1 "cortex_a7_mul"
               "cortex_a7_mul"
               "arm_mac_accumulator_is_result")

;; The latency depends on the operands, so we use an estimate here.
(define_insn_reservation "cortex_a7_idiv" 5
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "udiv,sdiv"))
  "cortex_a7_both*5")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Address-generation happens in the issue stage.
;; Double-word accesses can be issued in a single cycle,
;; and occupy only one pipeline stage.

;; Single-register accesses reserve ex1 only.

(define_insn_reservation "cortex_a7_load1" 2
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "load_byte,load1"))
  "cortex_a7_ex1")

(define_insn_reservation "cortex_a7_store1" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "store1"))
  "cortex_a7_ex1")

;; Two-register accesses reserve both issue slots for one cycle.

(define_insn_reservation "cortex_a7_load2" 2
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "load2"))
  "cortex_a7_both")

(define_insn_reservation "cortex_a7_store2" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "store2"))
  "cortex_a7_both")

;; Three-register accesses reserve both issue slots in the first cycle
;; and ex1 in the second.

(define_insn_reservation "cortex_a7_load3" 3
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "load3"))
  "cortex_a7_both, cortex_a7_ex1")

;; Matches "store3" so that the reservation name, the instruction type
;; and the unit usage of the corresponding load all agree.

(define_insn_reservation "cortex_a7_store3" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "store3"))
  "cortex_a7_both, cortex_a7_ex1")

;; Four-register accesses reserve both issue slots for two cycles.

(define_insn_reservation "cortex_a7_load4" 3
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "load4"))
  "cortex_a7_both, cortex_a7_both")

;; Matches "store4", mirroring cortex_a7_load4.

(define_insn_reservation "cortex_a7_store4" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "store4"))
  "cortex_a7_both, cortex_a7_both")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Neon integer, neon floating point, and single-precision floating
;; point instructions of the same type share the same timing
;; characteristics, but neon instructions cannot dual-issue.

(define_insn_reservation "cortex_a7_fpalu" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov,\
                        f_cvt, f_cvtf2i, f_cvti2f, fcmps, fcmpd"))
  "cortex_a7_ex1+cortex_a7_fpadd_pipe")

;; For fconsts and fconstd, 8-bit immediate data is passed directly from
;; f1 to f3 (which I think reduces the latency by one cycle).

(define_insn_reservation "cortex_a7_fconst" 3
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a7_ex1+cortex_a7_fpadd_pipe")

;; Avoid issuing a single-precision multiplication in the middle of a
;; double-precision multiplication; this is what the
;; cortex_a7_fpmul_pipe reservation is used for.

(define_insn_reservation "cortex_a7_fpmuls" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fmuls"))
  "cortex_a7_ex1+cortex_a7_fpmul_pipe")

;; NEON multiplies hold both issue slots and the multiply pipe for two
;; cycles.

(define_insn_reservation "cortex_a7_neon_mul" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "cortex_a7_neon_type" "neon_mul"))
  "(cortex_a7_both+cortex_a7_fpmul_pipe)*2")

;; Single-precision multiply-accumulate and fused multiply-add.

(define_insn_reservation "cortex_a7_fpmacs" 8
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fmacs,ffmas"))
  "cortex_a7_ex1+cortex_a7_fpmul_pipe")

;; NEON multiply-accumulate holds both issue slots and the multiply pipe
;; for one cycle.

(define_insn_reservation "cortex_a7_neon_mla" 8
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "cortex_a7_neon_type" "neon_mla"))
  "cortex_a7_both+cortex_a7_fpmul_pipe")

;; Latency between the multiply-accumulates above is 4 when the guard
;; arm_mac_accumulator_is_result holds.

(define_bypass 4 "cortex_a7_fpmacs,cortex_a7_neon_mla"
                 "cortex_a7_fpmacs,cortex_a7_neon_mla"
                 "arm_mac_accumulator_is_result")

;; Non-multiply instructions can issue between two cycles of a
;; double-precision multiply.
(define_insn_reservation "cortex_a7_fpmuld" 7
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fmuld"))
  "cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*3")

;; Double-precision multiply-accumulate.

(define_insn_reservation "cortex_a7_fpmacd" 11
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fmacd"))
  "cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*3")

;; Double-precision fused multiply-add.

(define_insn_reservation "cortex_a7_fpfmad" 8
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "ffmad"))
  "cortex_a7_ex1+cortex_a7_fpmul_pipe, cortex_a7_fpmul_pipe*4")

;; Latency from cortex_a7_fpmacd to a following double-precision
;; multiply-accumulate is 7 when the guard
;; arm_mac_accumulator_is_result holds.

(define_bypass 7 "cortex_a7_fpmacd"
                 "cortex_a7_fpmacd,cortex_a7_fpfmad"
                 "arm_mac_accumulator_is_result")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Issue through ex1, then keep the div/sqrt unit busy on its own for
;; the following cycles.

(define_insn_reservation "cortex_a7_fdivs" 16
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fdivs, fsqrts"))
  "cortex_a7_ex1+cortex_a7_fp_div_sqrt, cortex_a7_fp_div_sqrt * 13")

(define_insn_reservation "cortex_a7_fdivd" 31
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "fdivd, fsqrtd"))
  "cortex_a7_ex1+cortex_a7_fp_div_sqrt, cortex_a7_fp_div_sqrt * 28")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Core-to-VFP transfers.

(define_insn_reservation "cortex_a7_r2f" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_mcr,f_mcrr"))
  "cortex_a7_both")

;; VFP-to-core transfers.

(define_insn_reservation "cortex_a7_f2r" 2
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_mrc,f_mrrc"))
  "cortex_a7_ex1")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; FIXME: The flag forwarding from fmstat to the second instruction is
;; not modeled at present.

(define_insn_reservation "cortex_a7_f_flags" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_flag"))
  "cortex_a7_ex1")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The f_loads/f_stores types reserve ex1 only; the f_loadd/f_stored
;; types reserve both issue slots.

(define_insn_reservation "cortex_a7_f_loads" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_loads"))
  "cortex_a7_ex1")

(define_insn_reservation "cortex_a7_f_loadd" 4
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_loadd"))
  "cortex_a7_both")

(define_insn_reservation "cortex_a7_f_stores" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_stores"))
  "cortex_a7_ex1")

(define_insn_reservation "cortex_a7_f_stored" 0
  (and (eq_attr "tune" "cortexa7")
       (eq_attr "type" "f_stored"))
  "cortex_a7_both")

;; Load-to-use for floating-point values has a penalty of one cycle,
;; i.e. a latency of two.

(define_bypass 2 "cortex_a7_f_loads, cortex_a7_f_loadd"
                 "cortex_a7_fpalu,\
                  cortex_a7_fpmuls,cortex_a7_fpmacs,\
                  cortex_a7_fpmuld,cortex_a7_fpmacd, cortex_a7_fpfmad,\
                  cortex_a7_fdivs, cortex_a7_fdivd,\
                  cortex_a7_f2r")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; NEON
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Simple modeling for all neon instructions not covered earlier:
;; both issue slots are held for two cycles.

(define_insn_reservation "cortex_a7_neon" 4
  (and (eq_attr "tune" "cortexa7")
       (and (eq_attr "is_neon_type" "yes")
            (eq_attr "cortex_a7_neon_type" "neon_other")))
  "cortex_a7_both*2")