;; ARM Cortex-A53 pipeline description
;; Copyright (C) 2013-2015 Free Software Foundation, Inc.
;;
;; Contributed by ARM Ltd.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; All units and reservations in this file belong to this one automaton.
(define_automaton "cortex_a53")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Functional units.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; There are two main integer execution pipelines, described as
;; issue slot 0 and issue slot 1.

(define_cpu_unit "cortex_a53_slot0" "cortex_a53")
(define_cpu_unit "cortex_a53_slot1" "cortex_a53")

;; "slot_any" takes whichever issue slot is free; "single_issue" takes
;; both slots in the same cycle.
(define_reservation "cortex_a53_slot_any"
  "cortex_a53_slot0|cortex_a53_slot1")
(define_reservation "cortex_a53_single_issue"
  "cortex_a53_slot0+cortex_a53_slot1")

;; The load/store pipeline.  Load/store instructions can dual-issue from
;; either pipeline, but two load/stores cannot simultaneously issue.

(define_cpu_unit "cortex_a53_ls" "cortex_a53")

;; The store pipeline.  Shared between both execution pipelines.

(define_cpu_unit "cortex_a53_store" "cortex_a53")

;; The branch pipeline.  Branches can dual-issue with other instructions
;; (except when those instructions take multiple cycles to issue).

(define_cpu_unit "cortex_a53_branch" "cortex_a53")

;; The integer divider.
(define_cpu_unit "cortex_a53_idiv" "cortex_a53")

;; The floating-point add pipeline used to model the usage
;; of the add pipeline by fmac instructions.

(define_cpu_unit "cortex_a53_fpadd_pipe" "cortex_a53")

;; Floating-point div/sqrt (long latency, out-of-order completion).

(define_cpu_unit "cortex_a53_fp_div_sqrt" "cortex_a53")

;; The Advanced SIMD pipelines.

(define_cpu_unit "cortex_a53_simd0" "cortex_a53")
(define_cpu_unit "cortex_a53_simd1" "cortex_a53")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ALU instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Plain ALU operations: default latency 2, issued in either slot.
(define_insn_reservation "cortex_a53_alu" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                        alu_sreg,alus_sreg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        adr,bfm,csel,clz,rbit,rev,alu_dsp_reg,\
                        shift_imm,shift_reg,\
                        mov_imm,mov_reg,mvn_imm,mvn_reg,\
                        mrs,multiple,no_insn"))
  "cortex_a53_slot_any")

;; ALU operations with a shifted or extended operand: same default
;; latency and slot usage as above, kept separate so the bypasses below
;; can treat them differently.
(define_insn_reservation "cortex_a53_alu_shift" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,\
                        crc,logic_shift_imm,logics_shift_imm,\
                        alu_ext,alus_ext,alu_shift_reg,alus_shift_reg,\
                        logic_shift_reg,logics_shift_reg,\
                        extend,mov_shift,mov_shift_reg,\
                        mvn_shift,mvn_shift_reg"))
  "cortex_a53_slot_any")

;; Forwarding path for unshifted operands.

(define_bypass 1 "cortex_a53_alu,cortex_a53_alu_shift"
  "cortex_a53_alu")

;; Into a shift-type consumer the 1-cycle bypass applies only when the
;; guard function accepts the dependency.
(define_bypass 1 "cortex_a53_alu,cortex_a53_alu_shift"
  "cortex_a53_alu_shift"
  "arm_no_early_alu_shift_dep")

;; The multiplier pipeline can forward results so there's no need to specify
;; bypasses.  Multiplies can only single-issue currently.
;; Any insn whose "mul32" or "mul64" attribute is "yes": default
;; latency 3, occupying both issue slots.
(define_insn_reservation "cortex_a53_mul" 3
  (and (eq_attr "tune" "cortexa53")
       (ior (eq_attr "mul32" "yes")
            (eq_attr "mul64" "yes")))
  "cortex_a53_single_issue")

;; A multiply with a single-register result or an MLA, followed by an
;; MLA with an accumulator dependency, has its result forwarded so two
;; such instructions can issue back-to-back.

(define_bypass 1 "cortex_a53_mul"
  "cortex_a53_mul"
  "arm_mac_accumulator_is_mul_result")

;; Punt with a high enough latency for divides.
;; Both divides issue from slot 0 and keep the divider unit reserved for
;; the whole of their latency (1 + 7 cycles unsigned, 1 + 8 signed).
(define_insn_reservation "cortex_a53_udiv" 8
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "udiv"))
  "(cortex_a53_slot0+cortex_a53_idiv),cortex_a53_idiv*7")

(define_insn_reservation "cortex_a53_sdiv" 9
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "sdiv"))
  "(cortex_a53_slot0+cortex_a53_idiv),cortex_a53_idiv*8")


;; Multiply and divide results reach a dependent ALU operation with a
;; latency of 2; for shift-type consumers only when the guard accepts.
(define_bypass 2 "cortex_a53_mul,cortex_a53_udiv,cortex_a53_sdiv"
  "cortex_a53_alu")
(define_bypass 2 "cortex_a53_mul,cortex_a53_udiv,cortex_a53_sdiv"
  "cortex_a53_alu_shift"
  "arm_no_early_alu_shift_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Load/store instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Address-generation happens in the issue stage.
;; Single-register accesses issue in either slot together with the
;; load/store unit; stores additionally take the store unit.
(define_insn_reservation "cortex_a53_load1" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load_byte,load1,load_acq"))
  "cortex_a53_slot_any+cortex_a53_ls")

(define_insn_reservation "cortex_a53_store1" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store1,store_rel"))
  "cortex_a53_slot_any+cortex_a53_ls+cortex_a53_store")

;; Two-register accesses take both issue slots for one cycle.
(define_insn_reservation "cortex_a53_load2" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load2"))
  "cortex_a53_single_issue+cortex_a53_ls")

(define_insn_reservation "cortex_a53_store2" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store2"))
  "cortex_a53_single_issue+cortex_a53_ls+cortex_a53_store")

;; Three- and four-register accesses hold the same units for two
;; consecutive cycles.
(define_insn_reservation "cortex_a53_load3plus" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "load3,load4"))
  "(cortex_a53_single_issue+cortex_a53_ls)*2")

(define_insn_reservation "cortex_a53_store3plus" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "store3,store4"))
  "(cortex_a53_single_issue+cortex_a53_ls+cortex_a53_store)*2")

;; Load/store addresses are required early in Issue.
(define_bypass 3 "cortex_a53_load1,cortex_a53_load2,cortex_a53_load3plus,cortex_a53_alu,cortex_a53_alu_shift"
  "cortex_a53_load*"
  "arm_early_load_addr_dep")
(define_bypass 3 "cortex_a53_load1,cortex_a53_load2,cortex_a53_load3plus,cortex_a53_alu,cortex_a53_alu_shift"
  "cortex_a53_store*"
  "arm_early_store_addr_dep")

;; Load data can forward in the ALU pipeline
(define_bypass 2 "cortex_a53_load1,cortex_a53_load2"
  "cortex_a53_alu")
(define_bypass 2 "cortex_a53_load1,cortex_a53_load2"
  "cortex_a53_alu_shift"
  "arm_no_early_alu_shift_dep")

;; ALU ops can forward to stores.
;; Latency into a store from an ALU producer is 0, and from a multiply,
;; divide or load producer is 1, whenever the guard function accepts the
;; dependency.
(define_bypass 0 "cortex_a53_alu,cortex_a53_alu_shift"
  "cortex_a53_store1,cortex_a53_store2,cortex_a53_store3plus"
  "arm_no_early_store_addr_dep")

(define_bypass 1 "cortex_a53_mul,cortex_a53_udiv,cortex_a53_sdiv,cortex_a53_load1,cortex_a53_load2,cortex_a53_load3plus"
  "cortex_a53_store1,cortex_a53_store2,cortex_a53_store3plus"
  "arm_no_early_store_addr_dep")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Branches.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; Currently models all branches as dual-issuable from either execution
;; slot, which isn't true for all cases.  We still need to model indirect
;; branches.

(define_insn_reservation "cortex_a53_branch" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "branch,call"))
  "cortex_a53_slot_any+cortex_a53_branch")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point arithmetic.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; NOTE(review): "fmuls" is listed here and also in cortex_a53_fpmul
;; below; presumably only one of the two reservations can apply to it --
;; confirm which one is intended.
(define_insn_reservation "cortex_a53_fpalu" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "ffariths, fadds, ffarithd, faddd, fmov, fmuls,\
                        f_cvt,f_cvtf2i,f_cvti2f,\
                        fcmps, fcmpd, fcsel, f_rints, f_rintd, f_minmaxs,\
                        f_minmaxd"))
  "cortex_a53_slot0+cortex_a53_fpadd_pipe")

(define_insn_reservation "cortex_a53_fconst" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fconsts,fconstd"))
  "cortex_a53_slot0+cortex_a53_fpadd_pipe")

;; Multiplies reserve slot 0 only, not the add pipeline.
(define_insn_reservation "cortex_a53_fpmul" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmuls,fmuld"))
  "cortex_a53_slot0")

;; For single-precision multiply-accumulate, the add (accumulate) is issued after
;; the multiply completes.  Model that accordingly.
;; Slot 0 in the first cycle, three idle cycles, then the add pipeline.
;; NOTE(review): the type list also contains the double-precision types
;; fmacd and ffmad, not only the single-precision ones.
(define_insn_reservation "cortex_a53_fpmac" 8
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fmacs,fmacd,ffmas,ffmad"))
  "cortex_a53_slot0, nothing*3, cortex_a53_fpadd_pipe")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Floating-point divide/square root instructions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; fsqrt really takes one cycle less, but that is not modelled.

;; The div/sqrt unit is reserved for fewer cycles (5 and 8) than the
;; result latency (14 and 29).
(define_insn_reservation "cortex_a53_fdivs" 14
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fdivs, fsqrts"))
  "cortex_a53_slot0, cortex_a53_fp_div_sqrt * 5")

(define_insn_reservation "cortex_a53_fdivd" 29
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "fdivd, fsqrtd"))
  "cortex_a53_slot0, cortex_a53_fp_div_sqrt * 8")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; ARMv8-A Cryptographic extensions.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_crypto_aese" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aese"))
  "cortex_a53_simd0")

;; The only crypto reservation that may use either SIMD unit.
(define_insn_reservation "cortex_a53_crypto_aesmc" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_aesmc"))
  "cortex_a53_simd0 | cortex_a53_simd1")

(define_insn_reservation "cortex_a53_crypto_sha1_fast" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_fast, crypto_sha256_fast"))
  "cortex_a53_simd0")

(define_insn_reservation "cortex_a53_crypto_sha1_xor" 3
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_xor"))
  "cortex_a53_simd0")

(define_insn_reservation "cortex_a53_crypto_sha_slow" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "crypto_sha1_slow, crypto_sha256_slow"))
  "cortex_a53_simd0")

;; Zero latency from AESE to a dependent AESMC when the guard function
;; accepts the pair.
(define_bypass 0 "cortex_a53_crypto_aese"
  "cortex_a53_crypto_aesmc"
  "aarch_crypto_can_dual_issue")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP to/from core transfers.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_r2f" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mcr,f_mcrr"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_f2r" 2
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_mrc,f_mrrc"))
  "cortex_a53_slot0")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP flag transfer.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(define_insn_reservation "cortex_a53_f_flags" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_flag"))
  "cortex_a53_slot0")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; VFP load/store.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; The scalar VFP loads and stores below reserve slot 0 only; the two
;; neon_load* reservations use either slot plus the load/store unit.
(define_insn_reservation "cortex_a53_f_loads" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_loads"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_f_loadd" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_loadd"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_f_load_2reg" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "neon_load2_2reg_q"))
  "(cortex_a53_slot_any+cortex_a53_ls)*2")

(define_insn_reservation "cortex_a53_f_loadq" 5
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "neon_load1_1reg_q"))
  "cortex_a53_slot_any+cortex_a53_ls")

(define_insn_reservation "cortex_a53_f_stores" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_stores"))
  "cortex_a53_slot0")

(define_insn_reservation "cortex_a53_f_stored" 0
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "type" "f_stored"))
  "cortex_a53_slot0")

;; Load-to-use for floating-point values has a penalty of one cycle,
;; i.e. a latency of two.

(define_bypass 2 "cortex_a53_f_loads"
  "cortex_a53_fpalu, cortex_a53_fpmac, cortex_a53_fpmul,\
   cortex_a53_fdivs, cortex_a53_fdivd,\
   cortex_a53_f2r")

(define_bypass 2 "cortex_a53_f_loadd"
  "cortex_a53_fpalu, cortex_a53_fpmac, cortex_a53_fpmul,\
   cortex_a53_fdivs, cortex_a53_fdivd,\
   cortex_a53_f2r")

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Crude Advanced SIMD approximation.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;; NOTE(review): this reservation is named "cortex_53_advsimd" (no "a"),
;; unlike every other cortex_a53_* name in this file.  The name is left
;; unchanged in case it is referenced elsewhere.
(define_insn_reservation "cortex_53_advsimd" 4
  (and (eq_attr "tune" "cortexa53")
       (eq_attr "is_neon_type" "yes"))
  "cortex_a53_simd0")