;; DFA-based pipeline description for the VR1x000.
;; Copyright (C) 2005-2015 Free Software Foundation, Inc.
;;
;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>.


;; R12K/R14K/R16K are derivatives of R10K, thus copy its description
;; until specific tuning for each is added.

;; R10000 has an int queue, fp queue, address queue.
;; The int queue feeds ALU1 and ALU2.
;; The fp queue feeds the fp-adder and fp-multiplier.
;; The addr queue feeds the Load/Store unit.
;;
;; However, we define the fp-adder and fp-multiplier as
;; separate automata, because the fp-multiplier is
;; divided into fp-multiplier, fp-division, and
;; fp-squareroot units, all of which share the same
;; issue and completion logic, yet can operate in
;; parallel.
;;
;; This is based on the model described in the R10K Manual
;; and it helps to reduce the size of the automata.
(define_automaton "r10k_a_int, r10k_a_fpadder, r10k_a_addr,
                   r10k_a_fpmpy, r10k_a_fpdiv, r10k_a_fpsqrt")

;; Functional units.  Both integer ALUs belong to the integer
;; automaton; every other unit is the sole member of its automaton.
(define_cpu_unit "r10k_alu1" "r10k_a_int")
(define_cpu_unit "r10k_alu2" "r10k_a_int")
(define_cpu_unit "r10k_fpadd" "r10k_a_fpadder")
(define_cpu_unit "r10k_fpmpy" "r10k_a_fpmpy")
(define_cpu_unit "r10k_fpdiv" "r10k_a_fpdiv")
(define_cpu_unit "r10k_fpsqrt" "r10k_a_fpsqrt")
(define_cpu_unit "r10k_loadstore" "r10k_a_addr")


;; R10k loads and stores.
;; All of them reserve the single load/store unit; only the result
;; latency differs between the three groups.
(define_insn_reservation "r10k_load" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "load,prefetch,prefetchx"))
  "r10k_loadstore")

(define_insn_reservation "r10k_store" 0
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "store,fpstore,fpidxstore"))
  "r10k_loadstore")

(define_insn_reservation "r10k_fpload" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fpload,fpidxload"))
  "r10k_loadstore")


;; Integer add/sub, logic ops and moves to HI/LO may issue to either
;; alu1 or alu2.  Miscellaneous arithmetic is placed here as well
;; (this is a guess).
(define_insn_reservation "r10k_arith" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "arith,mthi,mtlo,slt,clz,const,nop,trap,logical"))
  "r10k_alu1 | r10k_alu2")

;; Moves from HI and from LO get separate reservations, because we
;; need to know which of the two registers is being read.
(define_insn_reservation "r10k_mfhi" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfhi"))
  "r10k_alu1 | r10k_alu2")

(define_insn_reservation "r10k_mflo" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mflo"))
  "r10k_alu1 | r10k_alu2")


;; Shifts, branch evaluation and conditional moves are handled by ALU1.
;;
;; The brancher is separate, but part of ALU1, and can only do one
;; branch per cycle (is this even implementable?).
;;
;; It is unclear whether the brancher handles jumps and calls as well,
;; but since they are related, they are added here for now.
(define_insn_reservation "r10k_brancher" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "shift,branch,jump,call"))
  "r10k_alu1")

;; Conditional moves in an integer mode (SI or DI) also reserve ALU1.
(define_insn_reservation "r10k_int_cmove" 1
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SI,DI")))
  "r10k_alu1")


;; Coprocessor moves.
;; mtc1/dmtc1 are handled by ALU1.
;; mfc1/dmfc1 are handled by the fp-multiplier.
(define_insn_reservation "r10k_mt_xfer" 3
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mtc"))
  "r10k_alu1")

(define_insn_reservation "r10k_mf_xfer" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "mfc"))
  "r10k_fpmpy")


;; Integer multiplications and divisions are done by ALU2 only.
;;
;; According to the Vr10000 series user manual, integer mult and div
;; insns can be issued one cycle earlier if using register Lo.  We
;; model this by using the Lo value by default, as it is the more
;; common value, and use a bypass for the Hi value when needed.
;;
;; Also of note, there are different latencies for MULT/DMULT
;; (Lo 5/Hi 6) and MULTU/DMULTU (Lo 6/Hi 7).  However, gcc does not
;; have separate types for these insns.  Thus, to strike a balance,
;; we use the Hi latency value for imul operations until the imul
;; type can be split.
(define_insn_reservation "r10k_imul_single" 6
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 6")

(define_insn_reservation "r10k_imul_double" 10
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "imul,imul3")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 10")

;; Divides keep ALU2 busy.
(define_insn_reservation "r10k_idiv_single" 34
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "SI")))
  "r10k_alu2 * 35")

(define_insn_reservation "r10k_idiv_double" 66
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "idiv")
            (eq_attr "mode" "DI")))
  "r10k_alu2 * 67")

;; A divide result read through mfhi is available one cycle later than
;; the default latency declared on the divide reservations above.
(define_bypass 35 "r10k_idiv_single" "r10k_mfhi")
(define_bypass 67 "r10k_idiv_double" "r10k_mfhi")


;; Floating point add/sub, mul, abs value, neg, comp, & moves.
;; Add, abs, neg and compare reserve the fp-adder; multiply and
;; fp moves (including fp conditional moves) reserve the fp-multiplier.
(define_insn_reservation "r10k_fp_miscadd" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fadd,fabs,fneg,fcmp"))
  "r10k_fpadd")

(define_insn_reservation "r10k_fp_miscmul" 2
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmul,fmove"))
  "r10k_fpmpy")

(define_insn_reservation "r10k_fp_cmove" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "condmove")
            (eq_attr "mode" "SF,DF")))
  "r10k_fpmpy")


;; Conversions.
;; The fcvt.s.[wl] insn has latency 4, repeat 2.
;; All other fcvt insns have latency 2, repeat 1.
(define_insn_reservation "r10k_fcvt_single" 4
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "I2S")))
  "r10k_fpadd * 2")

(define_insn_reservation "r10k_fcvt_other" 2
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fcvt")
            (eq_attr "cnv_mode" "!I2S")))
  "r10k_fpadd")


;; The fmadd insn is run through the fp-adder first, then through the
;; fp-multiplier.
;;
;; Its latency drops to 2 cycles if the result is used by another
;; fmadd instruction; the bypass below models that case.
(define_insn_reservation "r10k_fmadd" 4
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "fmadd"))
  "r10k_fpadd, r10k_fpmpy")

(define_bypass 2 "r10k_fmadd" "r10k_fmadd")


;; Floating point divisions & square roots.
(define_insn_reservation "r10k_fdiv_single" 12
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "SF")))
  "r10k_fpdiv * 14")

(define_insn_reservation "r10k_fdiv_double" 19
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fdiv,frdiv")
            (eq_attr "mode" "DF")))
  "r10k_fpdiv * 21")

;; Square roots reserve the fp-squareroot unit.
(define_insn_reservation "r10k_fsqrt_single" 18
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

(define_insn_reservation "r10k_fsqrt_double" 33
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "fsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")

;; Reciprocal square roots reserve the fp-squareroot unit for the same
;; number of cycles as the plain square roots above, but are given a
;; longer result latency.
(define_insn_reservation "r10k_frsqrt_single" 30
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "SF")))
  "r10k_fpsqrt * 20")

(define_insn_reservation "r10k_frsqrt_double" 52
  (and (eq_attr "cpu" "r10000")
       (and (eq_attr "type" "frsqrt")
            (eq_attr "mode" "DF")))
  "r10k_fpsqrt * 35")


;; Unknown and multi insns are handled here (this is a guess).
;; They reserve both integer ALUs in the same cycle.
(define_insn_reservation "r10k_unknown" 1
  (and (eq_attr "cpu" "r10000")
       (eq_attr "type" "unknown,multi,atomic,syncloop"))
  "r10k_alu1 + r10k_alu2")