k6.md revision 169689
;; AMD K6/K6-2 Scheduling
;; Copyright (C) 2002, 2004
;; Free Software Foundation, Inc.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 2, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING.  If not, write to
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
;;
;; The K6 architecture is quite similar to PPro.  An important difference is
;; that there are only two decoders and they seem to be much slower than
;; any of the execution units.  So we have to pay much more attention to
;; proper scheduling for the decoders.
;; FIXME: We don't do that right now.  A good start would be to sort the
;;        instructions based on length.
;;
;; This description is based on data from the following documents:
;;
;;    "AMD-K6 Processor Data Sheet (Preliminary information)"
;;    Advanced Micro Devices, Inc., 1998.
;;
;;    "AMD-K6 Processor Code Optimization Application Note"
;;    Advanced Micro Devices, Inc., 2000.
;;
;; CPU execution units of the K6:
;;
;; store   describes the Store unit.  This unit is not modelled
;;         completely and it is only used to model lea operation.
;;         Otherwise it lies outside of any critical path.
;; load    describes the Load unit
;; alux    describes the Integer X unit
;; mm      describes the Multimedia unit, which shares a pipe
;;         with the Integer X unit.  This unit is used for MMX,
;;         which is not implemented for K6.
;; aluy    describes the Integer Y unit
;; fpu     describes the FPU unit
;; branch  describes the Branch unit
;;
;; The fp unit is not pipelined, and it can only do one operation per two
;; cycles, including fxch.
;;
;; Generally this is a very poor description, but at least no worse than
;; the old description, and a lot easier to extend to something more
;; reasonable if anyone still cares enough about this architecture in 2004.
;;
;; ??? fxch isn't handled; not an issue until sched3 after reg-stack is real.

;; One automaton per group of units declared below: the decoders, the load
;; unit, the store unit, the two integer units, the FPU and the branch unit.
(define_automaton "k6_decoder,k6_load_unit,k6_store_unit,k6_integer_units,k6_fpu_unit,k6_branch_unit")

;; The K6 instruction decoding begins before the on-chip instruction cache is
;; filled.  Depending on the length of the instruction, two simple instructions
;; can be decoded in two parallel short decoders, or one complex instruction can
;; be decoded in either the long or the vector decoder.  For all practical
;; purposes, the long and vector decoder can be modelled as one decoder.

;; The two short decoders and the single combined long/vector decoder.
(define_cpu_unit "k6_decode_short0" "k6_decoder")
(define_cpu_unit "k6_decode_short1" "k6_decoder")
(define_cpu_unit "k6_decode_long" "k6_decoder")

;; The long decoder cannot be reserved together with either short decoder.
(exclusion_set "k6_decode_long" "k6_decode_short0,k6_decode_short1")

;; Convenience names used by the insn reservations: "short" means either
;; short decoder, "vector" is an alias for the long decoder.
(define_reservation "k6_decode_short" "k6_decode_short0|k6_decode_short1")
(define_reservation "k6_decode_vector" "k6_decode_long")

;; Execution units.  Integer X and Integer Y share one automaton; every
;; other unit gets an automaton of its own.
(define_cpu_unit "k6_store" "k6_store_unit")
(define_cpu_unit "k6_load" "k6_load_unit")
(define_cpu_unit "k6_alux,k6_aluy" "k6_integer_units")
(define_cpu_unit "k6_fpu" "k6_fpu_unit")
(define_cpu_unit "k6_branch" "k6_branch_unit")

;; Shift instructions and certain arithmetic are issued only on Integer X.
;; Integer X only, register operands: short decode, then Integer X.
(define_insn_reservation "k6_alux_only" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux")

;; Same insn types with a memory source: the load unit is reserved between
;; decode and Integer X.
(define_insn_reservation "k6_alux_only_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux")

;; Same insn types with memory attribute store, both or unknown: long decode,
;; then load, Integer X and store in sequence.
(define_insn_reservation "k6_alux_only_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "ishift,ishift1,rotate,rotate1,alu1,negnot,cld")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux,k6_store")

;; Integer divide and multiply can only be issued on Integer X, too.

;; Register-only multiply.  The explicit memory "none" test keeps this
;; condition disjoint from k6_alu_imul_load and k6_alu_imul_store below;
;; without it this reservation also matches multiplies with memory operands
;; and, being listed first, would be chosen ahead of the memory variants.
(define_insn_reservation "k6_alu_imul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*3")

;; Multiply with a memory source: load unit first, then Integer X.
(define_insn_reservation "k6_alu_imul_load" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "load")))
  "k6_decode_vector,k6_load,k6_alux*3")

;; Multiply with memory attribute store, both or unknown: additionally
;; reserves the store unit after Integer X.
(define_insn_reservation "k6_alu_imul_store" 4
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imul")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_vector,k6_load,k6_alux*3,k6_store")

;; ??? Guessed latencies based on the old pipeline description.

;; Register-only divide: Integer X is held for 17 cycles.
(define_insn_reservation "k6_alu_idiv" 17
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_alux*17")

;; Divide with any memory attribute other than "none": the load unit is
;; reserved before Integer X.
(define_insn_reservation "k6_alu_idiv_mem" 19
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "idiv")
            (eq_attr "memory" "!none")))
  "k6_decode_vector,k6_load,k6_alux*17")

;; Basic word and doubleword ALU ops can be issued on both Integer units.
;; Register-only ALU op: short decode, then whichever integer unit is free.
;; NOTE(review): "alu1" and "negnot" are also listed by the Integer-X-only
;; reservations -- confirm which set is meant to claim them.
(define_insn_reservation "k6_alu" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_alux|k6_aluy")

;; ALU op with a memory source: the load unit is reserved before the
;; integer unit.
(define_insn_reservation "k6_alu_load" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load,k6_alux|k6_aluy")

;; ALU op with memory attribute store, both or unknown: long decode, then
;; load, an integer unit, and finally the store unit.
(define_insn_reservation "k6_alu_store" 3
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "alu,alu1,negnot,icmp,test,imovx,incdec,setcc")
            (eq_attr "memory" "store,both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy,k6_store")

;; A "load immediate" operation does not require execution at all,
;; it is available immediately after decoding.  Special-case this.

;; Move without memory access whose operand 1 is a nonimmediate operand.
(define_insn_reservation "k6_alu_imov" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "nonimmediate_operand"))))
  "k6_decode_short,k6_alux|k6_aluy")

;; Move of an immediate: only a short decoder is reserved, latency 0.
(define_insn_reservation "k6_alu_imov_imm" 0
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (and (eq_attr "memory" "none")
                 (match_operand 1 "immediate_operand"))))
  "k6_decode_short")

;; Move from memory: short decode, then the load unit.
(define_insn_reservation "k6_alu_imov_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "load")))
  "k6_decode_short,k6_load")

;; Move to memory: short decode, then the store unit.
(define_insn_reservation "k6_alu_imov_store" 1
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store")

;; Move with memory attribute both or unknown: long decode, load, then an
;; integer unit.
(define_insn_reservation "k6_alu_imov_both" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "imov")
            (eq_attr "memory" "both,unknown")))
  "k6_decode_long,k6_load,k6_alux|k6_aluy")

;; The branch unit.
;; Calls go through the long/vector decoder before the branch unit.
(define_insn_reservation "k6_branch_call" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "call,callv"))
  "k6_decode_vector,k6_branch")

;; Insns of type "ibr" use a short decoder before the branch unit.
(define_insn_reservation "k6_branch_branch" 1
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "ibr"))
  "k6_decode_short,k6_branch")

;; The load and (presumably) store units have two pipeline stages.  The
;; load latency is two cycles.

;; Matches pop, and also any insn whose memory attribute is load or both,
;; whatever its type.
;; NOTE(review): the untyped memory test overlaps other reservations that
;; test for "load"/"both" -- presumably a deliberate catch-all; confirm.
(define_insn_reservation "k6_load_pop" 3
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "pop")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load")

;; leave: long decode, the load unit, then two cycles on an integer unit.
(define_insn_reservation "k6_load_leave" 5
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "leave"))
  "k6_decode_long,k6_load,(k6_alux|k6_aluy)*2")

;; ??? From the old pipeline description.  Egad!
;; ??? Apparently we take care of this reservation in adjust_cost.
;; String insns that load: the load unit is held for 10 cycles.
(define_insn_reservation "k6_load_str" 10
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "str")
            (eq_attr "memory" "load,both")))
  "k6_decode_vector,k6_load*10")

;; The store unit handles lea and push.  It is otherwise unmodelled.

;; lea: short decode, the store unit, then either integer unit.
(define_insn_reservation "k6_store_lea" 2
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "lea"))
  "k6_decode_short,k6_store,k6_alux|k6_aluy")

;; Matches push, and also any insn whose memory attribute is store or both,
;; whatever its type.
;; NOTE(review): same overlap caveat as k6_load_pop -- confirm intent.
(define_insn_reservation "k6_store_push" 2
  (and (eq_attr "cpu" "k6")
       (ior (eq_attr "type" "push")
            (eq_attr "memory" "store,both")))
  "k6_decode_short,k6_store")

;; String insns with no memory test: no decoder is reserved here, only the
;; store unit for 10 cycles.
(define_insn_reservation "k6_store_str" 10
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "str"))
  "k6_store*10")

;; Most FPU instructions have latency 2 and throughput 2.
;; Register-only FP op, move, compare or fistp: long/vector decode, then the
;; FPU for two cycles.
(define_insn_reservation "k6_fpu" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "none")))
  "k6_decode_vector,k6_fpu*2")

;; Same insn types with memory attribute load or both: short decode, the
;; load unit, then the FPU for two cycles.
(define_insn_reservation "k6_fpu_load" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")

;; Same insn types with a memory destination: short decode, the store unit,
;; then the FPU for two cycles.
(define_insn_reservation "k6_fpu_store" 6
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fop,fmov,fcmp,fistp")
            (eq_attr "memory" "store")))
  "k6_decode_short,k6_store,k6_fpu*2")

;; Register-only FP multiply.
(define_insn_reservation "k6_fpu_fmul" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "none")))
  "k6_decode_short,k6_fpu*2")

;; FP multiply with memory attribute load or both.
;; NOTE(review): the latency equals the register-only form even though the
;; load unit is reserved -- confirm this is intended.
(define_insn_reservation "k6_fpu_fmul_load" 2
  (and (eq_attr "cpu" "k6")
       (and (eq_attr "type" "fmul")
            (eq_attr "memory" "load,both")))
  "k6_decode_short,k6_load,k6_fpu*2")

;; ??? Guessed latencies from the old pipeline description.
;; fdiv and fpspc, with no memory test: the FPU is held for 56 cycles.
(define_insn_reservation "k6_fpu_expensive" 56
  (and (eq_attr "cpu" "k6")
       (eq_attr "type" "fdiv,fpspc"))
  "k6_decode_short,k6_fpu*56")