;; ARM Cortex-R4 scheduling description.
;; Copyright (C) 2007-2015 Free Software Foundation, Inc.
;; Contributed by CodeSourcery.

;; This file is part of GCC.

;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published
;; by the Free Software Foundation; either version 3, or (at your
;; option) any later version.

;; GCC is distributed in the hope that it will be useful, but WITHOUT
;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
;; or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
;; License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3.  If not see
;; <http://www.gnu.org/licenses/>.

;; The single automaton that every issue unit in this file belongs to.
(define_automaton "cortex_r4")

;; We approximate the dual-issue constraints of this core using four
;; "issue units" and a reservation matrix as follows.  The numbers indicate
;; the instruction groups' preferences in order.  Multiple entries for
;; the same numbered preference indicate units that must be reserved
;; together.
;;
;; Issue unit:		A	B	C	ALU
;;
;; ALU w/o reg shift	1st	2nd		1st and 2nd
;; ALU w/ reg shift	1st	2nd	2nd	1st and 2nd
;; Moves		1st	2nd		2nd
;; Multiplication	1st			1st
;; Division		1st			1st
;; Load/store single	1st		1st
;; Other load/store	1st	1st
;; Branches			1st

;; The four issue units of the matrix above (columns A, B, C and ALU),
;; all members of the "cortex_r4" automaton.
(define_cpu_unit "cortex_r4_issue_a" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_b" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_c" "cortex_r4")
(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4")

;; ALU operation without a register-controlled shift: reserve issue unit
;; A together with the ALU unit, or, as the second alternative, issue
;; unit B together with the ALU unit.
(define_reservation "cortex_r4_alu"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_alu)")
;; ALU operation with a register-controlled shift: reserve A and ALU, or,
;; as the second alternative, B, C and ALU all together.
(define_reservation "cortex_r4_alu_shift_reg"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
                     (cortex_r4_issue_b+cortex_r4_issue_c+\
                      cortex_r4_issue_alu)")
;; Move: issue unit A on its own, or, as the second alternative, issue
;; unit B together with the ALU unit.
(define_reservation "cortex_r4_mov"
                    "cortex_r4_issue_a|(cortex_r4_issue_b+\
                     cortex_r4_issue_alu)")
;; Multiplication: A and ALU reserved together, for one cycle
;; ("cortex_r4_mul") or for two consecutive cycles ("cortex_r4_mul_2").
(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_mul_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_alu)*2")
;; Division instructions execute out-of-order with respect to the
;; rest of the pipeline and only require reservations on their first and
;; final cycles.
;;
;; Each pattern reserves A and ALU on the first cycle, reserves nothing
;; for 7 (div_9) or 8 (div_10) cycles, then reserves A and ALU again on
;; the last cycle, giving 9 and 10 cycles in total.
(define_reservation "cortex_r4_div_9"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*7,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
(define_reservation "cortex_r4_div_10"
                    "cortex_r4_issue_a+cortex_r4_issue_alu,\
                     nothing*8,\
                     cortex_r4_issue_a+cortex_r4_issue_alu")
;; Loads and stores.  "cortex_r4_load_store" reserves A and C together for
;; one cycle; "cortex_r4_load_store_2" reserves A and B together for two
;; consecutive cycles.
(define_reservation "cortex_r4_load_store"
                    "cortex_r4_issue_a+cortex_r4_issue_c")
(define_reservation "cortex_r4_load_store_2"
                    "(cortex_r4_issue_a+cortex_r4_issue_b)*2")
;; Branches only need issue unit B.
(define_reservation "cortex_r4_branch" "cortex_r4_issue_b")

;; We assume that all instructions are unconditional.

;; Data processing instructions.  Moves without shifts are kept separate
;; for the purposes of the dual-issue constraints above.
;;
;; Default result latency is 2 cycles, issued through the "cortex_r4_alu"
;; unit reservation.  Only active when tuning for Cortex-R4.
(define_insn_reservation "cortex_r4_alu" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_imm,alus_imm,logic_imm,logics_imm,\
                        alu_sreg,alus_sreg,logic_reg,logics_reg,\
                        adc_imm,adcs_imm,adc_reg,adcs_reg,\
                        adr,bfm,clz,rbit,rev,\
                        shift_imm,shift_reg,mvn_imm,mvn_reg"))
  "cortex_r4_alu")

;; Immediate and register moves: default latency 2, issued through the
;; "cortex_r4_mov" unit reservation.
(define_insn_reservation "cortex_r4_mov" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mov_imm,mov_reg"))
  "cortex_r4_mov")

;; The *_shift_imm ALU and logic types, plus extend, mov_shift and
;; mvn_shift.  Default latency 2.  These share the "cortex_r4_alu" unit
;; reservation with the unshifted ALU instructions; they are a separate
;; insn reservation so that bypasses can name them as a distinct consumer.
(define_insn_reservation "cortex_r4_alu_shift" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_shift_imm,alus_shift_imm,\
                        logic_shift_imm,logics_shift_imm,\
                        extend,mov_shift,mvn_shift"))
  "cortex_r4_alu")

;; The *_shift_reg ALU, logic, mov and mvn types.  The mrs, multiple and
;; no_insn types are also scheduled with this pattern.  Default latency 2,
;; issued through the "cortex_r4_alu_shift_reg" unit reservation.
(define_insn_reservation "cortex_r4_alu_shift_reg" 2
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "alu_shift_reg,alus_shift_reg,\
                       logic_shift_reg,logics_shift_reg,\
                       mov_shift_reg,mvn_shift_reg,\
                       mrs,multiple,no_insn"))
  "cortex_r4_alu_shift_reg")

;; An ALU instruction followed by an ALU instruction with no early dep.
;;
;; Latency falls from the default 2 to 1.  For the shifted consumers the
;; bypass applies only when the named guard function returns nonzero;
;; the guard functions are not defined in this file.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; In terms of availabilities, a consumer mov could theoretically be
;; issued together with a producer ALU instruction, without stalls.
;; In practice this cannot happen because mov;add (in that order) is not
;; eligible for dual issue and furthermore dual issue is not permitted
;; when a dependency is involved.  We therefore note it as latency one.
;; A mov followed by another of the same is also latency one.
;;
;; This bypass is unguarded: it applies to every dependency on a mov.
(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
                  cortex_r4_mov"
               "cortex_r4_mov")

;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are
;; media data processing instructions nor sad instructions.

;; Multiplication instructions.

;; mul and smmul: default latency 4; A and ALU are held for two cycles.
(define_insn_reservation "cortex_r4_mul_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mul,smmul"))
  "cortex_r4_mul_2")

;; smulxy, smulwy, smuad and smusd: default latency 3; A and ALU are held
;; for one cycle.
(define_insn_reservation "cortex_r4_mul_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smulxy,smulwy,smuad,smusd"))
  "cortex_r4_mul")

;; mla and smmla: default latency 4; A and ALU are held for two cycles.
(define_insn_reservation "cortex_r4_mla_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "mla,smmla"))
  "cortex_r4_mul_2")

;; smlaxy, smlawy, smlad and smlsd: default latency 3; A and ALU are held
;; for one cycle.
(define_insn_reservation "cortex_r4_mla_3" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smlaxy,smlawy,smlad,smlsd"))
  "cortex_r4_mul")

;; smlald and smlsld: default latency 3; A and ALU are held for one cycle.
;; Kept apart from "cortex_r4_mla_3" so that the accumulator-forwarding
;; bypasses below do not list it as a producer or consumer.
(define_insn_reservation "cortex_r4_smlald" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smlald,smlsld"))
  "cortex_r4_mul")

;; smull, umull, umlal and umaal: default latency 4; A and ALU are held
;; for two cycles.
(define_insn_reservation "cortex_r4_mull" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "smull,umull,umlal,umaal"))
  "cortex_r4_mul_2")

;; A multiply or an MLA with a single-register result, followed by an
;; MLA with an accumulator dependency, has its result forwarded.
;;
;; The forwarded latency is one below the producer's default: 2 for the
;; 3-cycle producers and 3 for the 4-cycle producers.  Both bypasses are
;; guarded by arm_mac_accumulator_is_mul_result (not defined in this file).
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4"
               "cortex_r4_mla_3,cortex_r4_mla_4"
               "arm_mac_accumulator_is_mul_result")

;; A multiply followed by an ALU instruction needing the multiply
;; result only at ALU has lower latency than one needing it at Shift.
;;
;; The first three bypasses give latency 2 for the 3-cycle producers; the
;; last three give latency 3 for the 4-cycle producers.  Shifted consumers
;; qualify only when their guard function returns nonzero.
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; A multiply followed by a mov has one cycle lower latency again:
;; 1 for the 3-cycle producers, 2 for the 4-cycle producers.
(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_mov")
(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_mov")

;; We guess that division of A/B using sdiv or udiv, on average,
;; is performed with B having ten more leading zeros than A.
;; This gives a latency of nine for udiv and ten for sdiv.
;;
;; udiv: default latency 9, using the 9-cycle "cortex_r4_div_9" pattern.
(define_insn_reservation "cortex_r4_udiv" 9
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "udiv"))
  "cortex_r4_div_9")

;; sdiv: default latency 10, using the 10-cycle "cortex_r4_div_10" pattern.
(define_insn_reservation "cortex_r4_sdiv" 10
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "sdiv"))
  "cortex_r4_div_10")

;; Branches.  We assume correct prediction.
;;
;; Default latency 0; only issue unit B is reserved.
(define_insn_reservation "cortex_r4_branch" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "branch"))
  "cortex_r4_branch")

;; Call latencies are not predictable.  A semi-arbitrary very large
;; number is used as "positive infinity" so that everything should be
;; finished by the time of return.
;;
;; The reservation is "nothing": a call occupies no issue unit in this
;; model.
(define_insn_reservation "cortex_r4_call" 32
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "call"))
  "nothing")

;; Status register access instructions are not currently emitted.

;; Load instructions.
;; We do not model the "addr_md_3cycle" cases and assume that
;; accesses following are correctly aligned.

;; load1 and load2: default latency 3; A and C are reserved for one cycle.
(define_insn_reservation "cortex_r4_load_1_2" 3
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load1,load2"))
  "cortex_r4_load_store")

;; load3 and load4: default latency 4; A and B are reserved for two
;; cycles.
(define_insn_reservation "cortex_r4_load_3_4" 4
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "load3,load4"))
  "cortex_r4_load_store_2")

;; If a producing load is followed by an instruction consuming only
;; as a Normal Reg, there is one fewer cycle of latency.
;;
;; That is 2 instead of 3 for load1/load2 and 3 instead of 4 for
;; load3/load4.  Shifted consumers qualify only when their guard function
;; returns nonzero.

(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 2 "cortex_r4_load_1_2"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift"
               "arm_no_early_alu_shift_dep")
(define_bypass 3 "cortex_r4_load_3_4"
               "cortex_r4_alu_shift_reg"
               "arm_no_early_alu_shift_value_dep")

;; If a producing load is followed by an instruction consuming only
;; as a Late Reg, there are two fewer cycles of latency.  Such consumer
;; instructions are moves and stores.
;;
;; That is 1 instead of 3 for load1/load2 and 2 instead of 4 for
;; load3/load4.  Both bypasses are unguarded.

(define_bypass 1 "cortex_r4_load_1_2"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
(define_bypass 2 "cortex_r4_load_3_4"
               "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")

;; If a producer's result is required as the base or offset of a load,
;; there is an extra cycle latency.
;;
;; Each bypass is one above the producer's default latency: 3 for ALU and
;; mov producers, 4 for the 3-cycle multiplies, 5 for the 4-cycle
;; multiplies.  These bypasses are unguarded, so they apply to every
;; dependency of a load on one of the listed producers.

(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\
                  cortex_r4_alu_shift_reg"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
               "cortex_r4_load_1_2,cortex_r4_load_3_4")

;; Store instructions.

;; store1 and store2: default latency 0; A and C are reserved for one
;; cycle, the same unit pattern as load1/load2.
(define_insn_reservation "cortex_r4_store_1_2" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store1,store2"))
  "cortex_r4_load_store")

;; store3 and store4: default latency 0; A and B are reserved for two
;; cycles, the same unit pattern as load3/load4.
(define_insn_reservation "cortex_r4_store_3_4" 0
  (and (eq_attr "tune_cortexr4" "yes")
       (eq_attr "type" "store3,store4"))
  "cortex_r4_load_store_2")
