1169689Skan;; ARM 1136J[F]-S Pipeline Description
2169689Skan;; Copyright (C) 2003 Free Software Foundation, Inc.
3169689Skan;; Written by CodeSourcery, LLC.
4169689Skan;;
5169689Skan;; This file is part of GCC.
6169689Skan;;
7169689Skan;; GCC is free software; you can redistribute it and/or modify it
8169689Skan;; under the terms of the GNU General Public License as published by
9169689Skan;; the Free Software Foundation; either version 2, or (at your option)
10169689Skan;; any later version.
11169689Skan;;
12169689Skan;; GCC is distributed in the hope that it will be useful, but
13169689Skan;; WITHOUT ANY WARRANTY; without even the implied warranty of
14169689Skan;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15169689Skan;; General Public License for more details.
16169689Skan;;
17169689Skan;; You should have received a copy of the GNU General Public License
18169689Skan;; along with GCC; see the file COPYING.  If not, write to the Free
19169689Skan;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
20169689Skan;; 02110-1301, USA.
21169689Skan
22169689Skan;; These descriptions are based on the information contained in the
23169689Skan;; ARM1136JF-S Technical Reference Manual, Copyright (c) 2003 ARM
24169689Skan;; Limited.
25169689Skan;;
26169689Skan
27169689Skan;; This automaton provides a pipeline description for the ARM
28169689Skan;; 1136J-S and 1136JF-S cores.
29169689Skan;;
30169689Skan;; The model given here assumes that the condition for all conditional
31169689Skan;; instructions is "true", i.e., that all of the instructions are
32169689Skan;; actually executed.
33169689Skan
;; Declare the automaton named "arm1136jfs".  The define_cpu_unit forms
;; in this description attach their units to it by passing the same name.
34169689Skan(define_automaton "arm1136jfs")
35169689Skan
36169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
37169689Skan;; Pipelines
38169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
39169689Skan
40169689Skan;; There are four pipelines in total (page 1-26 and following):
41169689Skan;;
42169689Skan;; - A 4-stage decode pipeline, shared by the other three.  It has fetch (1),
43169689Skan;;   fetch (2), decode, and issue stages.  Since this is always involved,
44169689Skan;;   we do not model it in the scheduler.
45169689Skan;;
46169689Skan;; - A 4-stage ALU pipeline.  It has shifter, ALU (main integer operations),
47169689Skan;;   and saturation stages.  The fourth stage is writeback; see below.
48169689Skan;;
49169689Skan;; - A 4-stage multiply-accumulate pipeline.  It has three stages, called
50169689Skan;;   MAC1 through MAC3, and a fourth writeback stage.
51169689Skan;;
52169689Skan;;   The 4th-stage writeback is shared between the ALU and MAC pipelines,
53169689Skan;;   which operate in lockstep.  Results from either pipeline will be
54169689Skan;;   moved into the writeback stage.  Because the two pipelines operate
55169689Skan;;   in lockstep, we schedule them as a single "execute" pipeline.
56169689Skan;;
57169689Skan;; - A 4-stage LSU pipeline.  It has address generation, data cache (1),
58169689Skan;;   data cache (2), and writeback stages.  (Note that this pipeline,
59169689Skan;;   including the writeback stage, is independent from the ALU & MAC pipes.)
60169689Skan
;; Functional units reserved by the insn reservations in this description,
;; all attached to the "arm1136jfs" automaton.  The e_* units stand for the
;; combined ALU/MAC "execute" pipeline and the l_* units for the load/store
;; pipeline; each pipeline has its own writeback unit (e_wb and l_wb).
61169689Skan(define_cpu_unit "e_1,e_2,e_3,e_wb" "arm1136jfs")     ; ALU and MAC
62169689Skan; e_1 = Sh/Mac1, e_2 = ALU/Mac2, e_3 = SAT/Mac3
63169689Skan(define_cpu_unit "l_a,l_dc1,l_dc2,l_wb" "arm1136jfs") ; Load/Store
;; NOTE(review): l_a, l_dc1 and l_dc2 presumably correspond to the address
;; generation and data cache (1)/(2) stages of the LSU pipeline -- confirm.
64169689Skan
65169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
66169689Skan;; ALU Instructions
67169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
68169689Skan
69169689Skan;; ALU instructions require four cycles to execute, and use the execute
70169689Skan;; pipeline in each of its four stages.  The results are available
71169689Skan;; after the alu stage has finished.
72169689Skan;;
73169689Skan;; If the destination register is the PC, the pipelines are stalled
74169689Skan;; for several cycles.  That case is not modelled here.
75169689Skan
76169689Skan;; ALU operations with no shifted operand
;; Matches insns of type "alu" when tuning for arm1136js or arm1136jfs.
;; Default latency is 2.  The reservation steps through e_1, e_2, e_3 and
;; e_wb, one unit per cycle.
77169689Skan(define_insn_reservation "11_alu_op" 2
78169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
79169689Skan      (eq_attr "type" "alu"))
80169689Skan "e_1,e_2,e_3,e_wb")
81169689Skan
82169689Skan;; ALU operations with a shift-by-constant operand
;; Matches insns of type "alu_shift".  Latency and unit reservation are the
;; same as for 11_alu_op; the separate reservation name lets the bypasses
;; in this file attach a different guard when this insn is the consumer.
83169689Skan(define_insn_reservation "11_alu_shift_op" 2
84169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
85169689Skan      (eq_attr "type" "alu_shift"))
86169689Skan "e_1,e_2,e_3,e_wb")
87169689Skan
88169689Skan;; ALU operations with a shift-by-register operand
89169689Skan;; These really stall in the decoder, in order to read
90169689Skan;; the shift value in a second cycle. Pretend we take two cycles in
91169689Skan;; the shift stage.
;; Matches insns of type "alu_shift_reg".  "e_1*2" keeps e_1 reserved for
;; two consecutive cycles, and the default latency is 3 -- one more than
;; the other ALU reservations.
92169689Skan(define_insn_reservation "11_alu_shift_reg_op" 3
93169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
94169689Skan      (eq_attr "type" "alu_shift_reg"))
95169689Skan "e_1*2,e_2,e_3,e_wb")
96169689Skan
97169689Skan;; alu_ops can start sooner, if there is no shifter dependency
;; Each define_bypass gives (latency, producer reservations, consumer
;; reservations, optional guard).  Results of 11_alu_op/11_alu_shift_op are
;; forwarded with latency 1 (default 2), and results of 11_alu_shift_reg_op
;; with latency 2 (default 3).  A plain 11_alu_op consumer is unguarded;
;; shifting consumers are guarded by arm_no_early_alu_shift_value_dep or
;; arm_no_early_alu_shift_dep.  Those guards are referenced by name only;
;; their definitions are not in this file.
98169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
99169689Skan	       "11_alu_op")
100169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
101169689Skan	       "11_alu_shift_op"
102169689Skan	       "arm_no_early_alu_shift_value_dep")
103169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
104169689Skan	       "11_alu_shift_reg_op"
105169689Skan	       "arm_no_early_alu_shift_dep")
106169689Skan(define_bypass 2 "11_alu_shift_reg_op"
107169689Skan	       "11_alu_op")
108169689Skan(define_bypass 2 "11_alu_shift_reg_op"
109169689Skan	       "11_alu_shift_op"
110169689Skan	       "arm_no_early_alu_shift_value_dep")
111169689Skan(define_bypass 2 "11_alu_shift_reg_op"
112169689Skan	       "11_alu_shift_reg_op"
113169689Skan	       "arm_no_early_alu_shift_dep")
114169689Skan
;; ALU results feeding any of the multiply reservations (11_mult1 through
;; 11_mult7) use the same reduced latencies as ALU-to-ALU forwarding:
;; 1 after 11_alu_op/11_alu_shift_op and 2 after 11_alu_shift_reg_op.
;; Both bypasses are guarded by arm_no_early_mul_dep, which is defined
;; outside this file.
115169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
116169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
117169689Skan	       "arm_no_early_mul_dep")
118169689Skan(define_bypass 2 "11_alu_shift_reg_op"
119169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
120169689Skan	       "arm_no_early_mul_dep")
121169689Skan
122169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
123169689Skan;; Multiplication Instructions
124169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
125169689Skan
126169689Skan;; Multiplication instructions loop in the first two execute stages until
127169689Skan;; the instruction has been passed through the multiplier array enough
128169689Skan;; times.
129169689Skan
130169689Skan;; Multiply and multiply-accumulate results are available after four stages.
;; Matches insns whose "insn" attribute is mul or mla.  Default latency 4;
;; "e_1*2" holds the first execute stage for two cycles before the insn
;; moves on through e_2, e_3 and e_wb.
131169689Skan(define_insn_reservation "11_mult1" 4
132169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
133169689Skan      (eq_attr "insn" "mul,mla"))
134169689Skan "e_1*2,e_2,e_3,e_wb")
135169689Skan
136169689Skan;; The *S variants set the condition flags, which requires three more cycles.
;; Matches insns whose "insn" attribute is muls or mlas.
;; NOTE(review): the latency (4) and unit reservation below are identical
;; to 11_mult1, so the extra cycles mentioned above are not reflected in
;; this model -- confirm whether that is intentional.
137169689Skan(define_insn_reservation "11_mult2" 4
138169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
139169689Skan      (eq_attr "insn" "muls,mlas"))
140169689Skan "e_1*2,e_2,e_3,e_wb")
141169689Skan
;; Forwarding from 11_mult1/11_mult2: latency 3 instead of the default 4
;; when the consumer is a multiply, an ALU operation or 11_store1.  All
;; but the plain 11_alu_op consumer are guarded by an arm_no_early_*
;; predicate defined outside this file.
142169689Skan(define_bypass 3 "11_mult1,11_mult2"
143169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
144169689Skan	       "arm_no_early_mul_dep")
145169689Skan(define_bypass 3 "11_mult1,11_mult2"
146169689Skan	       "11_alu_op")
147169689Skan(define_bypass 3 "11_mult1,11_mult2"
148169689Skan	       "11_alu_shift_op"
149169689Skan	       "arm_no_early_alu_shift_value_dep")
150169689Skan(define_bypass 3 "11_mult1,11_mult2"
151169689Skan	       "11_alu_shift_reg_op"
152169689Skan	       "arm_no_early_alu_shift_dep")
153169689Skan(define_bypass 3 "11_mult1,11_mult2"
154169689Skan	       "11_store1"
155169689Skan	       "arm_no_early_store_addr_dep")
156169689Skan
157169689Skan;; Signed and unsigned multiply long results are available across two cycles;
158169689Skan;; the less significant word is available one cycle before the more significant
159169689Skan;; word.  Here we conservatively wait until both are available, which is
160169689Skan;; after three iterations and the memory cycle.  The same is also true of
161169689Skan;; the two multiply-accumulate instructions.
;; Matches smull, umull, smlal and umlal.  Default latency 5; e_1 is held
;; for three cycles ("e_1*3") and e_wb for two ("e_wb*2").
162169689Skan(define_insn_reservation "11_mult3" 5
163169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
164169689Skan      (eq_attr "insn" "smull,umull,smlal,umlal"))
165169689Skan "e_1*3,e_2,e_3,e_wb*2")
166169689Skan
167169689Skan;; The *S variants set the condition flags, which requires three more cycles.
;; Matches smulls, umulls, smlals and umlals.
;; NOTE(review): the latency (5) and unit reservation below are identical
;; to 11_mult3, so the extra cycles mentioned above are not reflected in
;; this model -- confirm whether that is intentional.
168169689Skan(define_insn_reservation "11_mult4" 5
169169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
170169689Skan      (eq_attr "insn" "smulls,umulls,smlals,umlals"))
171169689Skan "e_1*3,e_2,e_3,e_wb*2")
172169689Skan
;; Forwarding from 11_mult3/11_mult4: latency 4 instead of the default 5
;; when the consumer is a multiply, an ALU operation or 11_store1.  All
;; but the plain 11_alu_op consumer are guarded by an arm_no_early_*
;; predicate defined outside this file.
173169689Skan(define_bypass 4 "11_mult3,11_mult4"
174169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
175169689Skan	       "arm_no_early_mul_dep")
176169689Skan(define_bypass 4 "11_mult3,11_mult4"
177169689Skan	       "11_alu_op")
178169689Skan(define_bypass 4 "11_mult3,11_mult4"
179169689Skan	       "11_alu_shift_op"
180169689Skan	       "arm_no_early_alu_shift_value_dep")
181169689Skan(define_bypass 4 "11_mult3,11_mult4"
182169689Skan	       "11_alu_shift_reg_op"
183169689Skan	       "arm_no_early_alu_shift_dep")
184169689Skan(define_bypass 4 "11_mult3,11_mult4"
185169689Skan	       "11_store1"
186169689Skan	       "arm_no_early_store_addr_dep")
187169689Skan
188169689Skan;; Various 16x16->32 multiplies and multiply-accumulates, using combinations
189169689Skan;; of high and low halves of the argument registers.  They take a single
190169689Skan;; pass through the pipeline and make the result available after three
191169689Skan;; cycles.
;; Default latency 3.  No unit is held for more than one cycle: the
;; reservation is the plain e_1, e_2, e_3, e_wb sequence.
192169689Skan(define_insn_reservation "11_mult5" 3
193169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
194169689Skan      (eq_attr "insn" "smulxy,smlaxy,smulwy,smlawy,smuad,smuadx,smlad,smladx,smusd,smusdx,smlsd,smlsdx"))
195169689Skan "e_1,e_2,e_3,e_wb")
196169689Skan
;; Forwarding from 11_mult5: latency 2 instead of the default 3 when the
;; consumer is a multiply, an ALU operation or 11_store1.  All but the
;; plain 11_alu_op consumer are guarded by an arm_no_early_* predicate
;; defined outside this file.
197169689Skan(define_bypass 2 "11_mult5"
198169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
199169689Skan	       "arm_no_early_mul_dep")
200169689Skan(define_bypass 2 "11_mult5"
201169689Skan	       "11_alu_op")
202169689Skan(define_bypass 2 "11_mult5"
203169689Skan	       "11_alu_shift_op"
204169689Skan	       "arm_no_early_alu_shift_value_dep")
205169689Skan(define_bypass 2 "11_mult5"
206169689Skan	       "11_alu_shift_reg_op"
207169689Skan	       "arm_no_early_alu_shift_dep")
208169689Skan(define_bypass 2 "11_mult5"
209169689Skan	       "11_store1"
210169689Skan	       "arm_no_early_store_addr_dep")
211169689Skan
212169689Skan;; The same idea, then the 32-bit result is added to a 64-bit quantity.
;; Matches smlalxy.  Default latency 4; e_1 is held for two cycles and
;; e_wb for two cycles.
213169689Skan(define_insn_reservation "11_mult6" 4
214169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
215169689Skan      (eq_attr "insn" "smlalxy"))
216169689Skan "e_1*2,e_2,e_3,e_wb*2")
217169689Skan
218169689Skan;; Signed 32x32 multiply, then the most significant 32 bits are extracted
219169689Skan;; and are available after the memory stage.
;; Matches smmul and smmulr.  Default latency 4, with the same unit
;; reservation as 11_mult1 (e_1 held for two cycles).
220169689Skan(define_insn_reservation "11_mult7" 4
221169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
222169689Skan      (eq_attr "insn" "smmul,smmulr"))
223169689Skan "e_1*2,e_2,e_3,e_wb")
224169689Skan
;; Forwarding from 11_mult6/11_mult7: latency 3 instead of the default 4
;; when the consumer is a multiply, an ALU operation or 11_store1.  All
;; but the plain 11_alu_op consumer are guarded by an arm_no_early_*
;; predicate defined outside this file.
225169689Skan(define_bypass 3 "11_mult6,11_mult7"
226169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
227169689Skan	       "arm_no_early_mul_dep")
228169689Skan(define_bypass 3 "11_mult6,11_mult7"
229169689Skan	       "11_alu_op")
230169689Skan(define_bypass 3 "11_mult6,11_mult7"
231169689Skan	       "11_alu_shift_op"
232169689Skan	       "arm_no_early_alu_shift_value_dep")
233169689Skan(define_bypass 3 "11_mult6,11_mult7"
234169689Skan	       "11_alu_shift_reg_op"
235169689Skan	       "arm_no_early_alu_shift_dep")
236169689Skan(define_bypass 3 "11_mult6,11_mult7"
237169689Skan	       "11_store1"
238169689Skan	       "arm_no_early_store_addr_dep")
239169689Skan
240169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
241169689Skan;; Branch Instructions
242169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
243169689Skan
244169689Skan;; These vary greatly depending on their arguments and the results of
245169689Skan;; branch prediction.  Cycle count ranges from zero (unconditional branch,
246169689Skan;; folded dynamic prediction) to seven (incorrect predictions, etc).  We
247169689Skan;; assume an optimal case for now, because the cost of a cache miss
248169689Skan;; overwhelms the cost of everything else anyhow.
249169689Skan
;; Matches insns of type "branch".  Latency 0 and the reservation
;; "nothing": a branch occupies none of the units modelled in this file.
250169689Skan(define_insn_reservation "11_branches" 0
251169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
252169689Skan      (eq_attr "type" "branch"))
253169689Skan "nothing")
254169689Skan
255169689Skan;; Call latencies are not predictable.  A semi-arbitrary very large
256169689Skan;; number is used as "positive infinity" so that everything should be
257169689Skan;; finished by the time of return.
;; Matches insns of type "call".  The "very large number" is the latency
;; of 32 below; like branches, calls reserve no modelled unit ("nothing").
258169689Skan(define_insn_reservation "11_call" 32
259169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
260169689Skan      (eq_attr "type" "call"))
261169689Skan "nothing")
262169689Skan
263169689Skan;; Branches are predicted. A correctly predicted branch will be no
264169689Skan;; cost, but we're conservative here, and use the timings a
265169689Skan;; late-register would give us.
;; Unguarded bypasses into 11_branches, each one cycle shorter than the
;; producer's default latency: 1 after 11_alu_op/11_alu_shift_op, 2 after
;; 11_alu_shift_reg_op, 2 after 11_load1/11_load2 and 3 after 11_load34.
;; No bypass is declared here for 11_loadb or for the multiply
;; reservations.
266169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
267169689Skan	       "11_branches")
268169689Skan(define_bypass 2 "11_alu_shift_reg_op"
269169689Skan	       "11_branches")
270169689Skan(define_bypass 2 "11_load1,11_load2"
271169689Skan	       "11_branches")
272169689Skan(define_bypass 3 "11_load34"
273169689Skan	       "11_branches")
274169689Skan
275169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
276169689Skan;; Load/Store Instructions
277169689Skan;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
278169689Skan
279169689Skan;; The models for load/store instructions do not accurately describe
280169689Skan;; the difference between operations with and without a base register writeback.
281169689Skan;; These models assume that all memory references hit in dcache.  Also,
282169689Skan;; if the PC is one of the registers involved, there are additional stalls
283169689Skan;; not modelled here.  Addressing modes are also not modelled.
284169689Skan
;; Single-register load (type "load1").  Default latency 3.  "l_a+e_1"
;; reserves the load/store address unit and the first execute unit in the
;; same cycle; the insn then moves through l_dc1, l_dc2 and l_wb.
285169689Skan(define_insn_reservation "11_load1" 3
286169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
287169689Skan      (eq_attr "type" "load1"))
288169689Skan "l_a+e_1,l_dc1,l_dc2,l_wb")
289169689Skan
290169689Skan;; Load byte results are not available until the writeback stage, where
291169689Skan;; the correct byte is extracted.
292169689Skan
;; Matches type "load_byte".  Same unit reservation as 11_load1, but the
;; default latency is 4 rather than 3.
293169689Skan(define_insn_reservation "11_loadb" 4
294169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
295169689Skan      (eq_attr "type" "load_byte"))
296169689Skan "l_a+e_1,l_dc1,l_dc2,l_wb")
297169689Skan
;; Single-register store (type "store1").  Latency 0; it reserves the same
;; units as 11_load1.  This is the only store reservation named as a
;; consumer in the define_bypass forms of this file.
298169689Skan(define_insn_reservation "11_store1" 0
299169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
300169689Skan      (eq_attr "type" "store1"))
301169689Skan "l_a+e_1,l_dc1,l_dc2,l_wb")
302169689Skan
303169689Skan;; Load/store double words into adjacent registers.  The timing and
304169689Skan;; latencies are different depending on whether the address is 64-bit
305169689Skan;; aligned.  This model assumes that it is.
;; Matches type "load2".  Latency (3) and unit reservation are identical
;; to 11_load1.
306169689Skan(define_insn_reservation "11_load2" 3
307169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
308169689Skan      (eq_attr "type" "load2"))
309169689Skan "l_a+e_1,l_dc1,l_dc2,l_wb")
310169689Skan
;; Two-register store (type "store2").  Latency 0, with the same unit
;; reservation as 11_store1.
311169689Skan(define_insn_reservation "11_store2" 0
312169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
313169689Skan      (eq_attr "type" "store2"))
314169689Skan "l_a+e_1,l_dc1,l_dc2,l_wb")
315169689Skan
316169689Skan;; Load/store multiple registers.  Two registers are stored per cycle.
317169689Skan;; Actual timing depends on how many registers are affected, so we
318169689Skan;; optimistically schedule a low latency.
;; Matches types "load3" and "load4".  Default latency 4; compared with
;; 11_load1, l_dc1 is held for two cycles ("l_dc1*2").
319169689Skan(define_insn_reservation "11_load34" 4
320169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
321169689Skan      (eq_attr "type" "load3,load4"))
322169689Skan "l_a+e_1,l_dc1*2,l_dc2,l_wb")
323169689Skan
;; Three- and four-register stores (types "store3" and "store4").
;; Latency 0, with the same unit reservation as 11_load34 (l_dc1 held for
;; two cycles).
324169689Skan(define_insn_reservation "11_store34" 0
325169689Skan (and (eq_attr "tune" "arm1136js,arm1136jfs")
326169689Skan      (eq_attr "type" "store3,store4"))
327169689Skan "l_a+e_1,l_dc1*2,l_dc2,l_wb")
328169689Skan
329169689Skan;; A store can start immediately after an alu op, if that alu op does
330169689Skan;; not provide part of the address to access.
;; Latency 1 after 11_alu_op/11_alu_shift_op and 2 after
;; 11_alu_shift_reg_op, guarded by arm_no_early_store_addr_dep (defined
;; outside this file).  Only 11_store1 is named as consumer; 11_store2 and
;; 11_store34 get no bypass here.
331169689Skan(define_bypass 1 "11_alu_op,11_alu_shift_op"
332169689Skan	       "11_store1"
333169689Skan	       "arm_no_early_store_addr_dep")
334169689Skan(define_bypass 2 "11_alu_shift_reg_op"
335169689Skan	       "11_store1"
336169689Skan	       "arm_no_early_store_addr_dep")
337169689Skan
338169689Skan;; An alu op can start sooner after a load, if that alu op does not
339169689Skan;; have an early register dependency on the load
;; Latency 2 after 11_load1 (default 3) and 3 after 11_loadb (default 4).
;; The plain 11_alu_op consumer is unguarded; the shifting consumers use
;; the arm_no_early_alu_shift_value_dep / arm_no_early_alu_shift_dep
;; guards, which are defined outside this file.  11_load2 and 11_load34
;; are not listed as producers in this group.
340169689Skan(define_bypass 2 "11_load1"
341169689Skan	       "11_alu_op")
342169689Skan(define_bypass 2 "11_load1"
343169689Skan	       "11_alu_shift_op"
344169689Skan	       "arm_no_early_alu_shift_value_dep")
345169689Skan(define_bypass 2 "11_load1"
346169689Skan	       "11_alu_shift_reg_op"
347169689Skan	       "arm_no_early_alu_shift_dep")
348169689Skan
349169689Skan(define_bypass 3 "11_loadb"
350169689Skan	       "11_alu_op")
351169689Skan(define_bypass 3 "11_loadb"
352169689Skan	       "11_alu_shift_op"
353169689Skan	       "arm_no_early_alu_shift_value_dep")
354169689Skan(define_bypass 3 "11_loadb"
355169689Skan	       "11_alu_shift_reg_op"
356169689Skan	       "arm_no_early_alu_shift_dep")
357169689Skan
358169689Skan;; A mul op can start sooner after a load, if that mul op does not
359169689Skan;; have an early multiply dependency
;; Latency 2 after 11_load1 and 3 after 11_load34 or 11_loadb, i.e. one
;; less than each producer's default latency.  Every bypass is guarded by
;; arm_no_early_mul_dep (defined outside this file).  11_load2 is not
;; listed as a producer in this group.
360169689Skan(define_bypass 2 "11_load1"
361169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
362169689Skan	       "arm_no_early_mul_dep")
363169689Skan(define_bypass 3 "11_load34"
364169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
365169689Skan	       "arm_no_early_mul_dep")
366169689Skan(define_bypass 3 "11_loadb"
367169689Skan	       "11_mult1,11_mult2,11_mult3,11_mult4,11_mult5,11_mult6,11_mult7"
368169689Skan	       "arm_no_early_mul_dep")
369169689Skan
370169689Skan;; A store can start sooner after a load, if that load does not
371169689Skan;; produce part of the address to access
;; Latency 2 after 11_load1 and 3 after 11_loadb, guarded by
;; arm_no_early_store_addr_dep (defined outside this file).  As with the
;; ALU-to-store bypasses, only 11_store1 is named as consumer.
372169689Skan(define_bypass 2 "11_load1"
373169689Skan	       "11_store1"
374169689Skan	       "arm_no_early_store_addr_dep")
375169689Skan(define_bypass 3 "11_loadb"
376169689Skan	       "11_store1"
377169689Skan	       "arm_no_early_store_addr_dep")
378