1169689Skan;; .........................
2169689Skan;;
3169689Skan;; DFA-based pipeline description for Sandcraft SR3 (MIPS64 based)
4169689Skan;;
5169689Skan;; The SR3 is described as:
6169689Skan;;     - nine-stage pipeline, insn buffering with out-of-order issue to
7169689Skan;;       multiple function units, with an average dispatch rate of 2
8169689Skan;;       insn.s per cycle (max 6 insns: 2 fpu, 4 cpu).
9169689Skan;;
10169689Skan;;  The details on this are scant except for a diagram in
11169689Skan;;  Chap. 6 of Rev. 1.0 SR3 Spec.
12169689Skan;;
13169689Skan;;  The model employed below is designed to closely approximate the
14169689Skan;;  published latencies. Emulation of out-of-order issue and the insn
15169689Skan;;  buffering is done via a VLIW dispatch style (with a packing of 6 insns);
16169689Skan;;  the function unit reservations restrictions (define_*_set) are
17169689Skan;;  contrived to support published timings.
18169689Skan;;
19169689Skan;; Reference:
20169689Skan;;   "SR3 Microprocessor Specification, System development information,"
21169689Skan;;   Revision 1.0, 13 December 2000.
22169689Skan;;
23169689Skan;;
24169689Skan;; Reservation model is based on:
25169689Skan;;   1) Figure 6-1, from the 1.0 specification.
26169689Skan;;   2) Chapter 19, from the 1.0 specification.
27169689Skan;;   3) following questions(Red Hat)/answers(Sandcraft):
28169689Skan;;     RH> From Section 19.1
29169689Skan;;     RH>      1) In terms of figure 6-1, are all the instructions in
30169689Skan;;     RH>         table 19-1 restricted
31169689Skan;;     RH>         to ALUx? When ALUx is not in use for an instruction in table 19-1 is
32169689Skan;;     RH>         it fully compatible with all insns that issue to ALUy?
33169689Skan;;
34169689Skan;;     Yes, all the instructions in Table 19-1 only go to ALUX, and all the
35169689Skan;;     instructions that can be issued to ALUY can also be issued to ALUX.
36169689Skan;;
37169689Skan;;
38169689Skan;;     RH> From Section 19.2
39169689Skan;;     RH>      2) Explain conditional moves execution path (in terms of
40169689Skan;;     RH>      figure 6-1)
41169689Skan;;
42169689Skan;;     Conditional move of integer registers (based on floating point condition
43169689Skan;;     codes or integer register value) go to ALUX or ALUY.
44169689Skan;;
45169689Skan;;     RH>      3) Explain floating point store execution path (in terms of
46169689Skan;;     RH>      figure 6-1)
47169689Skan;;
48169689Skan;;     Floating point stores go to Ld/St and go to MOV in the floating point
49169689Skan;;     pipeline.
50169689Skan;;
51169689Skan;;     Floating point loads go to Ld/St and go to LOAD in the floating point
52169689Skan;;     pipeline.
53169689Skan;;
54169689Skan;;     RH>      4) Explain branch on floating condition (in terms of figure 6-1)
55169689Skan;;     Branch on floating condition go to BRU.
56169689Skan;;
57169689Skan;;     RH>      5) Is the column for single RECIP instruction latency correct?
58169689Skan;;     RH>      What about for RSQRT single and double?
59169689Skan;;
60169689Skan;;     The latency/repeat for RECIP and RSQRT are correct.
61169689Skan;;
62169689Skan
63169689Skan;;
64169689Skan;; Use separate automata (six are declared below) to isolate long latency
65169689Skan;; operations, and to reduce the complexity of cpu+fpu, reducing space.
66169689Skan;;
;; Declare the automata that the units in this file are assigned to:
;;   sr71_cpu    - issue slots sr_iss0..sr_iss5
;;   sr71_cpu1   - integer units ipu_bru, ipu_alux, ipu_aluy, ipu_ldst
;;   sr71_imacc  - ipu_macc_iter
;;   sr71_cp1    - fpu_mov, fpu_load
;;   sr71_cp2    - fpu_fpu
;;   sr71_fextra - fpu_iter
67169689Skan(define_automaton "sr71_cpu, sr71_cpu1, sr71_cp1, sr71_cp2, sr71_fextra, sr71_imacc")
68169689Skan
69169689Skan;;  feeders for CPU function units and feeders for fpu (CP1 interface)
;; Six issue-slot units, all in automaton sr71_cpu.  In this file
;; sr_iss0..sr_iss3 are the alternatives of cpu_iss and sr_iss4/sr_iss5
;; are the alternatives of cp1_iss.
70169689Skan(define_cpu_unit "sr_iss0,sr_iss1,sr_iss2,sr_iss3,sr_iss4,sr_iss5" "sr71_cpu")
71169689Skan
72169689Skan;; CPU function units
;; ipu_bru, ipu_alux, ipu_aluy and ipu_ldst share automaton sr71_cpu1.
;; ipu_macc_iter is placed in its own automaton (sr71_imacc); in this file
;; it is reserved only by the integer multiply/divide reservations.
73169689Skan(define_cpu_unit "ipu_bru"       "sr71_cpu1")
74169689Skan(define_cpu_unit "ipu_alux"      "sr71_cpu1")
75169689Skan(define_cpu_unit "ipu_aluy"      "sr71_cpu1")
76169689Skan(define_cpu_unit "ipu_ldst"      "sr71_cpu1")
77169689Skan(define_cpu_unit "ipu_macc_iter" "sr71_imacc")
78169689Skan
79169689Skan
80169689Skan;; Floating-point unit (Co-processor interface 1).
;; fpu_mov and fpu_load live in automaton sr71_cp1; fpu_fpu is kept apart
;; in automaton sr71_cp2.
81169689Skan(define_cpu_unit "fpu_mov"          "sr71_cp1")
82169689Skan(define_cpu_unit "fpu_load"         "sr71_cp1")
83169689Skan(define_cpu_unit "fpu_fpu"          "sr71_cp2")
84169689Skan
85169689Skan;; fictitious unit to track long float insns with separate automaton
;; fpu_iter is the only unit in automaton sr71_fextra.  In this file it is
;; reserved through rf_multi1 and the explicit (fpu_iter*N) terms of the
;; fdiv, fsqrt and frsqrt reservations.
86169689Skan(define_cpu_unit "fpu_iter"         "sr71_fextra")
87169689Skan
88169689Skan
89169689Skan;;
90169689Skan;; Define common execution path (reservation) combinations
91169689Skan;;
92169689Skan
93169689Skan;;
;; cpu_iss: any one of the four issue slots sr_iss0..sr_iss3 ("|" lists
;; alternatives), held for a single cycle.
94169689Skan(define_reservation "cpu_iss"         "sr_iss0|sr_iss1|sr_iss2|sr_iss3")
95169689Skan
96169689Skan;; two cycles are used for instructions using the fpu as it runs
97169689Skan;; at half the clock speed of the cpu. By adding an extra cycle
98169689Skan;; to the issue units, the default/minimum "repeat" dispatch delay is
99169689Skan;; accounted for in all insns.
;; cp1_iss: either sr_iss4 or sr_iss5, with the chosen slot held for two
;; consecutive cycles ("*2" repeats the unit for two cycles).
100169689Skan(define_reservation "cp1_iss"         "(sr_iss4*2)|(sr_iss5*2)")
101169689Skan
;; serial_dispatch: all six issue slots reserved in the same cycle ("+"
;; means simultaneous reservation).  Used below for the "unknown" and
;; "multi" insn types.
102169689Skan(define_reservation "serial_dispatch" "sr_iss0+sr_iss1+sr_iss2+sr_iss3+sr_iss4+sr_iss5")
103169689Skan
104169689Skan;; Simulate a 6 insn VLIW dispatch, 1 cycle in dispatch followed by
105169689Skan;; reservation of function unit.
;; Integer paths: one cycle in a cpu issue slot, then ("," advances to the
;; next cycle) one cycle in a function unit:
;;   ri_insns  - either ipu_alux or ipu_aluy
;;   ri_mem    - ipu_ldst
;;   ri_alux   - ipu_alux only
;;   ri_branch - ipu_bru
106169689Skan(define_reservation "ri_insns"         "cpu_iss,(ipu_alux|ipu_aluy)")
107169689Skan(define_reservation "ri_mem"           "cpu_iss,ipu_ldst")
108169689Skan(define_reservation "ri_alux"          "cpu_iss,ipu_alux")
109169689Skan(define_reservation "ri_branch"        "cpu_iss,ipu_bru")
110169689Skan
;; Floating-point paths: cp1_iss (two cycles in sr_iss4 or sr_iss5), then
;; one cycle in fpu_fpu (rf_insn) or fpu_load (rf_ldmem).
111169689Skan(define_reservation "rf_insn"          "cp1_iss,fpu_fpu")
112169689Skan(define_reservation "rf_ldmem"         "cp1_iss,fpu_load")
113169689Skan
114169689Skan; simultaneous reservation of pseudo-unit keeps cp1 fpu tied
115169689Skan; up until long cycle insn is finished...
; rf_multi1: the rf_insn path with fpu_iter reserved alongside it ("+").
116169689Skan(define_reservation "rf_multi1"        "rf_insn+fpu_iter")
117169689Skan
118169689Skan;;
119169689Skan;; The ordering of the instruction-execution-path/resource-usage
120169689Skan;; descriptions (also known as reservation RTL) is roughly ordered
121169689Skan;; based on the define attribute RTL for the "type" classification.
122169689Skan;; When modifying, remember that the first test that matches is the
123169689Skan;; reservation used!
124169689Skan;;
125169689Skan
126169689Skan
;; Insns of type "unknown": latency 1.  serial_dispatch takes all six issue
;; slots, and every other reservation in this file starts by taking an
;; issue slot, so such an insn is modelled as issuing alone.
127169689Skan(define_insn_reservation "ir_sr70_unknown" 1
128169689Skan  (and (eq_attr "cpu" "sr71000")
129169689Skan       (eq_attr "type" "unknown"))
130169689Skan  "serial_dispatch")
131169689Skan
132169689Skan
133169689Skan;; Assume prediction fails.
;; Types branch, jump and call: latency 6; one cycle in a cpu issue slot
;; followed by one cycle in ipu_bru (ri_branch).
134169689Skan(define_insn_reservation "ir_sr70_branch" 6
135169689Skan  (and (eq_attr "cpu" "sr71000")
136169689Skan       (eq_attr "type" "branch,jump,call"))
137169689Skan  "ri_branch")
138169689Skan
;; Integer load (latency 2) and store (latency 1).  Both use ri_mem: a cpu
;; issue slot, then ipu_ldst.
139169689Skan(define_insn_reservation "ir_sr70_load" 2
140169689Skan  (and (eq_attr "cpu" "sr71000")
141169689Skan       (eq_attr "type" "load"))
142169689Skan  "ri_mem")
143169689Skan
144169689Skan(define_insn_reservation "ir_sr70_store" 1
145169689Skan  (and (eq_attr "cpu" "sr71000")
146169689Skan       (eq_attr "type" "store"))
147169689Skan  "ri_mem")
148169689Skan
149169689Skan
150169689Skan;;
151169689Skan;; float loads/stores flow through both cpu and cp1...
152169689Skan;;
;; FP load (types fpload, fpidxload; latency 9) and FP store (types
;; fpstore, fpidxstore; latency 1).  Each first takes a cpu issue slot and
;; a cp1 issue slot together, then reserves the integer memory path together
;; with an FP unit path (rf_ldmem for loads, fpu_mov for stores).
;; NOTE(review): ri_mem and rf_ldmem themselves begin with an issue-slot
;; stage (cpu_iss / cp1_iss), so these expressions appear to reserve issue
;; slots a second time -- confirm this is intended by the timing model.
153169689Skan(define_insn_reservation "ir_sr70_fload" 9
154169689Skan  (and (eq_attr "cpu" "sr71000")
155169689Skan       (eq_attr "type" "fpload,fpidxload"))
156169689Skan  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")
157169689Skan
158169689Skan(define_insn_reservation "ir_sr70_fstore" 1
159169689Skan  (and (eq_attr "cpu" "sr71000")
160169689Skan       (eq_attr "type" "fpstore,fpidxstore"))
161169689Skan  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")
162169689Skan
163169689Skan
164169689Skan;; This reservation is for conditional move based on integer
165169689Skan;; or floating point CC.
;; Type condmove: latency 4; issues like an ordinary ALU insn (ri_insns:
;; cpu issue slot, then ipu_alux or ipu_aluy).
166169689Skan(define_insn_reservation "ir_sr70_condmove" 4
167169689Skan  (and (eq_attr "cpu" "sr71000")
168169689Skan       (eq_attr "type" "condmove"))
169169689Skan  "ri_insns")
170169689Skan
171169689Skan;; Try to discriminate move-from-cp1 versus move-to-cp1 as latencies
172169689Skan;; are different. Like float load/store, these insns use multiple
173169689Skan;; resources simultaneously
;; Type xfer, split on the "mode" attribute:
;;   ir_sr70_xfer_from - mode test "!SF,DF,FPSW", latency 6, same
;;                       reservation string as ir_sr70_fstore.
;;   ir_sr70_xfer_to   - mode SF or DF, latency 9, same reservation string
;;                       as ir_sr70_fload.
;; NOTE(review): if the leading "!" negates the whole list, an xfer whose
;; mode is FPSW matches neither reservation here -- confirm that is intended.
174169689Skan(define_insn_reservation "ir_sr70_xfer_from" 6
175169689Skan  (and (eq_attr "cpu" "sr71000")
176169689Skan       (and (eq_attr "type" "xfer")
177169689Skan	    (eq_attr "mode" "!SF,DF,FPSW")))
178169689Skan  "(cpu_iss+cp1_iss),(fpu_mov+ri_mem)")
179169689Skan
180169689Skan(define_insn_reservation "ir_sr70_xfer_to" 9
181169689Skan  (and (eq_attr "cpu" "sr71000")
182169689Skan       (and (eq_attr "type" "xfer")
183169689Skan	    (eq_attr "mode" "SF,DF")))
184169689Skan  "(cpu_iss+cp1_iss),(ri_mem+rf_ldmem)")
185169689Skan
;; Single-cycle integer insns (latency 1), all using ri_insns (cpu issue
;; slot, then either ALU): types mthilo/mfhilo, and the
;; arith/shift/slt/clz/const/trap group.
186169689Skan(define_insn_reservation "ir_sr70_hilo" 1
187169689Skan  (and (eq_attr "cpu" "sr71000")
188169689Skan       (eq_attr "type" "mthilo,mfhilo"))
189169689Skan  "ri_insns")
190169689Skan
191169689Skan(define_insn_reservation "ir_sr70_arith" 1
192169689Skan  (and (eq_attr "cpu" "sr71000")
193169689Skan       (eq_attr "type" "arith,shift,slt,clz,const,trap"))
194169689Skan  "ri_insns")
195169689Skan
196169689Skan;; emulate repeat (dispatch stall) by spending extra cycle(s)
197169689Skan;; in iter unit
;; Integer multiply / multiply-add (types imul, imul3, imadd).
;; Path: ri_alux (cpu issue slot, then ipu_alux), a second cycle in
;; ipu_alux, then ipu_macc_iter.
;;   SI mode: latency 4, 1 cycle  in ipu_macc_iter.
;;   DI mode: latency 6, 3 cycles in ipu_macc_iter.
198169689Skan(define_insn_reservation "ir_sr70_imul_si" 4
199169689Skan  (and (eq_attr "cpu" "sr71000")
200169689Skan       (and (eq_attr "type" "imul,imul3,imadd")
201169689Skan	    (eq_attr "mode" "SI")))
202169689Skan  "ri_alux,ipu_alux,ipu_macc_iter")
203169689Skan
204169689Skan(define_insn_reservation "ir_sr70_imul_di" 6
205169689Skan  (and (eq_attr "cpu" "sr71000")
206169689Skan       (and (eq_attr "type" "imul,imul3,imadd")
207169689Skan	    (eq_attr "mode" "DI")))
208169689Skan  "ri_alux,ipu_alux,(ipu_macc_iter*3)")
209169689Skan
210169689Skan;; Divide algorithm is early out with best latency of 7 pcycles.
211169689Skan;; Use worst case for scheduling purposes.
;; Integer divide (type idiv).  Same leading path as the multiplies
;; (ri_alux, then a second cycle in ipu_alux), followed by a long stay in
;; ipu_macc_iter.
;;   SI mode: latency 41, 38 cycles in ipu_macc_iter.
;;   DI mode: latency 73, 70 cycles in ipu_macc_iter.
212169689Skan(define_insn_reservation "ir_sr70_idiv_si" 41
213169689Skan  (and (eq_attr "cpu" "sr71000")
214169689Skan       (and (eq_attr "type" "idiv")
215169689Skan	    (eq_attr "mode" "SI")))
216169689Skan  "ri_alux,ipu_alux,(ipu_macc_iter*38)")
217169689Skan
218169689Skan(define_insn_reservation "ir_sr70_idiv_di" 73
219169689Skan  (and (eq_attr "cpu" "sr71000")
220169689Skan       (and (eq_attr "type" "idiv")
221169689Skan	    (eq_attr "mode" "DI")))
222169689Skan  "ri_alux,ipu_alux,(ipu_macc_iter*70)")
223169689Skan
224169689Skan;; extra reservations of fpu_fpu are for repeat latency
;; FP add (type fadd): rf_insn (cp1 issue, then fpu_fpu) plus one further
;; cycle in fpu_fpu.  Latency 8 for SF mode, 10 for DF mode; the
;; reservation string is the same for both.
225169689Skan(define_insn_reservation "ir_sr70_fadd_sf" 8
226169689Skan  (and (eq_attr "cpu" "sr71000")
227169689Skan       (and (eq_attr "type" "fadd")
228169689Skan	    (eq_attr "mode" "SF")))
229169689Skan  "rf_insn,fpu_fpu")
230169689Skan
231169689Skan(define_insn_reservation "ir_sr70_fadd_df" 10
232169689Skan  (and (eq_attr "cpu" "sr71000")
233169689Skan       (and (eq_attr "type" "fadd")
234169689Skan	    (eq_attr "mode" "DF")))
235169689Skan  "rf_insn,fpu_fpu")
236169689Skan
237169689Skan;; Latencies for MADD,MSUB, NMADD, NMSUB assume the Multiply is fused
238169689Skan;; with the sub or add.
;; FP multiply and multiply-add (types fmul, fmadd).
;;   SF mode: latency 8;  rf_insn, then 1 more cycle in fpu_fpu.
;;   DF mode: latency 16; rf_insn, then 6 more cycles in fpu_fpu.
239169689Skan(define_insn_reservation "ir_sr70_fmul_sf" 8
240169689Skan  (and (eq_attr "cpu" "sr71000")
241169689Skan       (and (eq_attr "type" "fmul,fmadd")
242169689Skan	    (eq_attr "mode" "SF")))
243169689Skan  "rf_insn,fpu_fpu")
244169689Skan
245169689Skan;; tie up the fpu unit to emulate the balance for the "repeat
246169689Skan;; rate" of 8 (2 are spent in the iss unit)
247169689Skan(define_insn_reservation "ir_sr70_fmul_df" 16
248169689Skan  (and (eq_attr "cpu" "sr71000")
249169689Skan       (and (eq_attr "type" "fmul,fmadd")
250169689Skan	    (eq_attr "mode" "DF")))
251169689Skan  "rf_insn,fpu_fpu*6")
252169689Skan
253169689Skan
254169689Skan;; RECIP insn uses same type attr as div, and for SR3, has same
255169689Skan;; timings for double. However, single RECIP has a latency of
256169689Skan;; 28 -- only way to fix this is to introduce new insn attrs.
257169689Skan;; cycles spent in iter unit are designed to satisfy balance
258169689Skan;; of "repeat" latency after insn uses up rf_multi1 reservation
;; FP divide and reciprocal (types fdiv, frdiv).  rf_multi1 is reserved
;; together with a run of fpu_iter cycles.
;;   SF mode: latency 60,  fpu_iter*51.
;;   DF mode: latency 120, fpu_iter*109.
259169689Skan(define_insn_reservation "ir_sr70_fdiv_sf" 60
260169689Skan  (and (eq_attr "cpu" "sr71000")
261169689Skan       (and (eq_attr "type" "fdiv,frdiv")
262169689Skan	    (eq_attr "mode" "SF")))
263169689Skan  "rf_multi1+(fpu_iter*51)")
264169689Skan
265169689Skan(define_insn_reservation "ir_sr70_fdiv_df" 120
266169689Skan  (and (eq_attr "cpu" "sr71000")
267169689Skan       (and (eq_attr "type" "fdiv,frdiv")
268169689Skan	    (eq_attr "mode" "DF")))
269169689Skan  "rf_multi1+(fpu_iter*109)")
270169689Skan
;; Short FP operations, both using rf_insn followed by one more cycle in
;; fpu_fpu:
;;   types fabs, fneg, fmove - latency 4
;;   type  fcmp              - latency 10
271169689Skan(define_insn_reservation "ir_sr70_fabs" 4
272169689Skan  (and (eq_attr "cpu" "sr71000")
273169689Skan       (eq_attr "type" "fabs,fneg,fmove"))
274169689Skan  "rf_insn,fpu_fpu")
275169689Skan
276169689Skan(define_insn_reservation "ir_sr70_fcmp" 10
277169689Skan  (and (eq_attr "cpu" "sr71000")
278169689Skan       (eq_attr "type" "fcmp"))
279169689Skan  "rf_insn,fpu_fpu")
280169689Skan
281169689Skan;; "fcvt" type attribute covers a number of diff insns, most have the same
282169689Skan;; latency descriptions, a few vary. We use the
283169689Skan;; most common timing (which is also worst case).
;; Type fcvt: latency 12; rf_insn, then 4 more cycles in fpu_fpu.
284169689Skan(define_insn_reservation "ir_sr70_fcvt" 12
285169689Skan  (and (eq_attr "cpu" "sr71000")
286169689Skan       (eq_attr "type" "fcvt"))
287169689Skan  "rf_insn,fpu_fpu*4")
288169689Skan
;; FP square root (type fsqrt).  Same shape as the fdiv reservations:
;; rf_multi1 together with a run of fpu_iter cycles.
;;   SF mode: latency 62,  fpu_iter*53.
;;   DF mode: latency 122, fpu_iter*111.
289169689Skan(define_insn_reservation "ir_sr70_fsqrt_sf" 62
290169689Skan  (and (eq_attr "cpu" "sr71000")
291169689Skan       (and (eq_attr "type" "fsqrt")
292169689Skan	    (eq_attr "mode" "SF")))
293169689Skan  "rf_multi1+(fpu_iter*53)")
294169689Skan
295169689Skan(define_insn_reservation "ir_sr70_fsqrt_df" 122
296169689Skan  (and (eq_attr "cpu" "sr71000")
297169689Skan       (and (eq_attr "type" "fsqrt")
298169689Skan	    (eq_attr "mode" "DF")))
299169689Skan  "rf_multi1+(fpu_iter*111)")
300169689Skan
;; FP reciprocal square root (type frsqrt).  rf_multi1 together with a run
;; of fpu_iter cycles.
;;   SF mode: latency 48,  fpu_iter*39.
;;   DF mode: latency 240, fpu_iter*229.
301169689Skan(define_insn_reservation "ir_sr70_frsqrt_sf" 48
302169689Skan  (and (eq_attr "cpu" "sr71000")
303169689Skan       (and (eq_attr "type" "frsqrt")
304169689Skan	    (eq_attr "mode" "SF")))
305169689Skan  "rf_multi1+(fpu_iter*39)")
306169689Skan
307169689Skan(define_insn_reservation "ir_sr70_frsqrt_df" 240
308169689Skan  (and (eq_attr "cpu" "sr71000")
309169689Skan       (and (eq_attr "type" "frsqrt")
310169689Skan	    (eq_attr "mode" "DF")))
311169689Skan  "rf_multi1+(fpu_iter*229)")
312169689Skan
;; Type multi: latency 1, takes all six issue slots (serial_dispatch), the
;; same treatment as type "unknown".
;; Type nop: latency 1, scheduled like an ordinary ALU insn (ri_insns).
313169689Skan(define_insn_reservation "ir_sr70_multi" 1
314169689Skan  (and (eq_attr "cpu" "sr71000")
315169689Skan       (eq_attr "type" "multi"))
316169689Skan  "serial_dispatch")
317169689Skan
318169689Skan(define_insn_reservation "ir_sr70_nop" 1
319169689Skan  (and (eq_attr "cpu" "sr71000")
320169689Skan       (eq_attr "type" "nop"))
321169689Skan  "ri_insns")
322