1/* Swing Modulo Scheduling implementation.
2   Copyright (C) 2004, 2005
3   Free Software Foundation, Inc.
4   Contributed by Ayal Zaks and Mustafa Hagog <zaks,mustafa@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 2, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING.  If not, write to the Free
20Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
2102110-1301, USA.  */
22
23
24#include "config.h"
25#include "system.h"
26#include "coretypes.h"
27#include "tm.h"
28#include "toplev.h"
29#include "rtl.h"
30#include "tm_p.h"
31#include "hard-reg-set.h"
32#include "regs.h"
33#include "function.h"
34#include "flags.h"
35#include "insn-config.h"
36#include "insn-attr.h"
37#include "except.h"
38#include "toplev.h"
39#include "recog.h"
40#include "sched-int.h"
41#include "target.h"
42#include "cfglayout.h"
43#include "cfgloop.h"
44#include "cfghooks.h"
45#include "expr.h"
46#include "params.h"
47#include "gcov-io.h"
48#include "df.h"
49#include "ddg.h"
50#include "timevar.h"
51#include "tree-pass.h"
52
53#ifdef INSN_SCHEDULING
54
55/* This file contains the implementation of the Swing Modulo Scheduler,
56   described in the following references:
57   [1] J. Llosa, A. Gonzalez, E. Ayguade, M. Valero., and J. Eckhardt.
58       Lifetime--sensitive modulo scheduling in a production environment.
59       IEEE Trans. on Comps., 50(3), March 2001
60   [2] J. Llosa, A. Gonzalez, E. Ayguade, and M. Valero.
61       Swing Modulo Scheduling: A Lifetime Sensitive Approach.
62       PACT '96 , pages 80-87, October 1996 (Boston - Massachusetts - USA).
63
64   The basic structure is:
65   1. Build a data-dependence graph (DDG) for each loop.
66   2. Use the DDG to order the insns of a loop (not in topological order
67      necessarily, but rather) trying to place each insn after all its
68      predecessors _or_ after all its successors.
69   3. Compute MII: a lower bound on the number of cycles to schedule the loop.
70   4. Use the ordering to perform list-scheduling of the loop:
71      1. Set II = MII.  We will try to schedule the loop within II cycles.
72      2. Try to schedule the insns one by one according to the ordering.
73	 For each insn compute an interval of cycles by considering already-
74	 scheduled preds and succs (and associated latencies); try to place
75	 the insn in the cycles of this window checking for potential
76	 resource conflicts (using the DFA interface).
77	 Note: this is different from the cycle-scheduling of schedule_insns;
78	 here the insns are not scheduled monotonically top-down (nor bottom-
79	 up).
80      3. If failed in scheduling all insns - bump II++ and try again, unless
81	 II reaches an upper bound MaxII, in which case report failure.
82   5. If we succeeded in scheduling the loop within II cycles, we now
83      generate prolog and epilog, decrease the counter of the loop, and
84      perform modulo variable expansion for live ranges that span more than
85      II cycles (i.e. use register copies to prevent a def from overwriting
86      itself before reaching the use).
87*/
88
89
90/* This page defines partial-schedule structures and functions for
91   modulo scheduling.  */
92
93typedef struct partial_schedule *partial_schedule_ptr;
94typedef struct ps_insn *ps_insn_ptr;
95
96/* The minimum (absolute) cycle that a node of ps was scheduled in.  */
97#define PS_MIN_CYCLE(ps) (((partial_schedule_ptr)(ps))->min_cycle)
98
99/* The maximum (absolute) cycle that a node of ps was scheduled in.  */
100#define PS_MAX_CYCLE(ps) (((partial_schedule_ptr)(ps))->max_cycle)
101
102/* Perform signed modulo, always returning a non-negative value.  */
103#define SMODULO(x,y) ((x) % (y) < 0 ? ((x) % (y) + (y)) : (x) % (y))
104
105/* The number of different iterations the nodes in ps span, assuming
106   the stage boundaries are placed efficiently.  */
107#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
108			     + 1 + (ps)->ii - 1) / (ps)->ii)
109
110/* A single instruction in the partial schedule.  */
111struct ps_insn
112{
113  /* The corresponding DDG_NODE.  */
114  ddg_node_ptr node;
115
116  /* The (absolute) cycle in which the PS instruction is scheduled.
117     Same as SCHED_TIME (node).  */
118  int cycle;
119
120  /* The next/prev PS_INSN in the same row.  */
121  ps_insn_ptr next_in_row,
122	      prev_in_row;
123
124  /* The number of nodes in the same row that come after this node.  */
125  int row_rest_count;
126};
127
128/* Holds the partial schedule as an array of II rows.  Each entry of the
129   array points to a linked list of PS_INSNs, which represents the
130   instructions that are scheduled for that row.  */
131struct partial_schedule
132{
133  int ii;	/* Number of rows in the partial schedule.  */
134  int history;  /* Threshold for conflict checking using DFA.  */
135
136  /* rows[i] points to linked list of insns scheduled in row i (0<=i<ii).  */
137  ps_insn_ptr *rows;
138
139  /* The earliest absolute cycle of an insn in the partial schedule.  */
140  int min_cycle;
141
142  /* The latest absolute cycle of an insn in the partial schedule.  */
143  int max_cycle;
144
145  ddg_ptr g;	/* The DDG of the insns in the partial schedule.  */
146};
147
148/* We use this to record all the register replacements we do in
149   the kernel so we can undo SMS if it is not profitable.  */
150struct undo_replace_buff_elem
151{
152  rtx insn;
153  rtx orig_reg;
154  rtx new_reg;
155  struct undo_replace_buff_elem *next;
156};
157
158
159
160partial_schedule_ptr create_partial_schedule (int ii, ddg_ptr, int history);
161void free_partial_schedule (partial_schedule_ptr);
162void reset_partial_schedule (partial_schedule_ptr, int new_ii);
163void print_partial_schedule (partial_schedule_ptr, FILE *);
164static int kernel_number_of_cycles (rtx first_insn, rtx last_insn);
165static ps_insn_ptr ps_add_node_check_conflicts (partial_schedule_ptr,
166						ddg_node_ptr node, int cycle,
167						sbitmap must_precede,
168						sbitmap must_follow);
169static void rotate_partial_schedule (partial_schedule_ptr, int);
170void set_row_column_for_ps (partial_schedule_ptr);
171static bool ps_unschedule_node (partial_schedule_ptr, ddg_node_ptr );
172
173
174/* This page defines constants and structures for the modulo scheduling
175   driver.  */
176
177/* As in haifa-sched.c:  */
178/* issue_rate is the number of insns that can be scheduled in the same
179   machine cycle.  It can be defined in the config/mach/mach.h file,
180   otherwise we set it to 1.  */
181
182static int issue_rate;
183
184/* For printing statistics.  */
185static FILE *stats_file;
186
187static int sms_order_nodes (ddg_ptr, int, int * result);
188static void set_node_sched_params (ddg_ptr);
189static partial_schedule_ptr sms_schedule_by_order (ddg_ptr, int, int,
190						   int *, FILE*);
191static void permute_partial_schedule (partial_schedule_ptr ps, rtx last);
192static void generate_prolog_epilog (partial_schedule_ptr ,struct loop * loop, rtx);
193static void duplicate_insns_of_cycles (partial_schedule_ptr ps,
194				       int from_stage, int to_stage,
195				       int is_prolog);
196
197#define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
198#define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
199#define SCHED_FIRST_REG_MOVE(x) \
200	(((node_sched_params_ptr)(x)->aux.info)->first_reg_move)
201#define SCHED_NREG_MOVES(x) \
202	(((node_sched_params_ptr)(x)->aux.info)->nreg_moves)
203#define SCHED_ROW(x) (((node_sched_params_ptr)(x)->aux.info)->row)
204#define SCHED_STAGE(x) (((node_sched_params_ptr)(x)->aux.info)->stage)
205#define SCHED_COLUMN(x) (((node_sched_params_ptr)(x)->aux.info)->column)
206
207/* The scheduling parameters held for each node.  */
208typedef struct node_sched_params
209{
210  int asap;	/* A lower-bound on the absolute scheduling cycle.  */
211  int time;	/* The absolute scheduling cycle (time >= asap).  */
212
213  /* The following field (first_reg_move) is a pointer to the first
214     register-move instruction added to handle the modulo-variable-expansion
215     of the register defined by this node.  This register-move copies the
216     original register defined by the node.  */
217  rtx first_reg_move;
218
219  /* The number of register-move instructions added, immediately preceding
220     first_reg_move.  */
221  int nreg_moves;
222
223  int row;    /* Holds time % ii.  */
224  int stage;  /* Holds time / ii.  */
225
226  /* The column of a node inside the ps.  If nodes u, v are on the same row,
227     u will precede v if column (u) < column (v).  */
228  int column;
229} *node_sched_params_ptr;
230
231
232/* The following three functions are copied from the current scheduler
233   code in order to use sched_analyze() for computing the dependencies.
234   They are used when initializing the sched_info structure.  */
235static const char *
236sms_print_insn (rtx insn, int aligned ATTRIBUTE_UNUSED)
237{
238  static char tmp[80];
239
240  sprintf (tmp, "i%4d", INSN_UID (insn));
241  return tmp;
242}
243
244static int
245contributes_to_priority (rtx next, rtx insn)
246{
247  return BLOCK_NUM (next) == BLOCK_NUM (insn);
248}
249
250static void
251compute_jump_reg_dependencies (rtx insn ATTRIBUTE_UNUSED,
252			       regset cond_exec ATTRIBUTE_UNUSED,
253			       regset used ATTRIBUTE_UNUSED,
254			       regset set ATTRIBUTE_UNUSED)
255{
256}
257
258static struct sched_info sms_sched_info =
259{
260  NULL,
261  NULL,
262  NULL,
263  NULL,
264  NULL,
265  sms_print_insn,
266  contributes_to_priority,
267  compute_jump_reg_dependencies,
268  NULL, NULL,
269  NULL, NULL,
270  0, 0, 0
271};
272
273
274/* Return the register decremented and tested in INSN,
275   or zero if it is not a decrement-and-branch insn.  */
276
277static rtx
278doloop_register_get (rtx insn ATTRIBUTE_UNUSED)
279{
280#ifdef HAVE_doloop_end
281  rtx pattern, reg, condition;
282
283  if (! JUMP_P (insn))
284    return NULL_RTX;
285
286  pattern = PATTERN (insn);
287  condition = doloop_condition_get (pattern);
288  if (! condition)
289    return NULL_RTX;
290
291  if (REG_P (XEXP (condition, 0)))
292    reg = XEXP (condition, 0);
293  else if (GET_CODE (XEXP (condition, 0)) == PLUS
294	   && REG_P (XEXP (XEXP (condition, 0), 0)))
295    reg = XEXP (XEXP (condition, 0), 0);
296  else
297    gcc_unreachable ();
298
299  return reg;
300#else
301  return NULL_RTX;
302#endif
303}
304
305/* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
306   that the number of iterations is a compile-time constant.  If so,
307   return the rtx that sets COUNT_REG to a constant, and set COUNT to
308   this constant.  Otherwise return 0.  */
309static rtx
310const_iteration_count (rtx count_reg, basic_block pre_header,
311		       HOST_WIDEST_INT * count)
312{
313  rtx insn;
314  rtx head, tail;
315
316  if (! pre_header)
317    return NULL_RTX;
318
319  get_block_head_tail (pre_header->index, &head, &tail);
320
321  for (insn = tail; insn != PREV_INSN (head); insn = PREV_INSN (insn))
322    if (INSN_P (insn) && single_set (insn) &&
323	rtx_equal_p (count_reg, SET_DEST (single_set (insn))))
324      {
325	rtx pat = single_set (insn);
326
327	if (GET_CODE (SET_SRC (pat)) == CONST_INT)
328	  {
329	    *count = INTVAL (SET_SRC (pat));
330	    return insn;
331	  }
332
333	return NULL_RTX;
334      }
335
336  return NULL_RTX;
337}
338
339/* A very simple resource-based lower bound on the initiation interval.
340   ??? Improve the accuracy of this bound by considering the
341   utilization of various units.  */
342static int
343res_MII (ddg_ptr g)
344{
345  return (g->num_nodes / issue_rate);
346}
347
348
349/* Points to the array that contains the sched data for each node.  */
350static node_sched_params_ptr node_sched_params;
351
352/* Allocate sched_params for each node and initialize it.  Assumes that
353   the aux field of each node contain the asap bound (computed earlier),
354   and copies it into the sched_params field.  */
355static void
356set_node_sched_params (ddg_ptr g)
357{
358  int i;
359
360  /* Allocate for each node in the DDG a place to hold the "sched_data".  */
361  /* Initialize ASAP/ALAP/HIGHT to zero.  */
362  node_sched_params = (node_sched_params_ptr)
363		       xcalloc (g->num_nodes,
364				sizeof (struct node_sched_params));
365
366  /* Set the pointer of the general data of the node to point to the
367     appropriate sched_params structure.  */
368  for (i = 0; i < g->num_nodes; i++)
369    {
370      /* Watch out for aliasing problems?  */
371      node_sched_params[i].asap = g->nodes[i].aux.count;
372      g->nodes[i].aux.info = &node_sched_params[i];
373    }
374}
375
376static void
377print_node_sched_params (FILE * dump_file, int num_nodes)
378{
379  int i;
380
381  if (! dump_file)
382    return;
383  for (i = 0; i < num_nodes; i++)
384    {
385      node_sched_params_ptr nsp = &node_sched_params[i];
386      rtx reg_move = nsp->first_reg_move;
387      int j;
388
389      fprintf (dump_file, "Node %d:\n", i);
390      fprintf (dump_file, " asap = %d:\n", nsp->asap);
391      fprintf (dump_file, " time = %d:\n", nsp->time);
392      fprintf (dump_file, " nreg_moves = %d:\n", nsp->nreg_moves);
393      for (j = 0; j < nsp->nreg_moves; j++)
394	{
395	  fprintf (dump_file, " reg_move = ");
396	  print_rtl_single (dump_file, reg_move);
397	  reg_move = PREV_INSN (reg_move);
398	}
399    }
400}
401
402/* Calculate an upper bound for II.  SMS should not schedule the loop if it
403   requires more cycles than this bound.  Currently set to the sum of the
404   longest latency edge for each node.  Reset based on experiments.  */
405static int
406calculate_maxii (ddg_ptr g)
407{
408  int i;
409  int maxii = 0;
410
411  for (i = 0; i < g->num_nodes; i++)
412    {
413      ddg_node_ptr u = &g->nodes[i];
414      ddg_edge_ptr e;
415      int max_edge_latency = 0;
416
417      for (e = u->out; e; e = e->next_out)
418	max_edge_latency = MAX (max_edge_latency, e->latency);
419
420      maxii += max_edge_latency;
421    }
422  return maxii;
423}
424
425/*
426   Breaking intra-loop register anti-dependences:
427   Each intra-loop register anti-dependence implies a cross-iteration true
428   dependence of distance 1. Therefore, we can remove such false dependencies
429   and figure out if the partial schedule broke them by checking if (for a
430   true-dependence of distance 1): SCHED_TIME (def) < SCHED_TIME (use) and
431   if so generate a register move.   The number of such moves is equal to:
432              SCHED_TIME (use) - SCHED_TIME (def)       { 0 broken
433   nreg_moves = ----------------------------------- + 1 - {   dependence.
434                            ii                          { 1 if not.
435*/
436static struct undo_replace_buff_elem *
437generate_reg_moves (partial_schedule_ptr ps)
438{
439  ddg_ptr g = ps->g;
440  int ii = ps->ii;
441  int i;
442  struct undo_replace_buff_elem *reg_move_replaces = NULL;
443
444  for (i = 0; i < g->num_nodes; i++)
445    {
446      ddg_node_ptr u = &g->nodes[i];
447      ddg_edge_ptr e;
448      int nreg_moves = 0, i_reg_move;
449      sbitmap *uses_of_defs;
450      rtx last_reg_move;
451      rtx prev_reg, old_reg;
452
453      /* Compute the number of reg_moves needed for u, by looking at life
454	 ranges started at u (excluding self-loops).  */
455      for (e = u->out; e; e = e->next_out)
456	if (e->type == TRUE_DEP && e->dest != e->src)
457	  {
458	    int nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
459
460            if (e->distance == 1)
461              nreg_moves4e = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
462
463	    /* If dest precedes src in the schedule of the kernel, then dest
464	       will read before src writes and we can save one reg_copy.  */
465	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
466		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
467	      nreg_moves4e--;
468
469	    nreg_moves = MAX (nreg_moves, nreg_moves4e);
470	  }
471
472      if (nreg_moves == 0)
473	continue;
474
475      /* Every use of the register defined by node may require a different
476	 copy of this register, depending on the time the use is scheduled.
477	 Set a bitmap vector, telling which nodes use each copy of this
478	 register.  */
479      uses_of_defs = sbitmap_vector_alloc (nreg_moves, g->num_nodes);
480      sbitmap_vector_zero (uses_of_defs, nreg_moves);
481      for (e = u->out; e; e = e->next_out)
482	if (e->type == TRUE_DEP && e->dest != e->src)
483	  {
484	    int dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src)) / ii;
485
486	    if (e->distance == 1)
487	      dest_copy = (SCHED_TIME (e->dest) - SCHED_TIME (e->src) + ii) / ii;
488
489	    if (SCHED_ROW (e->dest) == SCHED_ROW (e->src)
490		&& SCHED_COLUMN (e->dest) < SCHED_COLUMN (e->src))
491	      dest_copy--;
492
493	    if (dest_copy)
494	      SET_BIT (uses_of_defs[dest_copy - 1], e->dest->cuid);
495	  }
496
497      /* Now generate the reg_moves, attaching relevant uses to them.  */
498      SCHED_NREG_MOVES (u) = nreg_moves;
499      old_reg = prev_reg = copy_rtx (SET_DEST (single_set (u->insn)));
500      last_reg_move = u->insn;
501
502      for (i_reg_move = 0; i_reg_move < nreg_moves; i_reg_move++)
503	{
504	  unsigned int i_use = 0;
505	  rtx new_reg = gen_reg_rtx (GET_MODE (prev_reg));
506	  rtx reg_move = gen_move_insn (new_reg, prev_reg);
507	  sbitmap_iterator sbi;
508
509	  add_insn_before (reg_move, last_reg_move);
510	  last_reg_move = reg_move;
511
512	  if (!SCHED_FIRST_REG_MOVE (u))
513	    SCHED_FIRST_REG_MOVE (u) = reg_move;
514
515	  EXECUTE_IF_SET_IN_SBITMAP (uses_of_defs[i_reg_move], 0, i_use, sbi)
516	    {
517	      struct undo_replace_buff_elem *rep;
518
519	      rep = (struct undo_replace_buff_elem *)
520		    xcalloc (1, sizeof (struct undo_replace_buff_elem));
521	      rep->insn = g->nodes[i_use].insn;
522	      rep->orig_reg = old_reg;
523	      rep->new_reg = new_reg;
524
525	      if (! reg_move_replaces)
526		reg_move_replaces = rep;
527	      else
528		{
529		  rep->next = reg_move_replaces;
530		  reg_move_replaces = rep;
531		}
532
533	      replace_rtx (g->nodes[i_use].insn, old_reg, new_reg);
534	    }
535
536	  prev_reg = new_reg;
537	}
538    }
539  return reg_move_replaces;
540}
541
542/* We call this when we want to undo the SMS schedule for a given loop.
543   One of the things that we do is to delete the register moves generated
544   for the sake of SMS; this function deletes the register move instructions
545   recorded in the undo buffer.  */
546static void
547undo_generate_reg_moves (partial_schedule_ptr ps,
548			 struct undo_replace_buff_elem *reg_move_replaces)
549{
550  int i,j;
551
552  for (i = 0; i < ps->g->num_nodes; i++)
553    {
554      ddg_node_ptr u = &ps->g->nodes[i];
555      rtx prev;
556      rtx crr = SCHED_FIRST_REG_MOVE (u);
557
558      for (j = 0; j < SCHED_NREG_MOVES (u); j++)
559	{
560	  prev = PREV_INSN (crr);
561	  delete_insn (crr);
562	  crr = prev;
563	}
564      SCHED_FIRST_REG_MOVE (u) = NULL_RTX;
565    }
566
567  while (reg_move_replaces)
568    {
569      struct undo_replace_buff_elem *rep = reg_move_replaces;
570
571      reg_move_replaces = reg_move_replaces->next;
572      replace_rtx (rep->insn, rep->new_reg, rep->orig_reg);
573    }
574}
575
576/* Free memory allocated for the undo buffer.  */
577static void
578free_undo_replace_buff (struct undo_replace_buff_elem *reg_move_replaces)
579{
580
581  while (reg_move_replaces)
582    {
583      struct undo_replace_buff_elem *rep = reg_move_replaces;
584
585      reg_move_replaces = reg_move_replaces->next;
586      free (rep);
587    }
588}
589
590/* Bump the SCHED_TIMEs of all nodes to start from zero.  Set the values
591   of SCHED_ROW and SCHED_STAGE.  */
592static void
593normalize_sched_times (partial_schedule_ptr ps)
594{
595  int i;
596  ddg_ptr g = ps->g;
597  int amount = PS_MIN_CYCLE (ps);
598  int ii = ps->ii;
599
600  /* Don't include the closing branch assuming that it is the last node.  */
601  for (i = 0; i < g->num_nodes - 1; i++)
602    {
603      ddg_node_ptr u = &g->nodes[i];
604      int normalized_time = SCHED_TIME (u) - amount;
605
606      gcc_assert (normalized_time >= 0);
607
608      SCHED_TIME (u) = normalized_time;
609      SCHED_ROW (u) = normalized_time % ii;
610      SCHED_STAGE (u) = normalized_time / ii;
611    }
612}
613
614/* Set SCHED_COLUMN of each node according to its position in PS.  */
615static void
616set_columns_for_ps (partial_schedule_ptr ps)
617{
618  int row;
619
620  for (row = 0; row < ps->ii; row++)
621    {
622      ps_insn_ptr cur_insn = ps->rows[row];
623      int column = 0;
624
625      for (; cur_insn; cur_insn = cur_insn->next_in_row)
626	SCHED_COLUMN (cur_insn->node) = column++;
627    }
628}
629
630/* Permute the insns according to their order in PS, from row 0 to
631   row ii-1, and position them right before LAST.  This schedules
632   the insns of the loop kernel.  */
633static void
634permute_partial_schedule (partial_schedule_ptr ps, rtx last)
635{
636  int ii = ps->ii;
637  int row;
638  ps_insn_ptr ps_ij;
639
640  for (row = 0; row < ii ; row++)
641    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
642      if (PREV_INSN (last) != ps_ij->node->insn)
643      	reorder_insns_nobb (ps_ij->node->first_note, ps_ij->node->insn,
644			    PREV_INSN (last));
645}
646
647/* As part of undoing SMS we return to the original ordering of the
648   instructions inside the loop kernel.  Given the partial schedule PS, this
649   function returns the ordering of the instruction according to their CUID
650   in the DDG (PS->G), which is the original order of the instruction before
651   performing SMS.  */
652static void
653undo_permute_partial_schedule (partial_schedule_ptr ps, rtx last)
654{
655  int i;
656
657  for (i = 0 ; i < ps->g->num_nodes; i++)
658    if (last == ps->g->nodes[i].insn
659	|| last == ps->g->nodes[i].first_note)
660      break;
661    else if (PREV_INSN (last) != ps->g->nodes[i].insn)
662      reorder_insns_nobb (ps->g->nodes[i].first_note, ps->g->nodes[i].insn,
663			  PREV_INSN (last));
664}
665
666/* Used to generate the prologue & epilogue.  Duplicate the subset of
667   nodes whose stages are between FROM_STAGE and TO_STAGE (inclusive
668   of both), together with a prefix/suffix of their reg_moves.  */
669static void
670duplicate_insns_of_cycles (partial_schedule_ptr ps, int from_stage,
671			   int to_stage, int for_prolog)
672{
673  int row;
674  ps_insn_ptr ps_ij;
675
676  for (row = 0; row < ps->ii; row++)
677    for (ps_ij = ps->rows[row]; ps_ij; ps_ij = ps_ij->next_in_row)
678      {
679	ddg_node_ptr u_node = ps_ij->node;
680	int j, i_reg_moves;
681	rtx reg_move = NULL_RTX;
682
683	if (for_prolog)
684	  {
685	    /* SCHED_STAGE (u_node) >= from_stage == 0.  Generate increasing
686	       number of reg_moves starting with the second occurrence of
687	       u_node, which is generated if its SCHED_STAGE <= to_stage.  */
688	    i_reg_moves = to_stage - SCHED_STAGE (u_node) + 1;
689	    i_reg_moves = MAX (i_reg_moves, 0);
690	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
691
692	    /* The reg_moves start from the *first* reg_move backwards.  */
693	    if (i_reg_moves)
694	      {
695		reg_move = SCHED_FIRST_REG_MOVE (u_node);
696		for (j = 1; j < i_reg_moves; j++)
697		  reg_move = PREV_INSN (reg_move);
698	      }
699	  }
700	else /* It's for the epilog.  */
701	  {
702	    /* SCHED_STAGE (u_node) <= to_stage.  Generate all reg_moves,
703	       starting to decrease one stage after u_node no longer occurs;
704	       that is, generate all reg_moves until
705	       SCHED_STAGE (u_node) == from_stage - 1.  */
706	    i_reg_moves = SCHED_NREG_MOVES (u_node)
707	    	       - (from_stage - SCHED_STAGE (u_node) - 1);
708	    i_reg_moves = MAX (i_reg_moves, 0);
709	    i_reg_moves = MIN (i_reg_moves, SCHED_NREG_MOVES (u_node));
710
711	    /* The reg_moves start from the *last* reg_move forwards.  */
712	    if (i_reg_moves)
713	      {
714		reg_move = SCHED_FIRST_REG_MOVE (u_node);
715		for (j = 1; j < SCHED_NREG_MOVES (u_node); j++)
716		  reg_move = PREV_INSN (reg_move);
717	      }
718	  }
719
720	for (j = 0; j < i_reg_moves; j++, reg_move = NEXT_INSN (reg_move))
721	  emit_insn (copy_rtx (PATTERN (reg_move)));
722	if (SCHED_STAGE (u_node) >= from_stage
723	    && SCHED_STAGE (u_node) <= to_stage)
724	  duplicate_insn_chain (u_node->first_note, u_node->insn);
725      }
726}
727
728
729/* Generate the instructions (including reg_moves) for prolog & epilog.  */
730static void
731generate_prolog_epilog (partial_schedule_ptr ps, struct loop * loop, rtx count_reg)
732{
733  int i;
734  int last_stage = PS_STAGE_COUNT (ps) - 1;
735  edge e;
736
737  /* Generate the prolog, inserting its insns on the loop-entry edge.  */
738  start_sequence ();
739
740  if (count_reg)
741   /* Generate a subtract instruction at the beginning of the prolog to
742      adjust the loop count by STAGE_COUNT.  */
743   emit_insn (gen_sub2_insn (count_reg, GEN_INT (last_stage)));
744
745  for (i = 0; i < last_stage; i++)
746    duplicate_insns_of_cycles (ps, 0, i, 1);
747
748  /* Put the prolog ,  on the one and only entry edge.  */
749  e = loop_preheader_edge (loop);
750  loop_split_edge_with(e , get_insns());
751
752  end_sequence ();
753
754  /* Generate the epilog, inserting its insns on the loop-exit edge.  */
755  start_sequence ();
756
757  for (i = 0; i < last_stage; i++)
758    duplicate_insns_of_cycles (ps, i + 1, last_stage, 0);
759
760  /* Put the epilogue on the one and only one exit edge.  */
761  gcc_assert (loop->single_exit);
762  e = loop->single_exit;
763  loop_split_edge_with(e , get_insns());
764  end_sequence ();
765}
766
767/* Return the line note insn preceding INSN, for debugging.  Taken from
768   emit-rtl.c.  */
769static rtx
770find_line_note (rtx insn)
771{
772  for (; insn; insn = PREV_INSN (insn))
773    if (NOTE_P (insn)
774	&& NOTE_LINE_NUMBER (insn) >= 0)
775      break;
776
777  return insn;
778}
779
780/* Return true if all the BBs of the loop are empty except the
781   loop header.  */
782static bool
783loop_single_full_bb_p (struct loop *loop)
784{
785  unsigned i;
786  basic_block *bbs = get_loop_body (loop);
787
788  for (i = 0; i < loop->num_nodes ; i++)
789    {
790      rtx head, tail;
791      bool empty_bb = true;
792
793      if (bbs[i] == loop->header)
794        continue;
795
796      /* Make sure that basic blocks other than the header
797         have only notes labels or jumps.  */
798      get_block_head_tail (bbs[i]->index, &head, &tail);
799      for (; head != NEXT_INSN (tail); head = NEXT_INSN (head))
800        {
801          if (NOTE_P (head) || LABEL_P (head)
802 	      || (INSN_P (head) && JUMP_P (head)))
803 	    continue;
804 	  empty_bb = false;
805 	  break;
806        }
807
808      if (! empty_bb)
809        {
810          free (bbs);
811          return false;
812        }
813    }
814  free (bbs);
815  return true;
816}
817
818/* A simple loop from SMS point of view; it is a loop that is composed of
819   either a single basic block or two BBs - a header and a latch.  */
820#define SIMPLE_SMS_LOOP_P(loop) ((loop->num_nodes < 3 ) 		    \
821				  && (EDGE_COUNT (loop->latch->preds) == 1) \
822                                  && (EDGE_COUNT (loop->latch->succs) == 1))
823
824/* Return true if the loop is in its canonical form and false if not.
825   i.e. SIMPLE_SMS_LOOP_P and have one preheader block, and single exit.  */
826static bool
827loop_canon_p (struct loop *loop, FILE *dump_file)
828{
829
830  if (loop->inner || ! loop->outer)
831    return false;
832
833  if (!loop->single_exit)
834    {
835      if (dump_file)
836	{
837	  rtx line_note = find_line_note (BB_END (loop->header));
838
839	  fprintf (dump_file, "SMS loop many exits ");
840	  if (line_note)
841	    {
842	      expanded_location xloc;
843	      NOTE_EXPANDED_LOCATION (xloc, line_note);
844	      fprintf (stats_file, " %s %d (file, line)\n",
845		       xloc.file, xloc.line);
846	    }
847	}
848      return false;
849    }
850
851  if (! SIMPLE_SMS_LOOP_P (loop) && ! loop_single_full_bb_p (loop))
852    {
853      if (dump_file)
854	{
855	  rtx line_note = find_line_note (BB_END (loop->header));
856
857	  fprintf (dump_file, "SMS loop many BBs. ");
858	  if (line_note)
859	    {
860	      expanded_location xloc;
861  	      NOTE_EXPANDED_LOCATION (xloc, line_note);
862	      fprintf (stats_file, " %s %d (file, line)\n",
863		       xloc.file, xloc.line);
864	    }
865	}
866      return false;
867    }
868
869    return true;
870}
871
872/* If there are more than one entry for the loop,
873   make it one by splitting the first entry edge and
874   redirecting the others to the new BB.  */
875static void
876canon_loop (struct loop *loop)
877{
878  edge e;
879  edge_iterator i;
880
881  /* Avoid annoying special cases of edges going to exit
882     block.  */
883  FOR_EACH_EDGE (e, i, EXIT_BLOCK_PTR->preds)
884    if ((e->flags & EDGE_FALLTHRU) && (EDGE_COUNT (e->src->succs) > 1))
885      loop_split_edge_with (e, NULL_RTX);
886
887  if (loop->latch == loop->header
888      || EDGE_COUNT (loop->latch->succs) > 1)
889    {
890      FOR_EACH_EDGE (e, i, loop->header->preds)
891        if (e->src == loop->latch)
892          break;
893      loop_split_edge_with (e, NULL_RTX);
894    }
895}
896
897/* Build the loop information without loop
898   canonization, the loop canonization will
899   be performed if the loop is SMSable.  */
900static struct loops *
901build_loops_structure (FILE *dumpfile)
902{
903  struct loops *loops = xcalloc (1, sizeof (struct loops));
904
905  /* Find the loops.  */
906
907  if (flow_loops_find (loops) <= 1)
908    {
909      /* No loops.  */
910      flow_loops_free (loops);
911      free (loops);
912
913      return NULL;
914    }
915
916  /* Not going to update these.  */
917  free (loops->cfg.rc_order);
918  loops->cfg.rc_order = NULL;
919  free (loops->cfg.dfs_order);
920  loops->cfg.dfs_order = NULL;
921
922  create_preheaders (loops, CP_SIMPLE_PREHEADERS);
923  mark_single_exit_loops (loops);
924  /* Dump loops.  */
925  flow_loops_dump (loops, dumpfile, NULL, 1);
926
927#ifdef ENABLE_CHECKING
928  verify_dominators (CDI_DOMINATORS);
929  verify_loop_structure (loops);
930#endif
931
932  return loops;
933}
934
935/* Main entry point, perform SMS scheduling on the loops of the function
936   that consist of single basic blocks.  */
937void
938sms_schedule (FILE *dump_file)
939{
940  static int passes = 0;
941  rtx insn;
942  ddg_ptr *g_arr, g;
943  int * node_order;
944  int maxii;
945  unsigned i,num_loops;
946  partial_schedule_ptr ps;
947  struct df *df;
948  struct loops *loops;
949  basic_block bb = NULL;
950  /* vars to the versioning only if needed*/
951  struct loop * nloop;
952  basic_block condition_bb = NULL;
953  edge latch_edge;
954  gcov_type trip_count = 0;
955
956  if (! (loops = build_loops_structure (dump_file)))
957    return;  /* There is no loops to schedule.  */
958
959
960  stats_file = dump_file;
961
962  /* Initialize issue_rate.  */
963  if (targetm.sched.issue_rate)
964    {
965      int temp = reload_completed;
966
967      reload_completed = 1;
968      issue_rate = targetm.sched.issue_rate ();
969      reload_completed = temp;
970    }
971  else
972    issue_rate = 1;
973
974  /* Initialize the scheduler.  */
975  current_sched_info = &sms_sched_info;
976  sched_init (NULL);
977
978  /* Init Data Flow analysis, to be used in interloop dep calculation.  */
979  df = df_init ();
980  df_analyze (df, 0, DF_ALL);
981
982  /* Allocate memory to hold the DDG array one entry for each loop.
983     We use loop->num as index into this array.  */
984  g_arr = xcalloc (loops->num, sizeof (ddg_ptr));
985
986
987  /* Build DDGs for all the relevant loops and hold them in G_ARR
988     indexed by the loop index.  */
989  for (i = 0; i < loops->num; i++)
990    {
991      rtx head, tail;
992      rtx count_reg;
993      struct loop *loop = loops->parray[i];
994
995      /* For debugging.  */
996      if ((passes++ > MAX_SMS_LOOP_NUMBER) && (MAX_SMS_LOOP_NUMBER != -1))
997        {
998          if (dump_file)
999            fprintf (dump_file, "SMS reached MAX_PASSES... \n");
1000
1001          break;
1002        }
1003
1004      if (! loop_canon_p (loop, dump_file))
1005        continue;
1006
1007      if (! loop_single_full_bb_p (loop))
1008	continue;
1009
1010      bb = loop->header;
1011
1012      get_block_head_tail (bb->index, &head, &tail);
1013      latch_edge = loop_latch_edge (loop);
1014      gcc_assert (loop->single_exit);
1015      if (loop->single_exit->count)
1016	trip_count = latch_edge->count / loop->single_exit->count;
1017
1018      /* Perfrom SMS only on loops that their average count is above threshold.  */
1019
1020      if ( latch_edge->count
1021          && (latch_edge->count < loop->single_exit->count * SMS_LOOP_AVERAGE_COUNT_THRESHOLD))
1022	{
1023	  if (stats_file)
1024	    {
1025	      rtx line_note = find_line_note (tail);
1026
1027	      if (line_note)
1028		{
1029		  expanded_location xloc;
1030		  NOTE_EXPANDED_LOCATION (xloc, line_note);
1031		  fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1032			   xloc.file, xloc.line);
1033		}
1034	      fprintf (stats_file, "SMS single-bb-loop\n");
1035	      if (profile_info && flag_branch_probabilities)
1036	    	{
1037	      	  fprintf (stats_file, "SMS loop-count ");
1038	      	  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1039	             	   (HOST_WIDEST_INT) bb->count);
1040	      	  fprintf (stats_file, "\n");
1041                  fprintf (stats_file, "SMS trip-count ");
1042                  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1043                           (HOST_WIDEST_INT) trip_count);
1044                  fprintf (stats_file, "\n");
1045	      	  fprintf (stats_file, "SMS profile-sum-max ");
1046	      	  fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1047	          	   (HOST_WIDEST_INT) profile_info->sum_max);
1048	      	  fprintf (stats_file, "\n");
1049	    	}
1050	    }
1051          continue;
1052        }
1053
1054      /* Make sure this is a doloop.  */
1055      if ( !(count_reg = doloop_register_get (tail)))
1056	continue;
1057
1058      /* Don't handle BBs with calls or barriers, or !single_set insns.  */
1059      for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
1060	if (CALL_P (insn)
1061	    || BARRIER_P (insn)
1062	    || (INSN_P (insn) && !JUMP_P (insn)
1063		&& !single_set (insn) && GET_CODE (PATTERN (insn)) != USE))
1064	  break;
1065
1066      if (insn != NEXT_INSN (tail))
1067	{
1068	  if (stats_file)
1069	    {
1070	      if (CALL_P (insn))
1071		fprintf (stats_file, "SMS loop-with-call\n");
1072	      else if (BARRIER_P (insn))
1073		fprintf (stats_file, "SMS loop-with-barrier\n");
1074	      else
1075		fprintf (stats_file, "SMS loop-with-not-single-set\n");
1076	      print_rtl_single (stats_file, insn);
1077	    }
1078
1079	  continue;
1080	}
1081
1082      if (! (g = create_ddg (bb, df, 0)))
1083        {
1084          if (stats_file)
1085	    fprintf (stats_file, "SMS doloop\n");
1086	  continue;
1087        }
1088
1089      g_arr[i] = g;
1090    }
1091
1092  /* Release Data Flow analysis data structures.  */
1093  df_finish (df);
1094
1095  /* We don't want to perform SMS on new loops - created by versioning.  */
1096  num_loops = loops->num;
1097  /* Go over the built DDGs and perfrom SMS for each one of them.  */
1098  for (i = 0; i < num_loops; i++)
1099    {
1100      rtx head, tail;
1101      rtx count_reg, count_init;
1102      int mii, rec_mii;
1103      unsigned stage_count = 0;
1104      HOST_WIDEST_INT loop_count = 0;
1105      struct loop *loop = loops->parray[i];
1106
1107      if (! (g = g_arr[i]))
1108        continue;
1109
1110      if (dump_file)
1111	print_ddg (dump_file, g);
1112
1113      get_block_head_tail (loop->header->index, &head, &tail);
1114
1115      latch_edge = loop_latch_edge (loop);
1116      gcc_assert (loop->single_exit);
1117      if (loop->single_exit->count)
1118	trip_count = latch_edge->count / loop->single_exit->count;
1119
1120      if (stats_file)
1121	{
1122	  rtx line_note = find_line_note (tail);
1123
1124	  if (line_note)
1125	    {
1126	      expanded_location xloc;
1127	      NOTE_EXPANDED_LOCATION (xloc, line_note);
1128	      fprintf (stats_file, "SMS bb %s %d (file, line)\n",
1129		       xloc.file, xloc.line);
1130	    }
1131	  fprintf (stats_file, "SMS single-bb-loop\n");
1132	  if (profile_info && flag_branch_probabilities)
1133	    {
1134	      fprintf (stats_file, "SMS loop-count ");
1135	      fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1136	               (HOST_WIDEST_INT) bb->count);
1137	      fprintf (stats_file, "\n");
1138	      fprintf (stats_file, "SMS profile-sum-max ");
1139	      fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1140	               (HOST_WIDEST_INT) profile_info->sum_max);
1141	      fprintf (stats_file, "\n");
1142	    }
1143	  fprintf (stats_file, "SMS doloop\n");
1144	  fprintf (stats_file, "SMS built-ddg %d\n", g->num_nodes);
1145          fprintf (stats_file, "SMS num-loads %d\n", g->num_loads);
1146          fprintf (stats_file, "SMS num-stores %d\n", g->num_stores);
1147	}
1148
1149
1150      /* In case of th loop have doloop register it gets special
1151	 handling.  */
1152      count_init = NULL_RTX;
1153      if ((count_reg = doloop_register_get (tail)))
1154	{
1155	  basic_block pre_header;
1156
1157	  pre_header = loop_preheader_edge (loop)->src;
1158	  count_init = const_iteration_count (count_reg, pre_header,
1159					      &loop_count);
1160	}
1161      gcc_assert (count_reg);
1162
1163      if (stats_file && count_init)
1164        {
1165          fprintf (stats_file, "SMS const-doloop ");
1166          fprintf (stats_file, HOST_WIDEST_INT_PRINT_DEC,
1167		     loop_count);
1168          fprintf (stats_file, "\n");
1169        }
1170
1171      node_order = (int *) xmalloc (sizeof (int) * g->num_nodes);
1172
1173      mii = 1; /* Need to pass some estimate of mii.  */
1174      rec_mii = sms_order_nodes (g, mii, node_order);
1175      mii = MAX (res_MII (g), rec_mii);
1176      maxii = (calculate_maxii (g) * SMS_MAX_II_FACTOR) / 100;
1177
1178      if (stats_file)
1179	fprintf (stats_file, "SMS iis %d %d %d (rec_mii, mii, maxii)\n",
1180		 rec_mii, mii, maxii);
1181
1182      /* After sms_order_nodes and before sms_schedule_by_order, to copy over
1183	 ASAP.  */
1184      set_node_sched_params (g);
1185
1186      ps = sms_schedule_by_order (g, mii, maxii, node_order, dump_file);
1187
1188      if (ps)
1189	stage_count = PS_STAGE_COUNT (ps);
1190
1191      /* Stage count of 1 means that there is no interleaving between
1192         iterations, let the scheduling passes do the job.  */
1193      if (stage_count < 1
1194	  || (count_init && (loop_count <= stage_count))
1195	  || (flag_branch_probabilities && (trip_count <= stage_count)))
1196	{
1197	  if (dump_file)
1198	    {
1199	      fprintf (dump_file, "SMS failed... \n");
1200	      fprintf (dump_file, "SMS sched-failed (stage-count=%d, loop-count=", stage_count);
1201	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, loop_count);
1202	      fprintf (dump_file, ", trip-count=");
1203	      fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, trip_count);
1204	      fprintf (dump_file, ")\n");
1205	    }
1206	  continue;
1207	}
1208      else
1209	{
1210	  int orig_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1211	  int new_cycles;
1212	  struct undo_replace_buff_elem *reg_move_replaces;
1213
1214	  if (stats_file)
1215	    {
1216	      fprintf (stats_file,
1217		       "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
1218		       stage_count);
1219	      print_partial_schedule (ps, stats_file);
1220	      fprintf (stats_file,
1221		       "SMS Branch (%d) will later be scheduled at cycle %d.\n",
1222		       g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
1223	    }
1224
1225	  /* Set the stage boundaries.  If the DDG is built with closing_branch_deps,
1226	     the closing_branch was scheduled and should appear in the last (ii-1)
1227	     row.  Otherwise, we are free to schedule the branch, and we let nodes
1228	     that were scheduled at the first PS_MIN_CYCLE cycle appear in the first
1229	     row; this should reduce stage_count to minimum.  */
1230	  normalize_sched_times (ps);
1231	  rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
1232	  set_columns_for_ps (ps);
1233
1234	  /* Generate the kernel just to be able to measure its cycles.  */
1235	  permute_partial_schedule (ps, g->closing_branch->first_note);
1236	  reg_move_replaces = generate_reg_moves (ps);
1237
1238	  /* Get the number of cycles the new kernel expect to execute in.  */
1239	  new_cycles = kernel_number_of_cycles (BB_HEAD (g->bb), BB_END (g->bb));
1240
1241	  /* Get back to the original loop so we can do loop versioning.  */
1242	  undo_permute_partial_schedule (ps, g->closing_branch->first_note);
1243	  if (reg_move_replaces)
1244	    undo_generate_reg_moves (ps, reg_move_replaces);
1245
1246	  if ( new_cycles >= orig_cycles)
1247	    {
1248	      /* SMS is not profitable so undo the permutation and reg move generation
1249	         and return the kernel to its original state.  */
1250	      if (dump_file)
1251		fprintf (dump_file, "Undoing SMS because it is not profitable.\n");
1252
1253	    }
1254	  else
1255	    {
1256	      canon_loop (loop);
1257
1258              /* case the BCT count is not known , Do loop-versioning */
1259	      if (count_reg && ! count_init)
1260		{
1261		  rtx comp_rtx = gen_rtx_fmt_ee (GT, VOIDmode, count_reg,
1262						 GEN_INT(stage_count));
1263
1264		  nloop = loop_version (loops, loop, comp_rtx, &condition_bb,
1265					true);
1266		}
1267
1268	      /* Set new iteration count of loop kernel.  */
1269              if (count_reg && count_init)
1270		SET_SRC (single_set (count_init)) = GEN_INT (loop_count
1271							     - stage_count + 1);
1272
1273	      /* Now apply the scheduled kernel to the RTL of the loop.  */
1274	      permute_partial_schedule (ps, g->closing_branch->first_note);
1275
1276              /* Mark this loop as software pipelined so the later
1277	      scheduling passes doesn't touch it.  */
1278	      if (! flag_resched_modulo_sched)
1279		g->bb->flags |= BB_DISABLE_SCHEDULE;
1280	      /* The life-info is not valid any more.  */
1281	      g->bb->flags |= BB_DIRTY;
1282
1283	      reg_move_replaces = generate_reg_moves (ps);
1284	      if (dump_file)
1285		print_node_sched_params (dump_file, g->num_nodes);
1286	      /* Generate prolog and epilog.  */
1287	      if (count_reg && !count_init)
1288		generate_prolog_epilog (ps, loop, count_reg);
1289	      else
1290	 	generate_prolog_epilog (ps, loop, NULL_RTX);
1291	    }
1292	  free_undo_replace_buff (reg_move_replaces);
1293	}
1294
1295      free_partial_schedule (ps);
1296      free (node_sched_params);
1297      free (node_order);
1298      free_ddg (g);
1299    }
1300
1301  /* Release scheduler data, needed until now because of DFA.  */
1302  sched_finish ();
1303  loop_optimizer_finalize (loops, dump_file);
1304}
1305
1306/* The SMS scheduling algorithm itself
1307   -----------------------------------
1308   Input: 'O' an ordered list of insns of a loop.
1309   Output: A scheduling of the loop - kernel, prolog, and epilogue.
1310
1311   'Q' is the empty Set
1312   'PS' is the partial schedule; it holds the currently scheduled nodes with
1313	their cycle/slot.
1314   'PSP' previously scheduled predecessors.
1315   'PSS' previously scheduled successors.
1316   't(u)' the cycle where u is scheduled.
1317   'l(u)' is the latency of u.
1318   'd(v,u)' is the dependence distance from v to u.
1319   'ASAP(u)' the earliest time at which u could be scheduled as computed in
1320	     the node ordering phase.
1321   'check_hardware_resources_conflicts(u, PS, c)'
1322			     run a trace around cycle/slot through DFA model
1323			     to check resource conflicts involving instruction u
1324			     at cycle c given the partial schedule PS.
1325   'add_to_partial_schedule_at_time(u, PS, c)'
1326			     Add the node/instruction u to the partial schedule
1327			     PS at time c.
1328   'calculate_register_pressure(PS)'
1329			     Given a schedule of instructions, calculate the register
1330			     pressure it implies.  One implementation could be the
1331			     maximum number of overlapping live ranges.
1332   'maxRP' The maximum allowed register pressure, it is usually derived from the number
1333	   registers available in the hardware.
1334
1335   1. II = MII.
1336   2. PS = empty list
1337   3. for each node u in O in pre-computed order
1338   4.   if (PSP(u) != Q && PSS(u) == Q) then
1339   5.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1340   6.     start = Early_start; end = Early_start + II - 1; step = 1
1341   11.  else if (PSP(u) == Q && PSS(u) != Q) then
1342   12.      Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1343   13.     start = Late_start; end = Late_start - II + 1; step = -1
1344   14.  else if (PSP(u) != Q && PSS(u) != Q) then
1345   15.     Early_start(u) = max ( t(v) + l(v) - d(v,u)*II ) over all every v in PSP(u).
1346   16.     Late_start(u) = min ( t(v) - l(v) + d(v,u)*II ) over all every v in PSS(u).
1347   17.     start = Early_start;
1348   18.     end = min(Early_start + II - 1 , Late_start);
1349   19.     step = 1
1350   20.     else "if (PSP(u) == Q && PSS(u) == Q)"
1351   21.	  start = ASAP(u); end = start + II - 1; step = 1
1352   22.  endif
1353
1354   23.  success = false
1355   24.  for (c = start ; c != end ; c += step)
1356   25.     if check_hardware_resources_conflicts(u, PS, c) then
1357   26.       add_to_partial_schedule_at_time(u, PS, c)
1358   27.       success = true
1359   28.       break
1360   29.     endif
1361   30.  endfor
1362   31.  if (success == false) then
1363   32.    II = II + 1
1364   33.    if (II > maxII) then
1365   34.       finish - failed to schedule
1366   35.	 endif
1367   36.    goto 2.
1368   37.  endif
1369   38. endfor
1370   39. if (calculate_register_pressure(PS) > maxRP) then
1371   40.    goto 32.
1372   41. endif
1373   42. compute epilogue & prologue
1374   43. finish - succeeded to schedule
1375*/
1376
1377/* A limit on the number of cycles that resource conflicts can span.  ??? Should
1378   be provided by DFA, and be dependent on the type of insn scheduled.  Currently
1379   set to 0 to save compile time.  */
1380#define DFA_HISTORY SMS_DFA_HISTORY
1381
1382/* Given the partial schedule PS, this function calculates and returns the
1383   cycles in which we can schedule the node with the given index I.
1384   NOTE: Here we do the backtracking in SMS, in some special cases. We have
1385   noticed that there are several cases in which we fail    to SMS the loop
1386   because the sched window of a node is empty    due to tight data-deps. In
1387   such cases we want to unschedule    some of the predecessors/successors
1388   until we get non-empty    scheduling window.  It returns -1 if the
1389   scheduling window is empty and zero otherwise.  */
1390
1391static int
1392get_sched_window (partial_schedule_ptr ps, int *nodes_order, int i,
1393		  sbitmap sched_nodes, int ii, int *start_p, int *step_p, int *end_p)
1394{
1395  int start, step, end;
1396  ddg_edge_ptr e;
1397  int u = nodes_order [i];
1398  ddg_node_ptr u_node = &ps->g->nodes[u];
1399  sbitmap psp = sbitmap_alloc (ps->g->num_nodes);
1400  sbitmap pss = sbitmap_alloc (ps->g->num_nodes);
1401  sbitmap u_node_preds = NODE_PREDECESSORS (u_node);
1402  sbitmap u_node_succs = NODE_SUCCESSORS (u_node);
1403  int psp_not_empty;
1404  int pss_not_empty;
1405
1406  /* 1. compute sched window for u (start, end, step).  */
1407  sbitmap_zero (psp);
1408  sbitmap_zero (pss);
1409  psp_not_empty = sbitmap_a_and_b_cg (psp, u_node_preds, sched_nodes);
1410  pss_not_empty = sbitmap_a_and_b_cg (pss, u_node_succs, sched_nodes);
1411
1412  if (psp_not_empty && !pss_not_empty)
1413    {
1414      int early_start = INT_MIN;
1415
1416      end = INT_MAX;
1417      for (e = u_node->in; e != 0; e = e->next_in)
1418	{
1419	  ddg_node_ptr v_node = e->src;
1420	  if (TEST_BIT (sched_nodes, v_node->cuid))
1421	    {
1422	      int node_st = SCHED_TIME (v_node)
1423	      		    + e->latency - (e->distance * ii);
1424
1425	      early_start = MAX (early_start, node_st);
1426
1427	      if (e->data_type == MEM_DEP)
1428		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1429	    }
1430	}
1431      start = early_start;
1432      end = MIN (end, early_start + ii);
1433      step = 1;
1434    }
1435
1436  else if (!psp_not_empty && pss_not_empty)
1437    {
1438      int late_start = INT_MAX;
1439
1440      end = INT_MIN;
1441      for (e = u_node->out; e != 0; e = e->next_out)
1442	{
1443	  ddg_node_ptr v_node = e->dest;
1444	  if (TEST_BIT (sched_nodes, v_node->cuid))
1445	    {
1446	      late_start = MIN (late_start,
1447				SCHED_TIME (v_node) - e->latency
1448				+ (e->distance * ii));
1449	      if (e->data_type == MEM_DEP)
1450		end = MAX (end, SCHED_TIME (v_node) - ii + 1);
1451	    }
1452	}
1453      start = late_start;
1454      end = MAX (end, late_start - ii);
1455      step = -1;
1456    }
1457
1458  else if (psp_not_empty && pss_not_empty)
1459    {
1460      int early_start = INT_MIN;
1461      int late_start = INT_MAX;
1462
1463      start = INT_MIN;
1464      end = INT_MAX;
1465      for (e = u_node->in; e != 0; e = e->next_in)
1466	{
1467	  ddg_node_ptr v_node = e->src;
1468
1469	  if (TEST_BIT (sched_nodes, v_node->cuid))
1470	    {
1471	      early_start = MAX (early_start,
1472				 SCHED_TIME (v_node) + e->latency
1473				 - (e->distance * ii));
1474	      if (e->data_type == MEM_DEP)
1475		end = MIN (end, SCHED_TIME (v_node) + ii - 1);
1476	    }
1477	}
1478      for (e = u_node->out; e != 0; e = e->next_out)
1479	{
1480	  ddg_node_ptr v_node = e->dest;
1481
1482	  if (TEST_BIT (sched_nodes, v_node->cuid))
1483	    {
1484	      late_start = MIN (late_start,
1485				SCHED_TIME (v_node) - e->latency
1486				+ (e->distance * ii));
1487	      if (e->data_type == MEM_DEP)
1488		start = MAX (start, SCHED_TIME (v_node) - ii + 1);
1489	    }
1490	}
1491      start = MAX (start, early_start);
1492      end = MIN (end, MIN (early_start + ii, late_start + 1));
1493      step = 1;
1494    }
1495  else /* psp is empty && pss is empty.  */
1496    {
1497      start = SCHED_ASAP (u_node);
1498      end = start + ii;
1499      step = 1;
1500    }
1501
1502  *start_p = start;
1503  *step_p = step;
1504  *end_p = end;
1505  sbitmap_free (psp);
1506  sbitmap_free (pss);
1507
1508  if ((start >= end && step == 1) || (start <= end && step == -1))
1509    return -1;
1510  else
1511    return 0;
1512}
1513
1514/* This function implements the scheduling algorithm for SMS according to the
1515   above algorithm.  */
1516static partial_schedule_ptr
1517sms_schedule_by_order (ddg_ptr g, int mii, int maxii, int *nodes_order, FILE *dump_file)
1518{
1519  int ii = mii;
1520  int i, c, success;
1521  int try_again_with_larger_ii = true;
1522  int num_nodes = g->num_nodes;
1523  ddg_edge_ptr e;
1524  int start, end, step; /* Place together into one struct?  */
1525  sbitmap sched_nodes = sbitmap_alloc (num_nodes);
1526  sbitmap must_precede = sbitmap_alloc (num_nodes);
1527  sbitmap must_follow = sbitmap_alloc (num_nodes);
1528  sbitmap tobe_scheduled = sbitmap_alloc (num_nodes);
1529
1530  partial_schedule_ptr ps = create_partial_schedule (ii, g, DFA_HISTORY);
1531
1532  sbitmap_ones (tobe_scheduled);
1533  sbitmap_zero (sched_nodes);
1534
1535  while ((! sbitmap_equal (tobe_scheduled, sched_nodes)
1536	 || try_again_with_larger_ii ) && ii < maxii)
1537    {
1538      int j;
1539      bool unscheduled_nodes = false;
1540
1541      if (dump_file)
1542	fprintf(dump_file, "Starting with ii=%d\n", ii);
1543      if (try_again_with_larger_ii)
1544	{
1545	  try_again_with_larger_ii = false;
1546	  sbitmap_zero (sched_nodes);
1547	}
1548
1549      for (i = 0; i < num_nodes; i++)
1550	{
1551	  int u = nodes_order[i];
1552  	  ddg_node_ptr u_node = &ps->g->nodes[u];
1553	  rtx insn = u_node->insn;
1554
1555	  if (!INSN_P (insn))
1556	    {
1557	      RESET_BIT (tobe_scheduled, u);
1558	      continue;
1559	    }
1560
1561	  if (JUMP_P (insn)) /* Closing branch handled later.  */
1562	    {
1563	      RESET_BIT (tobe_scheduled, u);
1564	      continue;
1565	    }
1566
1567	  if (TEST_BIT (sched_nodes, u))
1568	    continue;
1569
1570	  /* Try to get non-empty scheduling window.  */
1571	  j = i;
1572	  while (get_sched_window (ps, nodes_order, i, sched_nodes, ii, &start, &step, &end) < 0
1573		 && j > 0)
1574	    {
1575	      unscheduled_nodes = true;
1576	      if (TEST_BIT (NODE_PREDECESSORS (u_node), nodes_order[j - 1])
1577		  || TEST_BIT (NODE_SUCCESSORS (u_node), nodes_order[j - 1]))
1578		{
1579		  ps_unschedule_node (ps, &ps->g->nodes[nodes_order[j - 1]]);
1580		  RESET_BIT (sched_nodes, nodes_order [j - 1]);
1581		}
1582	      j--;
1583	    }
1584	  if (j < 0)
1585	    {
1586	      /* ??? Try backtracking instead of immediately ii++?  */
1587	      ii++;
1588	      try_again_with_larger_ii = true;
1589	      reset_partial_schedule (ps, ii);
1590	      break;
1591	    }
1592	  /* 2. Try scheduling u in window.  */
1593	  if (dump_file)
1594	    fprintf(dump_file, "Trying to schedule node %d in (%d .. %d) step %d\n",
1595		    u, start, end, step);
1596
1597          /* use must_follow & must_precede bitmaps to determine order
1598	     of nodes within the cycle.  */
1599          sbitmap_zero (must_precede);
1600          sbitmap_zero (must_follow);
1601      	  for (e = u_node->in; e != 0; e = e->next_in)
1602            if (TEST_BIT (sched_nodes, e->src->cuid)
1603	        && e->latency == (ii * e->distance)
1604		&& start == SCHED_TIME (e->src))
1605             SET_BIT (must_precede, e->src->cuid);
1606
1607	  for (e = u_node->out; e != 0; e = e->next_out)
1608            if (TEST_BIT (sched_nodes, e->dest->cuid)
1609	        && e->latency == (ii * e->distance)
1610		&& end == SCHED_TIME (e->dest))
1611             SET_BIT (must_follow, e->dest->cuid);
1612
1613	  success = 0;
1614	  if ((step > 0 && start < end) ||  (step < 0 && start > end))
1615	    for (c = start; c != end; c += step)
1616	      {
1617		ps_insn_ptr psi;
1618
1619		psi = ps_add_node_check_conflicts (ps, u_node, c,
1620						   must_precede,
1621						   must_follow);
1622
1623  		if (psi)
1624		  {
1625		    SCHED_TIME (u_node) = c;
1626		    SET_BIT (sched_nodes, u);
1627		    success = 1;
1628		    if (dump_file)
1629		      fprintf(dump_file, "Schedule in %d\n", c);
1630		    break;
1631		  }
1632	      }
1633	  if (!success)
1634	    {
1635	      /* ??? Try backtracking instead of immediately ii++?  */
1636	      ii++;
1637	      try_again_with_larger_ii = true;
1638	      reset_partial_schedule (ps, ii);
1639	      break;
1640	    }
1641	  if (unscheduled_nodes)
1642	    break;
1643
1644	  /* ??? If (success), check register pressure estimates.  */
1645	} /* Continue with next node.  */
1646    } /* While try_again_with_larger_ii.  */
1647
1648  sbitmap_free (sched_nodes);
1649
1650  if (ii >= maxii)
1651    {
1652      free_partial_schedule (ps);
1653      ps = NULL;
1654    }
1655  return ps;
1656}
1657
1658
1659/* This page implements the algorithm for ordering the nodes of a DDG
1660   for modulo scheduling, activated through the
1661   "int sms_order_nodes (ddg_ptr, int mii, int * result)" API.  */
1662
1663#define ORDER_PARAMS(x) ((struct node_order_params *) (x)->aux.info)
1664#define ASAP(x) (ORDER_PARAMS ((x))->asap)
1665#define ALAP(x) (ORDER_PARAMS ((x))->alap)
1666#define HEIGHT(x) (ORDER_PARAMS ((x))->height)
1667#define MOB(x) (ALAP ((x)) - ASAP ((x)))
1668#define DEPTH(x) (ASAP ((x)))
1669
1670typedef struct node_order_params * nopa;
1671
1672static void order_nodes_of_sccs (ddg_all_sccs_ptr, int * result);
1673static int order_nodes_in_scc (ddg_ptr, sbitmap, sbitmap, int*, int);
1674static nopa  calculate_order_params (ddg_ptr, int mii);
1675static int find_max_asap (ddg_ptr, sbitmap);
1676static int find_max_hv_min_mob (ddg_ptr, sbitmap);
1677static int find_max_dv_min_mob (ddg_ptr, sbitmap);
1678
1679enum sms_direction {BOTTOMUP, TOPDOWN};
1680
1681struct node_order_params
1682{
1683  int asap;
1684  int alap;
1685  int height;
1686};
1687
1688/* Check if NODE_ORDER contains a permutation of 0 .. NUM_NODES-1.  */
1689static void
1690check_nodes_order (int *node_order, int num_nodes)
1691{
1692  int i;
1693  sbitmap tmp = sbitmap_alloc (num_nodes);
1694
1695  sbitmap_zero (tmp);
1696
1697  for (i = 0; i < num_nodes; i++)
1698    {
1699      int u = node_order[i];
1700
1701      gcc_assert (u < num_nodes && u >= 0 && !TEST_BIT (tmp, u));
1702
1703      SET_BIT (tmp, u);
1704    }
1705
1706  sbitmap_free (tmp);
1707}
1708
1709/* Order the nodes of G for scheduling and pass the result in
1710   NODE_ORDER.  Also set aux.count of each node to ASAP.
1711   Return the recMII for the given DDG.  */
1712static int
1713sms_order_nodes (ddg_ptr g, int mii, int * node_order)
1714{
1715  int i;
1716  int rec_mii = 0;
1717  ddg_all_sccs_ptr sccs = create_ddg_all_sccs (g);
1718
1719  nopa nops = calculate_order_params (g, mii);
1720
1721  order_nodes_of_sccs (sccs, node_order);
1722
1723  if (sccs->num_sccs > 0)
1724    /* First SCC has the largest recurrence_length.  */
1725    rec_mii = sccs->sccs[0]->recurrence_length;
1726
1727  /* Save ASAP before destroying node_order_params.  */
1728  for (i = 0; i < g->num_nodes; i++)
1729    {
1730      ddg_node_ptr v = &g->nodes[i];
1731      v->aux.count = ASAP (v);
1732    }
1733
1734  free (nops);
1735  free_ddg_all_sccs (sccs);
1736  check_nodes_order (node_order, g->num_nodes);
1737
1738  return rec_mii;
1739}
1740
1741static void
1742order_nodes_of_sccs (ddg_all_sccs_ptr all_sccs, int * node_order)
1743{
1744  int i, pos = 0;
1745  ddg_ptr g = all_sccs->ddg;
1746  int num_nodes = g->num_nodes;
1747  sbitmap prev_sccs = sbitmap_alloc (num_nodes);
1748  sbitmap on_path = sbitmap_alloc (num_nodes);
1749  sbitmap tmp = sbitmap_alloc (num_nodes);
1750  sbitmap ones = sbitmap_alloc (num_nodes);
1751
1752  sbitmap_zero (prev_sccs);
1753  sbitmap_ones (ones);
1754
1755  /* Perfrom the node ordering starting from the SCC with the highest recMII.
1756     For each SCC order the nodes according to their ASAP/ALAP/HEIGHT etc.  */
1757  for (i = 0; i < all_sccs->num_sccs; i++)
1758    {
1759      ddg_scc_ptr scc = all_sccs->sccs[i];
1760
1761      /* Add nodes on paths from previous SCCs to the current SCC.  */
1762      find_nodes_on_paths (on_path, g, prev_sccs, scc->nodes);
1763      sbitmap_a_or_b (tmp, scc->nodes, on_path);
1764
1765      /* Add nodes on paths from the current SCC to previous SCCs.  */
1766      find_nodes_on_paths (on_path, g, scc->nodes, prev_sccs);
1767      sbitmap_a_or_b (tmp, tmp, on_path);
1768
1769      /* Remove nodes of previous SCCs from current extended SCC.  */
1770      sbitmap_difference (tmp, tmp, prev_sccs);
1771
1772      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1773      /* Above call to order_nodes_in_scc updated prev_sccs |= tmp.  */
1774    }
1775
1776  /* Handle the remaining nodes that do not belong to any scc.  Each call
1777     to order_nodes_in_scc handles a single connected component.  */
1778  while (pos < g->num_nodes)
1779    {
1780      sbitmap_difference (tmp, ones, prev_sccs);
1781      pos = order_nodes_in_scc (g, prev_sccs, tmp, node_order, pos);
1782    }
1783  sbitmap_free (prev_sccs);
1784  sbitmap_free (on_path);
1785  sbitmap_free (tmp);
1786  sbitmap_free (ones);
1787}
1788
1789/* MII is needed if we consider backarcs (that do not close recursive cycles).  */
1790static struct node_order_params *
1791calculate_order_params (ddg_ptr g, int mii ATTRIBUTE_UNUSED)
1792{
1793  int u;
1794  int max_asap;
1795  int num_nodes = g->num_nodes;
1796  ddg_edge_ptr e;
1797  /* Allocate a place to hold ordering params for each node in the DDG.  */
1798  nopa node_order_params_arr;
1799
1800  /* Initialize of ASAP/ALAP/HEIGHT to zero.  */
1801  node_order_params_arr = (nopa) xcalloc (num_nodes,
1802					  sizeof (struct node_order_params));
1803
1804  /* Set the aux pointer of each node to point to its order_params structure.  */
1805  for (u = 0; u < num_nodes; u++)
1806    g->nodes[u].aux.info = &node_order_params_arr[u];
1807
1808  /* Disregarding a backarc from each recursive cycle to obtain a DAG,
1809     calculate ASAP, ALAP, mobility, distance, and height for each node
1810     in the dependence (direct acyclic) graph.  */
1811
1812  /* We assume that the nodes in the array are in topological order.  */
1813
1814  max_asap = 0;
1815  for (u = 0; u < num_nodes; u++)
1816    {
1817      ddg_node_ptr u_node = &g->nodes[u];
1818
1819      ASAP (u_node) = 0;
1820      for (e = u_node->in; e; e = e->next_in)
1821	if (e->distance == 0)
1822	  ASAP (u_node) = MAX (ASAP (u_node),
1823			       ASAP (e->src) + e->latency);
1824      max_asap = MAX (max_asap, ASAP (u_node));
1825    }
1826
1827  for (u = num_nodes - 1; u > -1; u--)
1828    {
1829      ddg_node_ptr u_node = &g->nodes[u];
1830
1831      ALAP (u_node) = max_asap;
1832      HEIGHT (u_node) = 0;
1833      for (e = u_node->out; e; e = e->next_out)
1834	if (e->distance == 0)
1835	  {
1836	    ALAP (u_node) = MIN (ALAP (u_node),
1837				 ALAP (e->dest) - e->latency);
1838	    HEIGHT (u_node) = MAX (HEIGHT (u_node),
1839				   HEIGHT (e->dest) + e->latency);
1840	  }
1841    }
1842
1843  return node_order_params_arr;
1844}
1845
1846static int
1847find_max_asap (ddg_ptr g, sbitmap nodes)
1848{
1849  unsigned int u = 0;
1850  int max_asap = -1;
1851  int result = -1;
1852  sbitmap_iterator sbi;
1853
1854  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1855    {
1856      ddg_node_ptr u_node = &g->nodes[u];
1857
1858      if (max_asap < ASAP (u_node))
1859	{
1860	  max_asap = ASAP (u_node);
1861	  result = u;
1862	}
1863    }
1864  return result;
1865}
1866
1867static int
1868find_max_hv_min_mob (ddg_ptr g, sbitmap nodes)
1869{
1870  unsigned int u = 0;
1871  int max_hv = -1;
1872  int min_mob = INT_MAX;
1873  int result = -1;
1874  sbitmap_iterator sbi;
1875
1876  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1877    {
1878      ddg_node_ptr u_node = &g->nodes[u];
1879
1880      if (max_hv < HEIGHT (u_node))
1881	{
1882	  max_hv = HEIGHT (u_node);
1883	  min_mob = MOB (u_node);
1884	  result = u;
1885	}
1886      else if ((max_hv == HEIGHT (u_node))
1887	       && (min_mob > MOB (u_node)))
1888	{
1889	  min_mob = MOB (u_node);
1890	  result = u;
1891	}
1892    }
1893  return result;
1894}
1895
1896static int
1897find_max_dv_min_mob (ddg_ptr g, sbitmap nodes)
1898{
1899  unsigned int u = 0;
1900  int max_dv = -1;
1901  int min_mob = INT_MAX;
1902  int result = -1;
1903  sbitmap_iterator sbi;
1904
1905  EXECUTE_IF_SET_IN_SBITMAP (nodes, 0, u, sbi)
1906    {
1907      ddg_node_ptr u_node = &g->nodes[u];
1908
1909      if (max_dv < DEPTH (u_node))
1910	{
1911	  max_dv = DEPTH (u_node);
1912	  min_mob = MOB (u_node);
1913	  result = u;
1914	}
1915      else if ((max_dv == DEPTH (u_node))
1916	       && (min_mob > MOB (u_node)))
1917	{
1918	  min_mob = MOB (u_node);
1919	  result = u;
1920	}
1921    }
1922  return result;
1923}
1924
1925/* Places the nodes of SCC into the NODE_ORDER array starting
1926   at position POS, according to the SMS ordering algorithm.
1927   NODES_ORDERED (in&out parameter) holds the bitset of all nodes in
1928   the NODE_ORDER array, starting from position zero.  */
1929static int
1930order_nodes_in_scc (ddg_ptr g, sbitmap nodes_ordered, sbitmap scc,
1931		    int * node_order, int pos)
1932{
1933  enum sms_direction dir;
1934  int num_nodes = g->num_nodes;
1935  sbitmap workset = sbitmap_alloc (num_nodes);
1936  sbitmap tmp = sbitmap_alloc (num_nodes);
1937  sbitmap zero_bitmap = sbitmap_alloc (num_nodes);
1938  sbitmap predecessors = sbitmap_alloc (num_nodes);
1939  sbitmap successors = sbitmap_alloc (num_nodes);
1940
1941  sbitmap_zero (predecessors);
1942  find_predecessors (predecessors, g, nodes_ordered);
1943
1944  sbitmap_zero (successors);
1945  find_successors (successors, g, nodes_ordered);
1946
1947  sbitmap_zero (tmp);
1948  if (sbitmap_a_and_b_cg (tmp, predecessors, scc))
1949    {
1950      sbitmap_copy (workset, tmp);
1951      dir = BOTTOMUP;
1952    }
1953  else if (sbitmap_a_and_b_cg (tmp, successors, scc))
1954    {
1955      sbitmap_copy (workset, tmp);
1956      dir = TOPDOWN;
1957    }
1958  else
1959    {
1960      int u;
1961
1962      sbitmap_zero (workset);
1963      if ((u = find_max_asap (g, scc)) >= 0)
1964	SET_BIT (workset, u);
1965      dir = BOTTOMUP;
1966    }
1967
1968  sbitmap_zero (zero_bitmap);
1969  while (!sbitmap_equal (workset, zero_bitmap))
1970    {
1971      int v;
1972      ddg_node_ptr v_node;
1973      sbitmap v_node_preds;
1974      sbitmap v_node_succs;
1975
1976      if (dir == TOPDOWN)
1977	{
1978	  while (!sbitmap_equal (workset, zero_bitmap))
1979	    {
1980	      v = find_max_hv_min_mob (g, workset);
1981	      v_node = &g->nodes[v];
1982	      node_order[pos++] = v;
1983	      v_node_succs = NODE_SUCCESSORS (v_node);
1984	      sbitmap_a_and_b (tmp, v_node_succs, scc);
1985
1986	      /* Don't consider the already ordered successors again.  */
1987	      sbitmap_difference (tmp, tmp, nodes_ordered);
1988	      sbitmap_a_or_b (workset, workset, tmp);
1989	      RESET_BIT (workset, v);
1990	      SET_BIT (nodes_ordered, v);
1991	    }
1992	  dir = BOTTOMUP;
1993	  sbitmap_zero (predecessors);
1994	  find_predecessors (predecessors, g, nodes_ordered);
1995	  sbitmap_a_and_b (workset, predecessors, scc);
1996	}
1997      else
1998	{
1999	  while (!sbitmap_equal (workset, zero_bitmap))
2000	    {
2001	      v = find_max_dv_min_mob (g, workset);
2002	      v_node = &g->nodes[v];
2003	      node_order[pos++] = v;
2004	      v_node_preds = NODE_PREDECESSORS (v_node);
2005	      sbitmap_a_and_b (tmp, v_node_preds, scc);
2006
2007	      /* Don't consider the already ordered predecessors again.  */
2008	      sbitmap_difference (tmp, tmp, nodes_ordered);
2009	      sbitmap_a_or_b (workset, workset, tmp);
2010	      RESET_BIT (workset, v);
2011	      SET_BIT (nodes_ordered, v);
2012	    }
2013	  dir = TOPDOWN;
2014	  sbitmap_zero (successors);
2015	  find_successors (successors, g, nodes_ordered);
2016	  sbitmap_a_and_b (workset, successors, scc);
2017	}
2018    }
2019  sbitmap_free (tmp);
2020  sbitmap_free (workset);
2021  sbitmap_free (zero_bitmap);
2022  sbitmap_free (predecessors);
2023  sbitmap_free (successors);
2024  return pos;
2025}
2026
2027
2028/* This page contains functions for manipulating partial-schedules during
2029   modulo scheduling.  */
2030
2031/* Create a partial schedule and allocate a memory to hold II rows.  */
2032partial_schedule_ptr
2033create_partial_schedule (int ii, ddg_ptr g, int history)
2034{
2035  partial_schedule_ptr ps = (partial_schedule_ptr)
2036			     xmalloc (sizeof (struct partial_schedule));
2037  ps->rows = (ps_insn_ptr *) xcalloc (ii, sizeof (ps_insn_ptr));
2038  ps->ii = ii;
2039  ps->history = history;
2040  ps->min_cycle = INT_MAX;
2041  ps->max_cycle = INT_MIN;
2042  ps->g = g;
2043
2044  return ps;
2045}
2046
2047/* Free the PS_INSNs in rows array of the given partial schedule.
2048   ??? Consider caching the PS_INSN's.  */
2049static void
2050free_ps_insns (partial_schedule_ptr ps)
2051{
2052  int i;
2053
2054  for (i = 0; i < ps->ii; i++)
2055    {
2056      while (ps->rows[i])
2057	{
2058	  ps_insn_ptr ps_insn = ps->rows[i]->next_in_row;
2059
2060	  free (ps->rows[i]);
2061	  ps->rows[i] = ps_insn;
2062	}
2063      ps->rows[i] = NULL;
2064    }
2065}
2066
2067/* Free all the memory allocated to the partial schedule.  */
2068void
2069free_partial_schedule (partial_schedule_ptr ps)
2070{
2071  if (!ps)
2072    return;
2073  free_ps_insns (ps);
2074  free (ps->rows);
2075  free (ps);
2076}
2077
2078/* Clear the rows array with its PS_INSNs, and create a new one with
2079   NEW_II rows.  */
2080void
2081reset_partial_schedule (partial_schedule_ptr ps, int new_ii)
2082{
2083  if (!ps)
2084    return;
2085  free_ps_insns (ps);
2086  if (new_ii == ps->ii)
2087    return;
2088  ps->rows = (ps_insn_ptr *) xrealloc (ps->rows, new_ii
2089						 * sizeof (ps_insn_ptr));
2090  memset (ps->rows, 0, new_ii * sizeof (ps_insn_ptr));
2091  ps->ii = new_ii;
2092  ps->min_cycle = INT_MAX;
2093  ps->max_cycle = INT_MIN;
2094}
2095
2096/* Prints the partial schedule as an ii rows array, for each rows
2097   print the ids of the insns in it.  */
2098void
2099print_partial_schedule (partial_schedule_ptr ps, FILE *dump)
2100{
2101  int i;
2102
2103  for (i = 0; i < ps->ii; i++)
2104    {
2105      ps_insn_ptr ps_i = ps->rows[i];
2106
2107      fprintf (dump, "\n[CYCLE %d ]: ", i);
2108      while (ps_i)
2109	{
2110	  fprintf (dump, "%d, ",
2111		   INSN_UID (ps_i->node->insn));
2112	  ps_i = ps_i->next_in_row;
2113	}
2114    }
2115}
2116
2117/* Creates an object of PS_INSN and initializes it to the given parameters.  */
2118static ps_insn_ptr
2119create_ps_insn (ddg_node_ptr node, int rest_count, int cycle)
2120{
2121  ps_insn_ptr ps_i = xmalloc (sizeof (struct ps_insn));
2122
2123  ps_i->node = node;
2124  ps_i->next_in_row = NULL;
2125  ps_i->prev_in_row = NULL;
2126  ps_i->row_rest_count = rest_count;
2127  ps_i->cycle = cycle;
2128
2129  return ps_i;
2130}
2131
2132
2133/* Removes the given PS_INSN from the partial schedule.  Returns false if the
2134   node is not found in the partial schedule, else returns true.  */
2135static bool
2136remove_node_from_ps (partial_schedule_ptr ps, ps_insn_ptr ps_i)
2137{
2138  int row;
2139
2140  if (!ps || !ps_i)
2141    return false;
2142
2143  row = SMODULO (ps_i->cycle, ps->ii);
2144  if (! ps_i->prev_in_row)
2145    {
2146      if (ps_i != ps->rows[row])
2147	return false;
2148
2149      ps->rows[row] = ps_i->next_in_row;
2150      if (ps->rows[row])
2151	ps->rows[row]->prev_in_row = NULL;
2152    }
2153  else
2154    {
2155      ps_i->prev_in_row->next_in_row = ps_i->next_in_row;
2156      if (ps_i->next_in_row)
2157	ps_i->next_in_row->prev_in_row = ps_i->prev_in_row;
2158    }
2159  free (ps_i);
2160  return true;
2161}
2162
2163/* Unlike what literature describes for modulo scheduling (which focuses
2164   on VLIW machines) the order of the instructions inside a cycle is
2165   important.  Given the bitmaps MUST_FOLLOW and MUST_PRECEDE we know
2166   where the current instruction should go relative to the already
2167   scheduled instructions in the given cycle.  Go over these
2168   instructions and find the first possible column to put it in.  */
2169static bool
2170ps_insn_find_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2171		     sbitmap must_precede, sbitmap must_follow)
2172{
2173  ps_insn_ptr next_ps_i;
2174  ps_insn_ptr first_must_follow = NULL;
2175  ps_insn_ptr last_must_precede = NULL;
2176  int row;
2177
2178  if (! ps_i)
2179    return false;
2180
2181  row = SMODULO (ps_i->cycle, ps->ii);
2182
2183  /* Find the first must follow and the last must precede
2184     and insert the node immediately after the must precede
2185     but make sure that it there is no must follow after it.  */
2186  for (next_ps_i = ps->rows[row];
2187       next_ps_i;
2188       next_ps_i = next_ps_i->next_in_row)
2189    {
2190      if (TEST_BIT (must_follow, next_ps_i->node->cuid)
2191	  && ! first_must_follow)
2192        first_must_follow = next_ps_i;
2193      if (TEST_BIT (must_precede, next_ps_i->node->cuid))
2194        {
2195          /* If we have already met a node that must follow, then
2196	     there is no possible column.  */
2197  	  if (first_must_follow)
2198            return false;
2199	  else
2200            last_must_precede = next_ps_i;
2201        }
2202    }
2203
2204  /* Now insert the node after INSERT_AFTER_PSI.  */
2205
2206  if (! last_must_precede)
2207    {
2208      ps_i->next_in_row = ps->rows[row];
2209      ps_i->prev_in_row = NULL;
2210      if (ps_i->next_in_row)
2211    	ps_i->next_in_row->prev_in_row = ps_i;
2212      ps->rows[row] = ps_i;
2213    }
2214  else
2215    {
2216      ps_i->next_in_row = last_must_precede->next_in_row;
2217      last_must_precede->next_in_row = ps_i;
2218      ps_i->prev_in_row = last_must_precede;
2219      if (ps_i->next_in_row)
2220        ps_i->next_in_row->prev_in_row = ps_i;
2221    }
2222
2223  return true;
2224}
2225
2226/* Advances the PS_INSN one column in its current row; returns false
2227   in failure and true in success.  Bit N is set in MUST_FOLLOW if
2228   the node with cuid N must be come after the node pointed to by
2229   PS_I when scheduled in the same cycle.  */
2230static int
2231ps_insn_advance_column (partial_schedule_ptr ps, ps_insn_ptr ps_i,
2232			sbitmap must_follow)
2233{
2234  ps_insn_ptr prev, next;
2235  int row;
2236  ddg_node_ptr next_node;
2237
2238  if (!ps || !ps_i)
2239    return false;
2240
2241  row = SMODULO (ps_i->cycle, ps->ii);
2242
2243  if (! ps_i->next_in_row)
2244    return false;
2245
2246  next_node = ps_i->next_in_row->node;
2247
2248  /* Check if next_in_row is dependent on ps_i, both having same sched
2249     times (typically ANTI_DEP).  If so, ps_i cannot skip over it.  */
2250  if (TEST_BIT (must_follow, next_node->cuid))
2251    return false;
2252
2253  /* Advance PS_I over its next_in_row in the doubly linked list.  */
2254  prev = ps_i->prev_in_row;
2255  next = ps_i->next_in_row;
2256
2257  if (ps_i == ps->rows[row])
2258    ps->rows[row] = next;
2259
2260  ps_i->next_in_row = next->next_in_row;
2261
2262  if (next->next_in_row)
2263    next->next_in_row->prev_in_row = ps_i;
2264
2265  next->next_in_row = ps_i;
2266  ps_i->prev_in_row = next;
2267
2268  next->prev_in_row = prev;
2269  if (prev)
2270    prev->next_in_row = next;
2271
2272  return true;
2273}
2274
2275/* Inserts a DDG_NODE to the given partial schedule at the given cycle.
2276   Returns 0 if this is not possible and a PS_INSN otherwise.  Bit N is
2277   set in MUST_PRECEDE/MUST_FOLLOW if the node with cuid N must be come
2278   before/after (respectively) the node pointed to by PS_I when scheduled
2279   in the same cycle.  */
2280static ps_insn_ptr
2281add_node_to_ps (partial_schedule_ptr ps, ddg_node_ptr node, int cycle,
2282		sbitmap must_precede, sbitmap must_follow)
2283{
2284  ps_insn_ptr ps_i;
2285  int rest_count = 1;
2286  int row = SMODULO (cycle, ps->ii);
2287
2288  if (ps->rows[row]
2289      && ps->rows[row]->row_rest_count >= issue_rate)
2290    return NULL;
2291
2292  if (ps->rows[row])
2293    rest_count += ps->rows[row]->row_rest_count;
2294
2295  ps_i = create_ps_insn (node, rest_count, cycle);
2296
2297  /* Finds and inserts PS_I according to MUST_FOLLOW and
2298     MUST_PRECEDE.  */
2299  if (! ps_insn_find_column (ps, ps_i, must_precede, must_follow))
2300    {
2301      free (ps_i);
2302      return NULL;
2303    }
2304
2305  return ps_i;
2306}
2307
2308/* Advance time one cycle.  Assumes DFA is being used.  */
2309static void
2310advance_one_cycle (void)
2311{
2312  if (targetm.sched.dfa_pre_cycle_insn)
2313    state_transition (curr_state,
2314		      targetm.sched.dfa_pre_cycle_insn ());
2315
2316  state_transition (curr_state, NULL);
2317
2318  if (targetm.sched.dfa_post_cycle_insn)
2319    state_transition (curr_state,
2320		      targetm.sched.dfa_post_cycle_insn ());
2321}
2322
2323/* Given the kernel of a loop (from FIRST_INSN to LAST_INSN), finds
2324   the number of cycles according to DFA that the kernel fits in,
2325   we use this to check if we done well with SMS after we add
2326   register moves.  In some cases register moves overhead makes
2327   it even worse than the original loop.  We want SMS to be performed
2328   when it gives less cycles after register moves are added.  */
2329static int
2330kernel_number_of_cycles (rtx first_insn, rtx last_insn)
2331{
2332  int cycles = 0;
2333  rtx insn;
2334  int can_issue_more = issue_rate;
2335
2336  state_reset (curr_state);
2337
2338  for (insn = first_insn;
2339       insn != NULL_RTX && insn != last_insn;
2340       insn = NEXT_INSN (insn))
2341    {
2342      if (! INSN_P (insn) || GET_CODE (PATTERN (insn)) == USE)
2343	continue;
2344
2345      /* Check if there is room for the current insn.  */
2346      if (!can_issue_more || state_dead_lock_p (curr_state))
2347	{
2348	  cycles ++;
2349	  advance_one_cycle ();
2350	  can_issue_more = issue_rate;
2351	}
2352
2353	/* Update the DFA state and return with failure if the DFA found
2354	   recource conflicts.  */
2355      if (state_transition (curr_state, insn) >= 0)
2356	{
2357	  cycles ++;
2358	  advance_one_cycle ();
2359	  can_issue_more = issue_rate;
2360	}
2361
2362      if (targetm.sched.variable_issue)
2363	can_issue_more =
2364	  targetm.sched.variable_issue (sched_dump, sched_verbose,
2365					insn, can_issue_more);
2366      /* A naked CLOBBER or USE generates no instruction, so don't
2367	 let them consume issue slots.  */
2368      else if (GET_CODE (PATTERN (insn)) != USE
2369	       && GET_CODE (PATTERN (insn)) != CLOBBER)
2370	can_issue_more--;
2371    }
2372  return cycles;
2373}
2374
2375/* Checks if PS has resource conflicts according to DFA, starting from
2376   FROM cycle to TO cycle; returns true if there are conflicts and false
2377   if there are no conflicts.  Assumes DFA is being used.  */
2378static int
2379ps_has_conflicts (partial_schedule_ptr ps, int from, int to)
2380{
2381  int cycle;
2382
2383  state_reset (curr_state);
2384
2385  for (cycle = from; cycle <= to; cycle++)
2386    {
2387      ps_insn_ptr crr_insn;
2388      /* Holds the remaining issue slots in the current row.  */
2389      int can_issue_more = issue_rate;
2390
2391      /* Walk through the DFA for the current row.  */
2392      for (crr_insn = ps->rows[SMODULO (cycle, ps->ii)];
2393	   crr_insn;
2394	   crr_insn = crr_insn->next_in_row)
2395	{
2396	  rtx insn = crr_insn->node->insn;
2397
2398	  if (!INSN_P (insn))
2399	    continue;
2400
2401	  /* Check if there is room for the current insn.  */
2402	  if (!can_issue_more || state_dead_lock_p (curr_state))
2403	    return true;
2404
2405	  /* Update the DFA state and return with failure if the DFA found
2406	     recource conflicts.  */
2407	  if (state_transition (curr_state, insn) >= 0)
2408	    return true;
2409
2410	  if (targetm.sched.variable_issue)
2411	    can_issue_more =
2412	      targetm.sched.variable_issue (sched_dump, sched_verbose,
2413					    insn, can_issue_more);
2414	  /* A naked CLOBBER or USE generates no instruction, so don't
2415	     let them consume issue slots.  */
2416	  else if (GET_CODE (PATTERN (insn)) != USE
2417		   && GET_CODE (PATTERN (insn)) != CLOBBER)
2418	    can_issue_more--;
2419	}
2420
2421      /* Advance the DFA to the next cycle.  */
2422      advance_one_cycle ();
2423    }
2424  return false;
2425}
2426
2427/* Checks if the given node causes resource conflicts when added to PS at
2428   cycle C.  If not the node is added to PS and returned; otherwise zero
2429   is returned.  Bit N is set in MUST_PRECEDE/MUST_FOLLOW if the node with
2430   cuid N must be come before/after (respectively) the node pointed to by
2431   PS_I when scheduled in the same cycle.  */
2432ps_insn_ptr
2433ps_add_node_check_conflicts (partial_schedule_ptr ps, ddg_node_ptr n,
2434   			     int c, sbitmap must_precede,
2435			     sbitmap must_follow)
2436{
2437  int has_conflicts = 0;
2438  ps_insn_ptr ps_i;
2439
2440  /* First add the node to the PS, if this succeeds check for
2441     conflicts, trying different issue slots in the same row.  */
2442  if (! (ps_i = add_node_to_ps (ps, n, c, must_precede, must_follow)))
2443    return NULL; /* Failed to insert the node at the given cycle.  */
2444
2445  has_conflicts = ps_has_conflicts (ps, c, c)
2446		  || (ps->history > 0
2447		      && ps_has_conflicts (ps,
2448					   c - ps->history,
2449					   c + ps->history));
2450
2451  /* Try different issue slots to find one that the given node can be
2452     scheduled in without conflicts.  */
2453  while (has_conflicts)
2454    {
2455      if (! ps_insn_advance_column (ps, ps_i, must_follow))
2456	break;
2457      has_conflicts = ps_has_conflicts (ps, c, c)
2458		      || (ps->history > 0
2459			  && ps_has_conflicts (ps,
2460					       c - ps->history,
2461					       c + ps->history));
2462    }
2463
2464  if (has_conflicts)
2465    {
2466      remove_node_from_ps (ps, ps_i);
2467      return NULL;
2468    }
2469
2470  ps->min_cycle = MIN (ps->min_cycle, c);
2471  ps->max_cycle = MAX (ps->max_cycle, c);
2472  return ps_i;
2473}
2474
2475/* Rotate the rows of PS such that insns scheduled at time
2476   START_CYCLE will appear in row 0.  Updates max/min_cycles.  */
2477void
2478rotate_partial_schedule (partial_schedule_ptr ps, int start_cycle)
2479{
2480  int i, row, backward_rotates;
2481  int last_row = ps->ii - 1;
2482
2483  if (start_cycle == 0)
2484    return;
2485
2486  backward_rotates = SMODULO (start_cycle, ps->ii);
2487
2488  /* Revisit later and optimize this into a single loop.  */
2489  for (i = 0; i < backward_rotates; i++)
2490    {
2491      ps_insn_ptr first_row = ps->rows[0];
2492
2493      for (row = 0; row < last_row; row++)
2494	ps->rows[row] = ps->rows[row+1];
2495
2496      ps->rows[last_row] = first_row;
2497    }
2498
2499  ps->max_cycle -= start_cycle;
2500  ps->min_cycle -= start_cycle;
2501}
2502
2503/* Remove the node N from the partial schedule PS; because we restart the DFA
2504   each time we want to check for resource conflicts; this is equivalent to
2505   unscheduling the node N.  */
2506static bool
2507ps_unschedule_node (partial_schedule_ptr ps, ddg_node_ptr n)
2508{
2509  ps_insn_ptr ps_i;
2510  int row = SMODULO (SCHED_TIME (n), ps->ii);
2511
2512  if (row < 0 || row > ps->ii)
2513    return false;
2514
2515  for (ps_i = ps->rows[row];
2516       ps_i &&  ps_i->node != n;
2517       ps_i = ps_i->next_in_row);
2518  if (!ps_i)
2519    return false;
2520
2521  return remove_node_from_ps (ps, ps_i);
2522}
2523#endif /* INSN_SCHEDULING */
2524
2525static bool
2526gate_handle_sms (void)
2527{
2528  return (optimize > 0 && flag_modulo_sched);
2529}
2530
2531
2532/* Run instruction scheduler.  */
2533/* Perform SMS module scheduling.  */
2534static void
2535rest_of_handle_sms (void)
2536{
2537#ifdef INSN_SCHEDULING
2538  basic_block bb;
2539  sbitmap blocks;
2540
2541  /* We want to be able to create new pseudos.  */
2542  no_new_pseudos = 0;
2543  /* Collect loop information to be used in SMS.  */
2544  cfg_layout_initialize (CLEANUP_UPDATE_LIFE);
2545  sms_schedule (dump_file);
2546
2547  /* Update the life information, because we add pseudos.  */
2548  max_regno = max_reg_num ();
2549  allocate_reg_info (max_regno, FALSE, FALSE);
2550  blocks = sbitmap_alloc (last_basic_block);
2551  sbitmap_ones (blocks);
2552  update_life_info (blocks, UPDATE_LIFE_GLOBAL_RM_NOTES,
2553                    (PROP_DEATH_NOTES
2554                     | PROP_REG_INFO
2555                     | PROP_KILL_DEAD_CODE
2556                     | PROP_SCAN_DEAD_CODE));
2557
2558  no_new_pseudos = 1;
2559
2560  /* Finalize layout changes.  */
2561  FOR_EACH_BB (bb)
2562    if (bb->next_bb != EXIT_BLOCK_PTR)
2563      bb->aux = bb->next_bb;
2564  cfg_layout_finalize ();
2565  free_dominance_info (CDI_DOMINATORS);
2566#endif /* INSN_SCHEDULING */
2567}
2568
2569struct tree_opt_pass pass_sms =
2570{
2571  "sms",                                /* name */
2572  gate_handle_sms,                      /* gate */
2573  rest_of_handle_sms,                   /* execute */
2574  NULL,                                 /* sub */
2575  NULL,                                 /* next */
2576  0,                                    /* static_pass_number */
2577  TV_SMS,                               /* tv_id */
2578  0,                                    /* properties_required */
2579  0,                                    /* properties_provided */
2580  0,                                    /* properties_destroyed */
2581  0,                                    /* todo_flags_start */
2582  TODO_dump_func |
2583  TODO_ggc_collect,                     /* todo_flags_finish */
2584  'm'                                   /* letter */
2585};
2586
2587