1/* Loop distribution.
2   Copyright (C) 2006, 2007, 2008, 2009, 2010
3   Free Software Foundation, Inc.
4   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
5   and Sebastian Pop <sebastian.pop@amd.com>.
6
7This file is part of GCC.
8
9GCC is free software; you can redistribute it and/or modify it
10under the terms of the GNU General Public License as published by the
11Free Software Foundation; either version 3, or (at your option) any
12later version.
13
14GCC is distributed in the hope that it will be useful, but WITHOUT
15ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
17for more details.
18
19You should have received a copy of the GNU General Public License
20along with GCC; see the file COPYING3.  If not see
21<http://www.gnu.org/licenses/>.  */
22
23/* This pass performs loop distribution: for example, the loop
24
25   |DO I = 2, N
26   |    A(I) = B(I) + C
27   |    D(I) = A(I-1)*E
28   |ENDDO
29
30   is transformed to
31
32   |DOALL I = 2, N
33   |   A(I) = B(I) + C
34   |ENDDO
35   |
36   |DOALL I = 2, N
37   |   D(I) = A(I-1)*E
38   |ENDDO
39
40   This pass uses an RDG, Reduced Dependence Graph built on top of the
41   data dependence relations.  The RDG is then topologically sorted to
42   obtain a map of information producers/consumers based on which it
43   generates the new loops.  */
44
45#include "config.h"
46#include "system.h"
47#include "coretypes.h"
48#include "tm.h"
49#include "ggc.h"
50#include "tree.h"
51#include "target.h"
52
53#include "rtl.h"
54#include "basic-block.h"
55#include "diagnostic.h"
56#include "tree-flow.h"
57#include "tree-dump.h"
58#include "timevar.h"
59#include "cfgloop.h"
60#include "expr.h"
61#include "optabs.h"
62#include "tree-chrec.h"
63#include "tree-data-ref.h"
64#include "tree-scalar-evolution.h"
65#include "tree-pass.h"
66#include "lambda.h"
67#include "langhooks.h"
68#include "tree-vectorizer.h"
69
70/* If bit I is not set, it means that this node represents an
71   operation that has already been performed, and that should not be
72   performed again.  This is the subgraph of remaining important
73   computations that is passed to the DFS algorithm for avoiding to
74   include several times the same stores in different loops.  */
75static bitmap remaining_stmts;
76
77/* A node of the RDG is marked in this bitmap when it has as a
78   predecessor a node that writes to memory.  */
79static bitmap upstream_mem_writes;
80
/* Update the PHI nodes of NEW_LOOP.  NEW_LOOP is a duplicate of
   ORIG_LOOP, so the headers of both loops contain the same PHI nodes
   in the same order; this walks the two PHI chains in lockstep and
   fills in the two arguments (preheader and latch) of each copied
   PHI.  */

static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
  tree new_ssa_name;
  gimple_stmt_iterator si_new, si_orig;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);

  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */
  for (si_new = gsi_start_phis (new_loop->header),
       si_orig = gsi_start_phis (orig_loop->header);
       !gsi_end_p (si_new) && !gsi_end_p (si_orig);
       gsi_next (&si_new), gsi_next (&si_orig))
    {
      tree def;
      source_location locus;
      gimple phi_new = gsi_stmt (si_new);
      gimple phi_orig = gsi_stmt (si_orig);

      /* Add the first phi argument for the phi in NEW_LOOP (the one
	 associated with the entry of NEW_LOOP)  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_entry_e);
      add_phi_arg (phi_new, def, new_loop_entry_e, locus);

      /* Add the second phi argument for the phi in NEW_LOOP (the one
	 associated with the latch of NEW_LOOP)  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_loop_latch);

      if (TREE_CODE (def) == SSA_NAME)
	{
	  /* The name defined in ORIG_LOOP was renamed when the loop
	     was copied; map it to its current definition.  */
	  new_ssa_name = get_current_def (def);

	  if (!new_ssa_name)
	    /* This only happens if there are no definitions inside the
	       loop.  Use the invariant in the new loop as is.  */
	    new_ssa_name = def;
	}
      else
	/* Could be an integer.  */
	new_ssa_name = def;

      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop), locus);
    }
}
132
133/* Return a copy of LOOP placed before LOOP.  */
134
135static struct loop *
136copy_loop_before (struct loop *loop)
137{
138  struct loop *res;
139  edge preheader = loop_preheader_edge (loop);
140
141  if (!single_exit (loop))
142    return NULL;
143
144  initialize_original_copy_tables ();
145  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
146  free_original_copy_tables ();
147
148  if (!res)
149    return NULL;
150
151  update_phis_for_loop_copy (loop, res);
152  rename_variables_in_loop (res);
153
154  return res;
155}
156
157/* Creates an empty basic block after LOOP.  */
158
159static void
160create_bb_after_loop (struct loop *loop)
161{
162  edge exit = single_exit (loop);
163
164  if (!exit)
165    return;
166
167  split_edge (exit);
168}
169
/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  Returns true when
   the code gen succeeded. */

static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      /* Work on a fresh copy placed before LOOP so that the original
	 loop remains available for the other partitions.  */
      loop = copy_loop_before (loop);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  /* copy_loop_before returns NULL when LOOP has no single exit.  */
  if (loop == NULL)
    return false;

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  */
  bbs = get_loop_body_in_dom_order (loop);

  /* X is the running statement index into PARTITION; it must advance
     exactly as in stmts_from_loop: phis first, then the non-label
     statements of each block.  */
  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	if (!bitmap_bit_p (partition, x++))
	  {
	    gimple phi = gsi_stmt (bsi);
	    /* Removing a virtual PHI invalidates the virtual operand
	       web; mark its result for renaming.  */
	    if (!is_gimple_reg (gimple_phi_result (phi)))
	      mark_virtual_phi_result_for_renaming (phi);
	    remove_phi_node (&bsi, true);
	  }
	else
	  gsi_next (&bsi);

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple stmt = gsi_stmt (bsi);
	  /* Labels are not indexed by stmts_from_loop, so do not
	     consume a PARTITION bit for them.  */
	  if (gimple_code (gsi_stmt (bsi)) != GIMPLE_LABEL
	      && !bitmap_bit_p (partition, x++))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&bsi, true);
	      release_defs (stmt);
	    }
	  else
	    gsi_next (&bsi);
	}
    }

  free (bbs);
  return true;
}
232
233/* Build the size argument for a memset call.  */
234
235static inline tree
236build_size_arg_loc (location_t loc, tree nb_iter, tree op,
237		    gimple_seq *stmt_list)
238{
239  gimple_seq stmts;
240  tree x;
241
242  x = fold_build2_loc (loc, MULT_EXPR, size_type_node,
243		       fold_convert_loc (loc, size_type_node, nb_iter),
244		       fold_convert_loc (loc, size_type_node,
245					 TYPE_SIZE_UNIT (TREE_TYPE (op))));
246  x = force_gimple_operand (x, &stmts, true, NULL);
247  gimple_seq_add_seq (stmt_list, stmts);
248
249  return x;
250}
251
/* Generate a call to memset zeroing the memory written to by STMT:
   OP0 is the store's left hand side and NB_ITER the number of
   iterations of the loop.  The new statements are inserted after the
   iterator BSI.  Note: despite the old comment, this function returns
   nothing; it asserts that the data reference analysis of OP0
   succeeds and that the store has a unit stride.  */

static void
generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
		      gimple_stmt_iterator bsi)
{
  tree addr_base, nb_bytes;
  bool res = false;
  gimple_seq stmt_list = NULL, stmts;
  gimple fn_call;
  tree mem, fn;
  struct data_reference *dr = XCNEW (struct data_reference);
  location_t loc = gimple_location (stmt);

  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  /* The caller is expected to have checked these conditions already
     (see stmt_with_adjacent_zero_store_dr_p in generate_builtin).  */
  res = dr_analyze_innermost (dr);
  gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));

  nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (integer_zerop (size_binop (PLUS_EXPR,
				 TYPE_SIZE_UNIT (TREE_TYPE (op0)),
				 fold_convert (sizetype, DR_STEP (dr)))))
    {
      /* For a descending access the analyzed base points at the last
	 written element; rewind it to the lowest address so that the
	 memset covers the right region.  */
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (op0)));
    }

  addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
			       TREE_TYPE (DR_BASE_ADDRESS (dr)),
			       DR_BASE_ADDRESS (dr), addr_base);
  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  gimple_seq_add_seq (&stmt_list, stmts);

  fn = build_fold_addr_expr (implicit_built_in_decls [BUILT_IN_MEMSET]);
  fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
  gimple_seq_add_stmt (&stmt_list, fn_call);
  gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

  free_data_ref (dr);
}
302
303/* Tries to generate a builtin function for the instructions of LOOP
304   pointed to by the bits set in PARTITION.  Returns true when the
305   operation succeeded.  */
306
307static bool
308generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
309{
310  bool res = false;
311  unsigned i, x = 0;
312  basic_block *bbs;
313  gimple write = NULL;
314  gimple_stmt_iterator bsi;
315  tree nb_iter = number_of_exit_cond_executions (loop);
316
317  if (!nb_iter || nb_iter == chrec_dont_know)
318    return false;
319
320  bbs = get_loop_body_in_dom_order (loop);
321
322  for (i = 0; i < loop->num_nodes; i++)
323    {
324      basic_block bb = bbs[i];
325
326      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
327	x++;
328
329      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
330	{
331	  gimple stmt = gsi_stmt (bsi);
332
333	  if (bitmap_bit_p (partition, x++)
334	      && is_gimple_assign (stmt)
335	      && !is_gimple_reg (gimple_assign_lhs (stmt)))
336	    {
337	      /* Don't generate the builtins when there are more than
338		 one memory write.  */
339	      if (write != NULL)
340		goto end;
341
342	      write = stmt;
343	      if (bb == loop->latch)
344		nb_iter = number_of_latch_executions (loop);
345	    }
346	}
347    }
348
349  if (!stmt_with_adjacent_zero_store_dr_p (write))
350    goto end;
351
352  /* The new statements will be placed before LOOP.  */
353  bsi = gsi_last_bb (loop_preheader_edge (loop)->src);
354  generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi);
355  res = true;
356
357  /* If this is the last partition for which we generate code, we have
358     to destroy the loop.  */
359  if (!copy_p)
360    {
361      unsigned nbbs = loop->num_nodes;
362      edge exit = single_exit (loop);
363      basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
364      redirect_edge_pred (exit, src);
365      exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
366      exit->flags |= EDGE_FALLTHRU;
367      cancel_loop_tree (loop);
368      rescan_loop_exit (exit, false, true);
369
370      for (i = 0; i < nbbs; i++)
371	delete_basic_block (bbs[i]);
372
373      set_immediate_dominator (CDI_DOMINATORS, dest,
374			       recompute_dominator (CDI_DOMINATORS, dest));
375    }
376
377 end:
378  free (bbs);
379  return res;
380}
381
382/* Generates code for PARTITION.  For simple loops, this function can
383   generate a built-in.  */
384
385static bool
386generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
387{
388  if (generate_builtin (loop, partition, copy_p))
389    return true;
390
391  return generate_loops_for_partition (loop, partition, copy_p);
392}
393
394
395/* Returns true if the node V of RDG cannot be recomputed.  */
396
397static bool
398rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
399{
400  if (RDG_MEM_WRITE_STMT (rdg, v))
401    return true;
402
403  return false;
404}
405
406/* Returns true when the vertex V has already been generated in the
407   current partition (V is in PROCESSED), or when V belongs to another
408   partition and cannot be recomputed (V is not in REMAINING_STMTS).  */
409
410static inline bool
411already_processed_vertex_p (bitmap processed, int v)
412{
413  return (bitmap_bit_p (processed, v)
414	  || !bitmap_bit_p (remaining_stmts, v));
415}
416
417/* Returns NULL when there is no anti-dependence among the successors
418   of vertex V, otherwise returns the edge with the anti-dep.  */
419
420static struct graph_edge *
421has_anti_dependence (struct vertex *v)
422{
423  struct graph_edge *e;
424
425  if (v->succ)
426    for (e = v->succ; e; e = e->succ_next)
427      if (RDGE_TYPE (e) == anti_dd)
428	return e;
429
430  return NULL;
431}
432
/* Returns true when V has a predecessor in RDG that is marked in
   UPSTREAM_MEM_WRITES, i.e. a predecessor downstream of a memory
   write.  (The old comment wrongly described anti-dependences; see
   has_anti_dependence above for that.)  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
	  /* Don't consider flow channels: a write to memory followed
	     by a read from memory.  These channels allow the split of
	     the RDG in different partitions.  */
	  && !RDG_MEM_WRITE_STMT (rdg, e->src))
	return true;

  return false;
}
451
/* Initializes the upstream_mem_writes bitmap following the
   information from RDG: a vertex is marked when it is a memory write,
   has a (non flow-channel) predecessor marked as an upstream write,
   or has an anti-dependence among its successors.  */

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  /* Walk vertices from the last to the first so that, within each DFS
     batch, predecessors tend to be classified before their
     successors.  */
  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
	unsigned i;
	VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);

	/* Collect all the vertices reachable from V.  */
	graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

	for (i = 0; VEC_iterate (int, nodes, i, x); i++)
	  {
	    /* A vertex may be reached by several DFS batches; classify
	       it only once.  */
	    if (bitmap_bit_p (seen, x))
	      continue;

	    bitmap_set_bit (seen, x);

	    if (RDG_MEM_WRITE_STMT (rdg, x)
		|| predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
		/* In anti dependences the read should occur before
		   the write, this is why both the read and the write
		   should be placed in the same partition.  */
		|| has_anti_dependence (&(rdg->vertices[x])))
	      {
		bitmap_set_bit (upstream_mem_writes, x);
	      }
	  }

	VEC_free (int, heap, nodes);
      }
}
490
491/* Returns true when vertex u has a memory write node as a predecessor
492   in RDG.  */
493
494static bool
495has_upstream_mem_writes (int u)
496{
497  return bitmap_bit_p (upstream_mem_writes, u);
498}
499
/* Forward declaration: rdg_flag_uses and rdg_flag_vertex_and_dependent
   are mutually recursive.  */
static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
					   bitmap, bool *);

/* Flag the uses of U stopping following the information from
   upstream_mem_writes.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
	       bitmap processed, bool *part_has_writes)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
				       processed, part_has_writes);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      /* Pull in the statement that defines U's virtual use, i.e. the
	 memory state U depends on.  */
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
	{
	  tree use = USE_FROM_PTR (use_p);

	  if (TREE_CODE (use) == SSA_NAME)
	    {
	      gimple def_stmt = SSA_NAME_DEF_STMT (use);
	      int v = rdg_vertex_for_stmt (rdg, def_stmt);

	      /* V is negative when DEF_STMT is outside the loop of the
		 RDG.  */
	      if (v >= 0
		  && !already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
	 between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
	{
	  use_operand_p use_p;
	  imm_use_iterator iter;

	  /* Flag every statement that uses the scalar defined here, so
	     that def and uses stay in the same partition.  */
	  FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
	    {
	      int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

	      if (!already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }
}
568
569/* Flag V from RDG as part of PARTITION, and also flag its loop number
570   in LOOPS.  */
571
572static void
573rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
574		 bool *part_has_writes)
575{
576  struct loop *loop;
577
578  if (bitmap_bit_p (partition, v))
579    return;
580
581  loop = loop_containing_stmt (RDG_STMT (rdg, v));
582  bitmap_set_bit (loops, loop->num);
583  bitmap_set_bit (partition, v);
584
585  if (rdg_cannot_recompute_vertex_p (rdg, v))
586    {
587      *part_has_writes = true;
588      bitmap_clear_bit (remaining_stmts, v);
589    }
590}
591
/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
			       bitmap loops, bitmap processed,
			       bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  /* Mark V as processed before recursing through rdg_flag_uses to cut
     the mutual recursion on cycles in the RDG.  */
  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  /* Collect the vertices reachable from V, restricted to the
     statements still to be scheduled (remaining_stmts).  */
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  for (i = 0; VEC_iterate (int, nodes, i, x); i++)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
				     part_has_writes);

  VEC_free (int, heap, nodes);
}
616
617/* Initialize CONDS with all the condition statements from the basic
618   blocks of LOOP.  */
619
620static void
621collect_condition_stmts (struct loop *loop, VEC (gimple, heap) **conds)
622{
623  unsigned i;
624  edge e;
625  VEC (edge, heap) *exits = get_loop_exit_edges (loop);
626
627  for (i = 0; VEC_iterate (edge, exits, i, e); i++)
628    {
629      gimple cond = last_stmt (e->src);
630
631      if (cond)
632	VEC_safe_push (gimple, heap, *conds, cond);
633    }
634
635  VEC_free (edge, heap, exits);
636}
637
/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  This iterates to a fixed point: flagging a condition's
   dependences may pull statements of further loops into the
   partition, whose exit conditions must then be added too.  */

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
		     bitmap processed, bool *part_has_writes)
{
  unsigned i;
  bitmap_iterator bi;
  VEC (gimple, heap) *conds = VEC_alloc (gimple, heap, 3);

  /* Seed the worklist with the exit conditions of the initial set of
     loops.  */
  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!VEC_empty (gimple, conds))
    {
      gimple cond = VEC_pop (gimple, conds);
      int v = rdg_vertex_for_stmt (rdg, cond);
      /* NEW_LOOPS collects the loops touched while flagging COND's
	 dependences, so that only genuinely new loops are added to
	 LOOPS below.  */
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
				       part_has_writes);

      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
	if (!bitmap_bit_p (loops, i))
	  {
	    bitmap_set_bit (loops, i);
	    collect_condition_stmts (get_loop (i), &conds);
	  }

      BITMAP_FREE (new_loops);
    }
}
673
674/* Returns a bitmap in which all the statements needed for computing
675   the strongly connected component C of the RDG are flagged, also
676   including the loop exit conditions.  */
677
678static bitmap
679build_rdg_partition_for_component (struct graph *rdg, rdgc c,
680				   bool *part_has_writes)
681{
682  int i, v;
683  bitmap partition = BITMAP_ALLOC (NULL);
684  bitmap loops = BITMAP_ALLOC (NULL);
685  bitmap processed = BITMAP_ALLOC (NULL);
686
687  for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
688    if (!already_processed_vertex_p (processed, v))
689      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
690				     part_has_writes);
691
692  rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);
693
694  BITMAP_FREE (processed);
695  BITMAP_FREE (loops);
696  return partition;
697}
698
699/* Free memory for COMPONENTS.  */
700
701static void
702free_rdg_components (VEC (rdgc, heap) *components)
703{
704  int i;
705  rdgc x;
706
707  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
708    {
709      VEC_free (int, heap, x->vertices);
710      free (x);
711    }
712}
713
/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  Ownership note: the
   vertex vectors of the selected SCCs are handed over to the pushed
   rdg_component structures (and later released by
   free_rdg_components); the unselected ones are freed here.  */

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
		      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  /* graphds_scc numbers each vertex's component in-place and returns
     the number of SCCs.  */
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  /* Bucket every vertex of the RDG into its component's vector.  */
  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  /* Keep only the components containing a starting vertex, each at
     most once.  */
  for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
    {
      int c = rdg->vertices[v].component;

      if (!bitmap_bit_p (saved_components, c))
	{
	  rdgc x = XCNEW (struct rdg_component);
	  x->num = c;
	  x->vertices = all_components[c];

	  VEC_safe_push (rdgc, heap, *components, x);
	  bitmap_set_bit (saved_components, c);
	}
    }

  /* Free the vertex vectors of the components that were not saved.  */
  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}
754
755/* Returns true when it is possible to generate a builtin pattern for
756   the PARTITION of RDG.  For the moment we detect only the memset
757   zero pattern.  */
758
759static bool
760can_generate_builtin (struct graph *rdg, bitmap partition)
761{
762  unsigned i;
763  bitmap_iterator bi;
764  int nb_reads = 0;
765  int nb_writes = 0;
766  int stores_zero = 0;
767
768  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi)
769    if (RDG_MEM_READS_STMT (rdg, i))
770      nb_reads++;
771    else if (RDG_MEM_WRITE_STMT (rdg, i))
772      {
773	nb_writes++;
774	if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i)))
775	  stores_zero++;
776      }
777
778  return stores_zero == 1 && nb_writes == 1 && nb_reads == 0;
779}
780
781/* Returns true when PARTITION1 and PARTITION2 have similar memory
782   accesses in RDG.  */
783
784static bool
785similar_memory_accesses (struct graph *rdg, bitmap partition1,
786			 bitmap partition2)
787{
788  unsigned i, j;
789  bitmap_iterator bi, bj;
790
791  EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi)
792    if (RDG_MEM_WRITE_STMT (rdg, i)
793	|| RDG_MEM_READS_STMT (rdg, i))
794      EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj)
795	if (RDG_MEM_WRITE_STMT (rdg, j)
796	    || RDG_MEM_READS_STMT (rdg, j))
797	  if (rdg_has_similar_memory_accesses (rdg, i, j))
798	    return true;
799
800  return false;
801}
802
803/* Fuse all the partitions from PARTITIONS that contain similar memory
804   references, i.e., we're taking care of cache locality.  This
805   function does not fuse those partitions that contain patterns that
806   can be code generated with builtins.  */
807
808static void
809fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
810					      VEC (bitmap, heap) **partitions)
811{
812  int p1, p2;
813  bitmap partition1, partition2;
814
815  for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++)
816    if (!can_generate_builtin (rdg, partition1))
817      for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++)
818	if (p1 != p2
819	    && !can_generate_builtin (rdg, partition2)
820	    && similar_memory_accesses (rdg, partition1, partition2))
821	  {
822	    bitmap_ior_into (partition1, partition2);
823	    VEC_ordered_remove (bitmap, *partitions, p2);
824	    p2--;
825	  }
826}
827
/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  Components are accumulated into
   the current partition until one of them contains a memory write, at
   which point the partition is saved and a fresh one is started.
   Ownership of the pushed bitmaps transfers to the caller.  */

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
		      VEC (int, heap) **other_stores,
		      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      bitmap np;
      bool part_has_writes = false;
      /* The first vertex stands for the whole component: if it has
	 been processed, the component has already been absorbed.  */
      int v = VEC_index (int, x->vertices, 0);

      if (bitmap_bit_p (processed, v))
	continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      /* A partition is "useful" once it contains a statement that
	 cannot be recomputed (a memory write); save it and start a
	 new one.  */
      if (part_has_writes)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "ldist useful partition:\n");
	      dump_bitmap (dump_file, partition);
	    }

	  VEC_safe_push (bitmap, heap, *partitions, partition);
	  partition = BITMAP_ALLOC (NULL);
	}
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
	&& rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  Note the single level of recursion: the
     recursive call receives an empty OTHER_STORES vector.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);

  fuse_partitions_with_similar_memory_accesses (rdg, partitions);
}
899
900/* Dump to FILE the PARTITIONS.  */
901
902static void
903dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
904{
905  int i;
906  bitmap partition;
907
908  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
909    debug_bitmap_file (file, partition);
910}
911
/* Debug PARTITIONS: dump them to stderr.  Extern so it can be called
   from a debugger; not used from within the compiler itself.  */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);

void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
  dump_rdg_partitions (stderr, partitions);
}
920
921/* Returns the number of read and write operations in the RDG.  */
922
923static int
924number_of_rw_in_rdg (struct graph *rdg)
925{
926  int i, res = 0;
927
928  for (i = 0; i < rdg->n_vertices; i++)
929    {
930      if (RDG_MEM_WRITE_STMT (rdg, i))
931	++res;
932
933      if (RDG_MEM_READS_STMT (rdg, i))
934	++res;
935    }
936
937  return res;
938}
939
940/* Returns the number of read and write operations in a PARTITION of
941   the RDG.  */
942
943static int
944number_of_rw_in_partition (struct graph *rdg, bitmap partition)
945{
946  int res = 0;
947  unsigned i;
948  bitmap_iterator ii;
949
950  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
951    {
952      if (RDG_MEM_WRITE_STMT (rdg, i))
953	++res;
954
955      if (RDG_MEM_READS_STMT (rdg, i))
956	++res;
957    }
958
959  return res;
960}
961
962/* Returns true when one of the PARTITIONS contains all the read or
963   write operations of RDG.  */
964
965static bool
966partition_contains_all_rw (struct graph *rdg, VEC (bitmap, heap) *partitions)
967{
968  int i;
969  bitmap partition;
970  int nrw = number_of_rw_in_rdg (rdg);
971
972  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
973    if (nrw == number_of_rw_in_partition (rdg, partition))
974      return true;
975
976  return false;
977}
978
/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  Initializes and releases the file-scope bitmaps
   REMAINING_STMTS and UPSTREAM_MEM_WRITES used by the partitioning
   helpers.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
	   VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      /* Initially every statement still has to be scheduled.  */
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
	 STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	{
	  int v;
	  unsigned j;
	  bool found = false;

	  for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
	    if (i == v)
	      {
		found = true;
		break;
	      }

	  if (!found)
	    VEC_safe_push (int, heap, other_stores, i);
	}
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
			processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  /* Distributing is useless with fewer than two partitions, or when
     one partition already contains all memory accesses.  */
  if (nbp <= 1
      || partition_contains_all_rw (rdg, partitions))
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  /* COPY_P is true for all but the last partition, which reuses the
     original loop body.  */
  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}
1053
1054/* Distributes the code from LOOP in such a way that producer
1055   statements are placed before consumer statements.  When STMTS is
1056   NULL, performs the maximal distribution, if STMTS is not NULL,
1057   tries to separate only these statements from the LOOP's body.
1058   Returns the number of distributed loops.  */
1059
1060static int
1061distribute_loop (struct loop *loop, VEC (gimple, heap) *stmts)
1062{
1063  int res = 0;
1064  struct graph *rdg;
1065  gimple s;
1066  unsigned i;
1067  VEC (int, heap) *vertices;
1068
1069  if (loop->num_nodes > 2)
1070    {
1071      if (dump_file && (dump_flags & TDF_DETAILS))
1072	fprintf (dump_file,
1073		 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
1074		 loop->num);
1075
1076      return res;
1077    }
1078
1079  rdg = build_rdg (loop);
1080
1081  if (!rdg)
1082    {
1083      if (dump_file && (dump_flags & TDF_DETAILS))
1084	fprintf (dump_file,
1085		 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
1086		 loop->num);
1087
1088      return res;
1089    }
1090
1091  vertices = VEC_alloc (int, heap, 3);
1092
1093  if (dump_file && (dump_flags & TDF_DETAILS))
1094    dump_rdg (dump_file, rdg);
1095
1096  for (i = 0; VEC_iterate (gimple, stmts, i, s); i++)
1097    {
1098      int v = rdg_vertex_for_stmt (rdg, s);
1099
1100      if (v >= 0)
1101	{
1102	  VEC_safe_push (int, heap, vertices, v);
1103
1104	  if (dump_file && (dump_flags & TDF_DETAILS))
1105	    fprintf (dump_file,
1106		     "ldist asked to generate code for vertex %d\n", v);
1107	}
1108    }
1109
1110  res = ldist_gen (loop, rdg, vertices);
1111  VEC_free (int, heap, vertices);
1112  free_rdg (rdg);
1113
1114  return res;
1115}
1116
1117/* Distribute all loops in the current function.  */
1118
1119static unsigned int
1120tree_loop_distribution (void)
1121{
1122  struct loop *loop;
1123  loop_iterator li;
1124  int nb_generated_loops = 0;
1125
1126  FOR_EACH_LOOP (li, loop, 0)
1127    {
1128      VEC (gimple, heap) *work_list = NULL;
1129
1130      /* If the loop doesn't have a single exit we will fail anyway,
1131	 so do that early.  */
1132      if (!single_exit (loop))
1133	continue;
1134
1135      /* With the following working list, we're asking distribute_loop
1136	 to separate the stores of the loop: when dependences allow,
1137	 it will end on having one store per loop.  */
1138      stores_from_loop (loop, &work_list);
1139
1140      /* A simple heuristic for cache locality is to not split stores
1141	 to the same array.  Without this call, an unrolled loop would
1142	 be split into as many loops as unroll factor, each loop
1143	 storing in the same array.  */
1144      remove_similar_memory_refs (&work_list);
1145
1146      nb_generated_loops = distribute_loop (loop, work_list);
1147
1148      if (dump_file && (dump_flags & TDF_DETAILS))
1149	{
1150	  if (nb_generated_loops > 1)
1151	    fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
1152		     loop->num, nb_generated_loops);
1153	  else
1154	    fprintf (dump_file, "Loop %d is the same.\n", loop->num);
1155	}
1156
1157      verify_loop_structure ();
1158
1159      VEC_free (gimple, heap, work_list);
1160    }
1161
1162  return 0;
1163}
1164
1165static bool
1166gate_tree_loop_distribution (void)
1167{
1168  return flag_tree_loop_distribution != 0;
1169}
1170
/* Pass-manager descriptor for the loop distribution pass.  The pass
   is gated by gate_tree_loop_distribution (-ftree-loop-distribution),
   executes tree_loop_distribution, requires the CFG and SSA form, and
   verifies loop structures when it finishes.  */
struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",			/* name */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,       /* execute */
  NULL,				/* sub */
  NULL,				/* next */
  0,				/* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,    /* tv_id */
  PROP_cfg | PROP_ssa,		/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_dump_func | TODO_verify_loops            /* todo_flags_finish */
 }
};
1189