/* Loop distribution.
   Copyright (C) 2006-2020 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with statements of specific types.  For now we
	support two types of seed statements: statements defining a variable
	used outside of the loop, and statements storing to memory.
     2) Build the reduced dependence graph (RDG) for the loop to be
	distributed.  The vertices (RDG:V) model all statements in the loop
	and the edges (RDG:E) model flow and control dependencies between
	statements.
     3) Apart from the RDG, compute data dependencies between memory
	references.
     4) Starting from a seed statement, build up a partition by adding
	dependent statements according to the RDG's dependence information.
	A partition is classified as parallel if it can be executed in
	parallel, or as sequential if it can't.  A parallel partition is
	further classified into different builtin kinds if it can be
	implemented as builtin function calls.
     5) Build the partition dependence graph (PG) based on data dependencies.
	The vertices (PG:V) model all partitions and the edges (PG:E) model
	all data dependencies between each pair of partitions.  In general,
	a data dependence is either known or unknown at compilation time.  In
	C family languages there is a fair amount of dependencies unknown at
	compilation time because of possible aliasing between data references.
	We categorize PG's edges into two types: "true" edges, which represent
	data dependencies known at compilation time, and "alias" edges for all
	other data dependencies.
     6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
	Merge the partitions in each strongly connected component (SCC)
	accordingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
	included.  We try to break SCCs by removing some edges.  Because
	SCCs formed by "true" edges were all fused in step 6), we can break
	SCCs by removing only "alias" edges.  Choosing an optimal edge set
	is NP-hard; fortunately, a simple approximation is good enough for us
	given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.  Create
	runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
	Since loop distribution generally introduces additional overhead, it
	is only useful if vectorization is achieved in a distributed loop.  We
	version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
	If no distributed loop can be vectorized, we simply remove the
	distributed loops and fall back to the original one.

   TODO:
     1) We only distribute innermost two-level loop nests now.  This should
	be extended to arbitrary loop nests in the future.
     2) We only fuse partitions within an SCC now.  A better fusion algorithm
	is desired to minimize loop overhead, maximize parallelism and
	maximize data reuse.  */
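
/* As a rough illustrative sketch (the array A and bound N below are made up
   and not taken from any particular test case), a C loop such as

     for (i = 0; i < N; i++)
       A[i] = 0;

   can be classified as a memset builtin partition and emitted as a single
   library call

     memset (A, 0, N * sizeof (A[0]));

   whereas the A(I)/D(I) example above is merely split into two loops so
   that each of them can be vectorized independently.  */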

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-eh.h"
#include "gimple-fold.h"
#include "tree-affine.h"


#define MAX_DATAREFS_NUM \
	((unsigned) param_loop_max_datarefs_for_datadeps)

/* Threshold controlling number of distributed partitions.  Given it may
   be unnecessary if a memory stream cost model is invented in the future,
   we define it as a temporary macro, rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}



#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL,
    /* Partial memset describes a partition that can be distributed into a
       loop of memset calls, rather than a single memset call.  It's handled
       just like a normal partition, i.e., distributed as a separate loop and
       no memset call is generated.

       Note: This is a hack trying to distribute a ZERO-ing stmt in a loop
       nest as deep as possible.  As a result, parloop achieves better
       parallelization by parallelizing a deeper loop nest.  This hack should
       become unnecessary and be removed once distributed memset can be
       understood and analyzed in data reference analysis.  See PR82604 for
       more.  */
    PKIND_PARTIAL_MEMSET,
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};
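
/* As a hypothetical sketch of the PKIND_PARTIAL_MEMSET case (P below is a
   made-up pointer-to-pointer, so its rows are not necessarily contiguous):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
	 p[i][j] = 0;

   each inner loop could become a memset of p[i], but the whole nest cannot
   become one memset call; the zeroing statement is therefore kept as a
   separately distributed loop rather than turned into a builtin call.  */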

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Builtin info for loop distribution.  */
struct builtin_info
{
  /* Data references that a kind != PKIND_NORMAL partition is about.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated on by the builtin.  Note
     both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable used outside of the loop.  */
  bool reduction_p;
  location_t loc;
  enum partition_kind kind;
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information about a builtin partition.  */
  struct builtin_info *builtin;
};

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Descriptions of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};


/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

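/* Dump the reduced dependence graph RDG to FILE in dot format, highlighting
   statements that read memory in green and statements that write memory in
   red.  */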
static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
       fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
       fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
       for (e = v->succ; e; e = e->succ_next)
         switch (RDGE_TYPE (e))
           {
           case flow_dd:
             /* These are the most common dependences: don't print a label
                for them.  */
             fprintf (file, "%d -> %d \n", i, e->dest);
             break;

	   case control_dd:
             fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
             break;

           default:
             gcc_unreachable ();
           }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph RDG
   for LOOP, using control dependence information from CD.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

505
506class loop_distribution
507{
508  private:
509  /* The loop (nest) to be distributed.  */
510  vec<loop_p> loop_nest;
511
512  /* Vector of data references in the loop to be distributed.  */
513  vec<data_reference_p> datarefs_vec;
514
  /* True if there is a nonaddressable data reference in the above vector.  */
516  bool has_nonaddressable_dataref_p;
517
518  /* Store index of data reference in aux field.  */
519
520  /* Hash table for data dependence relation in the loop to be distributed.  */
521  hash_table<ddr_hasher> *ddrs_table;
522
523  /* Array mapping basic block's index to its topological order.  */
524  int *bb_top_order_index;
525  /* And size of the array.  */
526  int bb_top_order_index_size;
527
528  /* Build the vertices of the reduced dependence graph RDG.  Return false
529     if that failed.  */
530  bool create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop);
531
532  /* Initialize STMTS with all the statements of LOOP.  We use topological
533     order to discover all statements.  The order is important because
534     generate_loops_for_partition is using the same traversal for identifying
535     statements in loop copies.  */
536  void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
537
538
539  /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
540     LOOP, and one edge per flow dependence or control dependence from control
541     dependence CD.  During visiting each statement, data references are also
542     collected and recorded in global data DATAREFS_VEC.  */
543  struct graph * build_rdg (class loop *loop, control_dependences *cd);
544
545/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
546   graph and we update type for result partition if it is non-NULL.  */
547  void partition_merge_into (struct graph *rdg,
548			     partition *dest, partition *partition,
549			     enum fuse_type ft);
550
551
  /* Return the data dependence relation for data references A and B.  The two
     data references must be in lexicographic order with respect to the reduced
     dependence graph RDG.  We first try to find the ddr in the global ddr hash
     table; if it doesn't exist, compute the ddr and cache it.  */
556  data_dependence_relation * get_data_dependence (struct graph *rdg,
557						  data_reference_p a,
558						  data_reference_p b);
559
560
561  /* In reduced dependence graph RDG for loop distribution, return true if
562     dependence between references DR1 and DR2 leads to a dependence cycle
563     and such dependence cycle can't be resolved by runtime alias check.  */
564  bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
565			    data_reference_p dr2);
566
567
568  /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
569     PARTITION1's type after merging PARTITION2 into PARTITION1.  */
570  void update_type_for_merge (struct graph *rdg,
571			      partition *partition1, partition *partition2);
572
573
574  /* Returns a partition with all the statements needed for computing
575     the vertex V of the RDG, also including the loop exit conditions.  */
576  partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
577
578  /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
579     if it forms builtin memcpy or memmove call.  */
580  void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
581			      data_reference_p dst_dr, data_reference_p src_dr);
582
583  /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
584     For the moment we detect memset, memcpy and memmove patterns.  Bitmap
585     STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
586     Returns true if there is a reduction in all partitions and we
587     possibly did not mark PARTITION as having one for this reason.  */
588
589  bool
590  classify_partition (loop_p loop,
591		      struct graph *rdg, partition *partition,
592		      bitmap stmt_in_all_partitions);
593
594
595  /* Returns true when PARTITION1 and PARTITION2 access the same memory
596     object in RDG.  */
597  bool share_memory_accesses (struct graph *rdg,
598			      partition *partition1, partition *partition2);
599
600  /* For each seed statement in STARTING_STMTS, this function builds
601     partition for it by adding depended statements according to RDG.
602     All partitions are recorded in PARTITIONS.  */
603  void rdg_build_partitions (struct graph *rdg,
604			     vec<gimple *> starting_stmts,
605			     vec<partition *> *partitions);
606
607  /* Compute partition dependence created by the data references in DRS1
     and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
609     not NULL, we record dependence introduced by possible alias between
610     two data references in ALIAS_DDRS; otherwise, we simply ignore such
611     dependence as if it doesn't exist at all.  */
612  int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
613			       bitmap drs2, vec<ddr_p> *alias_ddrs);
614
615
616  /* Build and return partition dependence graph for PARTITIONS.  RDG is
617     reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
618     is true, data dependence caused by possible alias between references
     is ignored, as if it doesn't exist at all; otherwise all dependences
620     are considered.  */
621  struct graph *build_partition_graph (struct graph *rdg,
622				       vec<struct partition *> *partitions,
623				       bool ignore_alias_p);
624
  /* Given reduced dependence graph RDG, merge strongly connected components
     of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
     possible alias between references is ignored, as if it doesn't exist
     at all; otherwise all dependences are considered.  */
629  void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
630				 *partitions, bool ignore_alias_p);
631
/* This is the main function breaking strongly connected components in
   PARTITIONS given the reduced dependence graph RDG.  Store data dependence
634   relations for runtime alias check in ALIAS_DDRS.  */
635  void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
636				   *partitions, vec<ddr_p> *alias_ddrs);
637
638
639  /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
640     ALIAS_DDRS contains ddrs which need runtime alias check.  */
641  void finalize_partitions (class loop *loop, vec<struct partition *>
642			    *partitions, vec<ddr_p> *alias_ddrs);
643
644  /* Distributes the code from LOOP in such a way that producer statements
645     are placed before consumer statements.  Tries to separate only the
646     statements from STMTS into separate loops.  Returns the number of
647     distributed loops.  Set NB_CALLS to number of generated builtin calls.
648     Set *DESTROY_P to whether LOOP needs to be destroyed.  */
649  int distribute_loop (class loop *loop, vec<gimple *> stmts,
650		       control_dependences *cd, int *nb_calls, bool *destroy_p,
651		       bool only_patterns_p);
652
653  /* Compute topological order for basic blocks.  Topological order is
654     needed because data dependence is computed for data references in
655     lexicographical order.  */
656  void bb_top_order_init (void);
657
658  void bb_top_order_destroy (void);
659
660  public:
661
662  /* Getter for bb_top_order.  */
663
664  inline int get_bb_top_order_index_size (void)
665    {
666      return bb_top_order_index_size;
667    }
668
669  inline int get_bb_top_order_index (int i)
670    {
671      return bb_top_order_index[i];
672    }
673
674  unsigned int execute (function *fun);
675};
676
677
678/* If X has a smaller topological sort number than Y, returns -1;
679   if greater, returns 1.  */
680static int
681bb_top_order_cmp_r (const void *x, const void *y, void *loop)
682{
683  loop_distribution *_loop =
684    (loop_distribution *) loop;
685
686  basic_block bb1 = *(const basic_block *) x;
687  basic_block bb2 = *(const basic_block *) y;
688
689  int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
690
691  gcc_assert (bb1->index < bb_top_order_index_size
692	      && bb2->index < bb_top_order_index_size);
693  gcc_assert (bb1 == bb2
694	      || _loop->get_bb_top_order_index(bb1->index)
695		 != _loop->get_bb_top_order_index(bb2->index));
696
697  return (_loop->get_bb_top_order_index(bb1->index) -
698	  _loop->get_bb_top_order_index(bb2->index));
699}
700
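/* Build the vertices of the reduced dependence graph RDG from STMTS of LOOP,
   recording the statement-to-vertex mapping in the gimple uids and collecting
   data references into datarefs_vec.  Return false if the data references of
   some statement cannot be analyzed.  */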
701bool
702loop_distribution::create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts,
703					loop_p loop)
704{
705  int i;
706  gimple *stmt;
707
708  FOR_EACH_VEC_ELT (stmts, i, stmt)
709    {
710      struct vertex *v = &(rdg->vertices[i]);
711
712      /* Record statement to vertex mapping.  */
713      gimple_set_uid (stmt, i);
714
715      v->data = XNEW (struct rdg_vertex);
716      RDGV_STMT (v) = stmt;
717      RDGV_DATAREFS (v).create (0);
718      RDGV_HAS_MEM_WRITE (v) = false;
719      RDGV_HAS_MEM_READS (v) = false;
720      if (gimple_code (stmt) == GIMPLE_PHI)
721	continue;
722
723      unsigned drp = datarefs_vec.length ();
724      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
725	return false;
726      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
727	{
728	  data_reference_p dr = datarefs_vec[j];
729	  if (DR_IS_READ (dr))
730	    RDGV_HAS_MEM_READS (v) = true;
731	  else
732	    RDGV_HAS_MEM_WRITE (v) = true;
733	  RDGV_DATAREFS (v).safe_push (dr);
734	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
735	}
736    }
737  return true;
738}
739
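/* Fill STMTS with all non-debug, non-label statements and non-virtual PHIs
   of LOOP, visiting the loop's basic blocks in topological order.  */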
740void
741loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
742{
743  unsigned int i;
744  basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
745
746  for (i = 0; i < loop->num_nodes; i++)
747    {
748      basic_block bb = bbs[i];
749
750      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
751	   gsi_next (&bsi))
752	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
753	  stmts->safe_push (bsi.phi ());
754
755      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
756	   gsi_next (&bsi))
757	{
758	  gimple *stmt = gsi_stmt (bsi);
759	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
760	    stmts->safe_push (stmt);
761	}
762    }
763
764  free (bbs);
765}
766
767/* Free the reduced dependence graph RDG.  */
768
769static void
770free_rdg (struct graph *rdg)
771{
772  int i;
773
774  for (i = 0; i < rdg->n_vertices; i++)
775    {
776      struct vertex *v = &(rdg->vertices[i]);
777      struct graph_edge *e;
778
779      for (e = v->succ; e; e = e->succ_next)
780	free (e->data);
781
782      if (v->data)
783	{
784	  gimple_set_uid (RDGV_STMT (v), -1);
785	  (RDGV_DATAREFS (v)).release ();
786	  free (v->data);
787	}
788    }
789
790  free_graph (rdg);
791}
792
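/* Build the reduced dependence graph for LOOP: one vertex per statement,
   flow dependence edges, and control dependence edges from CD when it is
   non-NULL.  Return NULL if the vertices cannot be built.  */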
793struct graph *
794loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
795{
796  struct graph *rdg;
797
798  /* Create the RDG vertices from the stmts of the loop nest.  */
799  auto_vec<gimple *, 10> stmts;
800  stmts_from_loop (loop, &stmts);
801  rdg = new_graph (stmts.length ());
802  if (!create_rdg_vertices (rdg, stmts, loop))
803    {
804      free_rdg (rdg);
805      return NULL;
806    }
807  stmts.release ();
808
809  create_rdg_flow_edges (rdg);
810  if (cd)
811    create_rdg_cd_edges (rdg, cd, loop);
812
813  return rdg;
814}
815
816
/* Allocate and initialize a new, empty partition.  */
818
819static partition *
820partition_alloc (void)
821{
822  partition *partition = XCNEW (struct partition);
823  partition->stmts = BITMAP_ALLOC (NULL);
824  partition->reduction_p = false;
825  partition->loc = UNKNOWN_LOCATION;
826  partition->kind = PKIND_NORMAL;
827  partition->type = PTYPE_PARALLEL;
828  partition->datarefs = BITMAP_ALLOC (NULL);
829  return partition;
830}
831
832/* Free PARTITION.  */
833
834static void
835partition_free (partition *partition)
836{
837  BITMAP_FREE (partition->stmts);
838  BITMAP_FREE (partition->datarefs);
839  if (partition->builtin)
840    free (partition->builtin);
841
842  free (partition);
843}
844
845/* Returns true if the partition can be generated as a builtin.  */
846
847static bool
848partition_builtin_p (partition *partition)
849{
850  return partition->kind > PKIND_PARTIAL_MEMSET;
851}
852
853/* Returns true if the partition contains a reduction.  */
854
855static bool
856partition_reduction_p (partition *partition)
857{
858  return partition->reduction_p;
859}
860
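/* Merge PARTITION into DEST, dumping the fusing reason FT, and update DEST's
   kind, type and reduction flag; RDG, if non-NULL, is used to recheck whether
   the merged partition can still be executed in parallel.  */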
861void
862loop_distribution::partition_merge_into (struct graph *rdg,
863		      partition *dest, partition *partition, enum fuse_type ft)
864{
865  if (dump_file && (dump_flags & TDF_DETAILS))
866    {
867      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
868      fprintf (dump_file, "  Part 1: ");
869      dump_bitmap (dump_file, dest->stmts);
870      fprintf (dump_file, "  Part 2: ");
871      dump_bitmap (dump_file, partition->stmts);
872    }
873
874  dest->kind = PKIND_NORMAL;
875  if (dest->type == PTYPE_PARALLEL)
876    dest->type = partition->type;
877
878  bitmap_ior_into (dest->stmts, partition->stmts);
879  if (partition_reduction_p (partition))
880    dest->reduction_p = true;
881
882  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
884  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
885    update_type_for_merge (rdg, dest, partition);
886
887  bitmap_ior_into (dest->datarefs, partition->datarefs);
888}
889
890
891/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
892   the LOOP.  */
893
894static bool
895ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
896{
897  imm_use_iterator imm_iter;
898  use_operand_p use_p;
899
900  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
901    {
902      if (is_gimple_debug (USE_STMT (use_p)))
903	continue;
904
905      basic_block use_bb = gimple_bb (USE_STMT (use_p));
906      if (!flow_bb_inside_loop_p (loop, use_bb))
907	return true;
908    }
909
910  return false;
911}
912
913/* Returns true when STMT defines a scalar variable used after the
914   loop LOOP.  */
915
916static bool
917stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
918{
919  def_operand_p def_p;
920  ssa_op_iter op_iter;
921
922  if (gimple_code (stmt) == GIMPLE_PHI)
923    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
924
925  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
926    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
927      return true;
928
929  return false;
930}
931
932/* Return a copy of LOOP placed before LOOP.  */
933
934static class loop *
935copy_loop_before (class loop *loop)
936{
937  class loop *res;
938  edge preheader = loop_preheader_edge (loop);
939
940  initialize_original_copy_tables ();
941  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
942  gcc_assert (res != NULL);
943  free_original_copy_tables ();
944  delete_update_ssa ();
945
946  return res;
947}
948
949/* Creates an empty basic block after LOOP.  */
950
951static void
952create_bb_after_loop (class loop *loop)
953{
954  edge exit = single_exit (loop);
955
956  if (!exit)
957    return;
958
959  split_edge (exit);
960}
961
962/* Generate code for PARTITION from the code in LOOP.  The loop is
963   copied when COPY_P is true.  All the statements not flagged in the
964   PARTITION bitmap are removed from the loop or from its copy.  The
965   statements are indexed in sequence inside a basic block, and the
966   basic blocks of a loop are taken in dom order.  */
967
968static void
969generate_loops_for_partition (class loop *loop, partition *partition,
970			      bool copy_p)
971{
972  unsigned i;
973  basic_block *bbs;
974
975  if (copy_p)
976    {
977      int orig_loop_num = loop->orig_loop_num;
978      loop = copy_loop_before (loop);
979      gcc_assert (loop != NULL);
980      loop->orig_loop_num = orig_loop_num;
981      create_preheader (loop, CP_SIMPLE_PREHEADERS);
982      create_bb_after_loop (loop);
983    }
984  else
985    {
986      /* Origin number is set to the new versioned loop's num.  */
987      gcc_assert (loop->orig_loop_num != loop->num);
988    }
989
990  /* Remove stmts not in the PARTITION bitmap.  */
991  bbs = get_loop_body_in_dom_order (loop);
992
993  if (MAY_HAVE_DEBUG_BIND_STMTS)
994    for (i = 0; i < loop->num_nodes; i++)
995      {
996	basic_block bb = bbs[i];
997
998	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
999	     gsi_next (&bsi))
1000	  {
1001	    gphi *phi = bsi.phi ();
1002	    if (!virtual_operand_p (gimple_phi_result (phi))
1003		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1004	      reset_debug_uses (phi);
1005	  }
1006
1007	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
1008	  {
1009	    gimple *stmt = gsi_stmt (bsi);
1010	    if (gimple_code (stmt) != GIMPLE_LABEL
1011		&& !is_gimple_debug (stmt)
1012		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1013	      reset_debug_uses (stmt);
1014	  }
1015      }
1016
1017  for (i = 0; i < loop->num_nodes; i++)
1018    {
1019      basic_block bb = bbs[i];
1020      edge inner_exit = NULL;
1021
1022      if (loop != bb->loop_father)
1023	inner_exit = single_exit (bb->loop_father);
1024
1025      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
1026	{
1027	  gphi *phi = bsi.phi ();
1028	  if (!virtual_operand_p (gimple_phi_result (phi))
1029	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1030	    remove_phi_node (&bsi, true);
1031	  else
1032	    gsi_next (&bsi);
1033	}
1034
1035      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
1036	{
1037	  gimple *stmt = gsi_stmt (bsi);
1038	  if (gimple_code (stmt) != GIMPLE_LABEL
1039	      && !is_gimple_debug (stmt)
1040	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1041	    {
1042	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
1043		 we choose its exit edge/path in order to avoid generating
1044		 infinite loop.  For all other cases, we choose an arbitrary
1045		 path through the empty CFG part that this unnecessary
1046		 control stmt controls.  */
1047	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
1048		{
1049		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
1050		    gimple_cond_make_true (cond_stmt);
1051		  else
1052		    gimple_cond_make_false (cond_stmt);
1053		  update_stmt (stmt);
1054		}
1055	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
1056		{
1057		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
1058		  gimple_switch_set_index
1059		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
1060		  update_stmt (stmt);
1061		}
1062	      else
1063		{
1064		  unlink_stmt_vdef (stmt);
1065		  gsi_remove (&bsi, true);
1066		  release_defs (stmt);
1067		  continue;
1068		}
1069	    }
1070	  gsi_next (&bsi);
1071	}
1072    }
1073
1074  free (bbs);
1075}
1076
1077/* If VAL memory representation contains the same value in all bytes,
1078   return that value, otherwise return -1.
1079   E.g. for 0x24242424 return 0x24, for IEEE double
1080   747708026454360457216.0 return 0x44, etc.  */
1081
1082static int
1083const_with_all_bytes_same (tree val)
1084{
1085  unsigned char buf[64];
1086  int i, len;
1087
1088  if (integer_zerop (val)
1089      || (TREE_CODE (val) == CONSTRUCTOR
1090          && !TREE_CLOBBER_P (val)
1091          && CONSTRUCTOR_NELTS (val) == 0))
1092    return 0;
1093
1094  if (real_zerop (val))
1095    {
1096      /* Only return 0 for +0.0, not for -0.0, which doesn't have
1097	 an all bytes same memory representation.  Don't transform
1098	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
1099      switch (TREE_CODE (val))
1100	{
1101	case REAL_CST:
1102	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
1103	    return 0;
1104	  break;
1105	case COMPLEX_CST:
1106	  if (!const_with_all_bytes_same (TREE_REALPART (val))
1107	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
1108	    return 0;
1109	  break;
1110	case VECTOR_CST:
1111	  {
1112	    unsigned int count = vector_cst_encoded_nelts (val);
1113	    unsigned int j;
1114	    for (j = 0; j < count; ++j)
1115	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
1116		break;
1117	    if (j == count)
1118	      return 0;
1119	    break;
1120	  }
1121	default:
1122	  break;
1123	}
1124    }
1125
1126  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
1127    return -1;
1128
1129  len = native_encode_expr (val, buf, sizeof (buf));
1130  if (len == 0)
1131    return -1;
1132  for (i = 1; i < len; i++)
1133    if (buf[i] != buf[0])
1134      return -1;
1135  return buf[0];
1136}
1137
1138/* Generate a call to memset for PARTITION in LOOP.  */
1139
1140static void
1141generate_memset_builtin (class loop *loop, partition *partition)
1142{
1143  gimple_stmt_iterator gsi;
1144  tree mem, fn, nb_bytes;
1145  tree val;
1146  struct builtin_info *builtin = partition->builtin;
1147  gimple *fn_call;
1148
1149  /* The new statements will be placed before LOOP.  */
1150  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1151
1152  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1153  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1154				       false, GSI_CONTINUE_LINKING);
1155  mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
1156  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1157				  false, GSI_CONTINUE_LINKING);
1158
1159  /* This exactly matches the pattern recognition in classify_partition.  */
1160  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1161  /* Handle constants like 0x15151515 and similarly
1162     floating point constants etc. where all bytes are the same.  */
1163  int bytev = const_with_all_bytes_same (val);
1164  if (bytev != -1)
1165    val = build_int_cst (integer_type_node, bytev);
1166  else if (TREE_CODE (val) == INTEGER_CST)
1167    val = fold_convert (integer_type_node, val);
1168  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1169    {
1170      tree tem = make_ssa_name (integer_type_node);
1171      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1172      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1173      val = tem;
1174    }
1175
1176  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1177  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1178  gimple_set_location (fn_call, partition->loc);
1179  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1180  fold_stmt (&gsi);
1181
1182  if (dump_file && (dump_flags & TDF_DETAILS))
1183    {
1184      fprintf (dump_file, "generated memset");
1185      if (bytev == 0)
1186	fprintf (dump_file, " zero\n");
1187      else
1188	fprintf (dump_file, "\n");
1189    }
1190}
1191
1192/* Generate a call to memcpy for PARTITION in LOOP.  */
1193
1194static void
1195generate_memcpy_builtin (class loop *loop, partition *partition)
1196{
1197  gimple_stmt_iterator gsi;
1198  gimple *fn_call;
1199  tree dest, src, fn, nb_bytes;
1200  enum built_in_function kind;
1201  struct builtin_info *builtin = partition->builtin;
1202
1203  /* The new statements will be placed before LOOP.  */
1204  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1205
1206  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1207  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1208				       false, GSI_CONTINUE_LINKING);
1209  dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
1210  src = rewrite_to_non_trapping_overflow (builtin->src_base);
1211  if (partition->kind == PKIND_MEMCPY
1212      || ! ptr_derefs_may_alias_p (dest, src))
1213    kind = BUILT_IN_MEMCPY;
1214  else
1215    kind = BUILT_IN_MEMMOVE;
1216  /* Try harder if we're copying a constant size.  */
1217  if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1218    {
1219      aff_tree asrc, adest;
1220      tree_to_aff_combination (src, ptr_type_node, &asrc);
1221      tree_to_aff_combination (dest, ptr_type_node, &adest);
1222      aff_combination_scale (&adest, -1);
1223      aff_combination_add (&asrc, &adest);
1224      if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
1225				     wi::to_poly_widest (nb_bytes)))
1226	kind = BUILT_IN_MEMCPY;
1227    }
1228
1229  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1230				   false, GSI_CONTINUE_LINKING);
1231  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1232				  false, GSI_CONTINUE_LINKING);
1233  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1234  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1235  gimple_set_location (fn_call, partition->loc);
1236  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1237  fold_stmt (&gsi);
1238
1239  if (dump_file && (dump_flags & TDF_DETAILS))
1240    {
1241      if (kind == BUILT_IN_MEMCPY)
1242	fprintf (dump_file, "generated memcpy\n");
1243      else
1244	fprintf (dump_file, "generated memmove\n");
1245    }
1246}
1247
1248/* Remove and destroy the loop LOOP.  */
1249
1250static void
1251destroy_loop (class loop *loop)
1252{
1253  unsigned nbbs = loop->num_nodes;
1254  edge exit = single_exit (loop);
1255  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1256  basic_block *bbs;
1257  unsigned i;
1258
1259  bbs = get_loop_body_in_dom_order (loop);
1260
1261  gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1262  bool safe_p = single_pred_p (exit->dest);
1263  for (unsigned i = 0; i < nbbs; ++i)
1264    {
1265      /* We have made sure to not leave any dangling uses of SSA
1266         names defined in the loop.  With the exception of virtuals.
1267	 Make sure we replace all uses of virtual defs that will remain
1268	 outside of the loop with the bare symbol as delete_basic_block
1269	 will release them.  */
1270      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1271	   gsi_next (&gsi))
1272	{
1273	  gphi *phi = gsi.phi ();
1274	  if (virtual_operand_p (gimple_phi_result (phi)))
1275	    mark_virtual_phi_result_for_renaming (phi);
1276	}
1277      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1278	{
1279	  gimple *stmt = gsi_stmt (gsi);
1280	  tree vdef = gimple_vdef (stmt);
1281	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
1282	    mark_virtual_operand_for_renaming (vdef);
1283	  /* Also move and eventually reset debug stmts.  We can leave
1284	     constant values in place in case the stmt dominates the exit.
1285	     ???  Non-constant values from the last iteration can be
1286	     replaced with final values if we can compute them.  */
1287	  if (gimple_debug_bind_p (stmt))
1288	    {
1289	      tree val = gimple_debug_bind_get_value (stmt);
1290	      gsi_move_before (&gsi, &dst_gsi);
1291	      if (val
1292		  && (!safe_p
1293		      || !is_gimple_min_invariant (val)
1294		      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1295		{
1296		  gimple_debug_bind_reset_value (stmt);
1297		  update_stmt (stmt);
1298		}
1299	    }
1300	  else
1301	    gsi_next (&gsi);
1302	}
1303    }
1304
1305  redirect_edge_pred (exit, src);
1306  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1307  exit->flags |= EDGE_FALLTHRU;
1308  cancel_loop_tree (loop);
1309  rescan_loop_exit (exit, false, true);
1310
1311  i = nbbs;
1312  do
1313    {
1314      --i;
1315      delete_basic_block (bbs[i]);
1316    }
1317  while (i != 0);
1318
1319  free (bbs);
1320
1321  set_immediate_dominator (CDI_DOMINATORS, dest,
1322			   recompute_dominator (CDI_DOMINATORS, dest));
1323}
1324
1325/* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1326
1327static bool
1328generate_code_for_partition (class loop *loop,
1329			     partition *partition, bool copy_p)
1330{
1331  switch (partition->kind)
1332    {
1333    case PKIND_NORMAL:
1334    case PKIND_PARTIAL_MEMSET:
1335      /* Reductions all have to be in the last partition.  */
1336      gcc_assert (!partition_reduction_p (partition)
1337		  || !copy_p);
1338      generate_loops_for_partition (loop, partition, copy_p);
1339      return false;
1340
1341    case PKIND_MEMSET:
1342      generate_memset_builtin (loop, partition);
1343      break;
1344
1345    case PKIND_MEMCPY:
1346    case PKIND_MEMMOVE:
1347      generate_memcpy_builtin (loop, partition);
1348      break;
1349
1350    default:
1351      gcc_unreachable ();
1352    }
1353
1354  /* Common tail for partitions we turn into a call.  If this was the last
1355     partition for which we generate code, we have to destroy the loop.  */
1356  if (!copy_p)
1357    return true;
1358  return false;
1359}
1360
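/* Return the data dependence relation between data references A and B,
   computing it and caching it in ddrs_table on first use.  */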
1361data_dependence_relation *
1362loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1363					data_reference_p b)
1364{
1365  struct data_dependence_relation ent, **slot;
1366  struct data_dependence_relation *ddr;
1367
1368  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1369  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1370	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1371  ent.a = a;
1372  ent.b = b;
1373  slot = ddrs_table->find_slot (&ent, INSERT);
1374  if (*slot == NULL)
1375    {
1376      ddr = initialize_data_dependence_relation (a, b, loop_nest);
1377      compute_affine_dependence (ddr, loop_nest[0]);
1378      *slot = ddr;
1379    }
1380
1381  return *slot;
1382}
1383
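/* Return true if the data dependence between DR1 and DR2 introduces a
   dependence cycle that cannot be resolved by a runtime alias check.  */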
1384bool
1385loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
1386					data_reference_p dr1,
1387					data_reference_p dr2)
1388{
1389  struct data_dependence_relation *ddr;
1390
1391  /* Re-shuffle data-refs to be in topological order.  */
1392  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1393      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1394    std::swap (dr1, dr2);
1395
1396  ddr = get_data_dependence (rdg, dr1, dr2);
1397
1398  /* In case of no data dependence.  */
1399  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1400    return false;
1401  /* For unknown data dependence or known data dependence which can't be
1402     expressed in classic distance vector, we check if it can be resolved
1403     by runtime alias check.  If yes, we still consider data dependence
1404     as won't introduce data dependence cycle.  */
1405  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1406	   || DDR_NUM_DIST_VECTS (ddr) == 0)
1407    return !runtime_alias_check_p (ddr, NULL, true);
1408  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1409    return true;
1410  else if (DDR_REVERSED_P (ddr)
1411	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1412    return false;
1413
1414  return true;
1415}
1416
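/* Given partitions PARTITION1 and PARTITION2 that are being merged, downgrade
   PARTITION1 to PTYPE_SEQUENTIAL if any pair of their data references forms a
   data dependence cycle.  */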
1417void
1418loop_distribution::update_type_for_merge (struct graph *rdg,
1419					   partition *partition1,
1420					   partition *partition2)
1421{
1422  unsigned i, j;
1423  bitmap_iterator bi, bj;
1424  data_reference_p dr1, dr2;
1425
1426  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1427    {
1428      unsigned start = (partition1 == partition2) ? i + 1 : 0;
1429
1430      dr1 = datarefs_vec[i];
1431      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1432	{
1433	  dr2 = datarefs_vec[j];
1434	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1435	    continue;
1436
1437	  /* Partition can only be executed sequentially if there is any
1438	     data dependence cycle.  */
1439	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1440	    {
1441	      partition1->type = PTYPE_SEQUENTIAL;
1442	      return;
1443	    }
1444	}
1445    }
1446}
1447
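/* Return a new partition containing vertex V of RDG and every vertex it
   depends on, classifying it as PTYPE_SEQUENTIAL if it contains an
   unanalyzable data reference or a data dependence cycle.  */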
1448partition *
1449loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
1450{
1451  partition *partition = partition_alloc ();
1452  auto_vec<int, 3> nodes;
1453  unsigned i, j;
1454  int x;
1455  data_reference_p dr;
1456
1457  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1458
1459  FOR_EACH_VEC_ELT (nodes, i, x)
1460    {
1461      bitmap_set_bit (partition->stmts, x);
1462
1463      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1464	{
1465	  unsigned idx = (unsigned) DR_INDEX (dr);
1466	  gcc_assert (idx < datarefs_vec.length ());
1467
1468	  /* Partition can only be executed sequentially if there is any
1469	     unknown data reference.  */
1470	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1471	      || !DR_INIT (dr) || !DR_STEP (dr))
1472	    partition->type = PTYPE_SEQUENTIAL;
1473
1474	  bitmap_set_bit (partition->datarefs, idx);
1475	}
1476    }
1477
1478  if (partition->type == PTYPE_SEQUENTIAL)
1479    return partition;
1480
  /* Further check if any data dependence prevents us from executing the
     partition in parallel.  */
1483  update_type_for_merge (rdg, partition, partition);
1484
1485  return partition;
1486}
1487
1488/* Given PARTITION of LOOP and RDG, record single load/store data references
   for a builtin partition in SRC_DR/DST_DR, return false if there are no such
   data references.  */
1491
1492static bool
1493find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
1494		 data_reference_p *dst_dr, data_reference_p *src_dr)
1495{
1496  unsigned i;
1497  data_reference_p single_ld = NULL, single_st = NULL;
1498  bitmap_iterator bi;
1499
1500  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1501    {
1502      gimple *stmt = RDG_STMT (rdg, i);
1503      data_reference_p dr;
1504
1505      if (gimple_code (stmt) == GIMPLE_PHI)
1506	continue;
1507
1508      /* Any scalar stmts are ok.  */
1509      if (!gimple_vuse (stmt))
1510	continue;
1511
1512      /* Otherwise just regular loads/stores.  */
1513      if (!gimple_assign_single_p (stmt))
1514	return false;
1515
1516      /* But exactly one store and/or load.  */
1517      for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1518	{
1519	  tree type = TREE_TYPE (DR_REF (dr));
1520
1521	  /* The memset, memcpy and memmove library calls are only
1522	     able to deal with generic address space.  */
1523	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1524	    return false;
1525
1526	  if (DR_IS_READ (dr))
1527	    {
1528	      if (single_ld != NULL)
1529		return false;
1530	      single_ld = dr;
1531	    }
1532	  else
1533	    {
1534	      if (single_st != NULL)
1535		return false;
1536	      single_st = dr;
1537	    }
1538	}
1539    }
1540
1541  if (!single_st)
1542    return false;
1543
1544  /* Bail out if this is a bitfield memory reference.  */
1545  if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1546      && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1547    return false;
1548
1549  /* Data reference must be executed exactly once per iteration of each
1550     loop in the loop nest.  We only need to check dominance information
1551     against the outermost one in a perfect loop nest because a bb can't
1552     dominate outermost loop's latch without dominating inner loop's.  */
1553  basic_block bb_st = gimple_bb (DR_STMT (single_st));
1554  if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1555    return false;
1556
1557  if (single_ld)
1558    {
1559      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1560      /* Direct aggregate copy or via an SSA name temporary.  */
1561      if (load != store
1562	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1563	return false;
1564
1565      /* Bail out if this is a bitfield memory reference.  */
1566      if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1567	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1568	return false;
1569
1570      /* Load and store must be in the same loop nest.  */
1571      basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1572      if (bb_st->loop_father != bb_ld->loop_father)
1573	return false;
1574
1575      /* Data reference must be executed exactly once per iteration.
1576	 Same as single_st, we only need to check against the outermost
1577	 loop.  */
1578      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1579	return false;
1580
1581      edge e = single_exit (bb_st->loop_father);
1582      bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1583      bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1584      if (dom_ld != dom_st)
1585	return false;
1586    }
1587
1588  *src_dr = single_ld;
1589  *dst_dr = single_st;
1590  return true;
1591}
1592
1593/* Given data reference DR in LOOP_NEST, this function checks the enclosing
1594   loops from inner to outer to see if the loop's step equals the access
1595   size at each level of the loop nest.  Return 2 if we can prove this at
1596   all loop levels; record the access base and size in BASE and SIZE; save
1597   the loop's step at each level in STEPS if it is not null.  For example:
1598
1599     int arr[100][100][100];
1600     for (i = 0; i < 100; i++)       ;steps[2] = 40000
1601       for (j = 100; j > 0; j--)     ;steps[1] = -400
1602	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1603	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1604
1605   Return 1 if we can prove the equality for the innermost loop, but not
1606   for all loop levels.  In this case, no information is recorded.
1607
1608   Return 0 if the equality cannot be proven at any loop level.  */
1609
1610static int
1611compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1612		      tree *size, vec<tree> *steps = NULL)
1613{
1614  location_t loc = gimple_location (DR_STMT (dr));
1615  basic_block bb = gimple_bb (DR_STMT (dr));
1616  class loop *loop = bb->loop_father;
1617  tree ref = DR_REF (dr);
1618  tree access_base = build_fold_addr_expr (ref);
1619  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1620  int res = 0;
1621
1622  do {
1623      tree scev_fn = analyze_scalar_evolution (loop, access_base);
1624      if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1625	return res;
1626
1627      access_base = CHREC_LEFT (scev_fn);
1628      if (tree_contains_chrecs (access_base, NULL))
1629	return res;
1630
1631      tree scev_step = CHREC_RIGHT (scev_fn);
1632      /* Only support constant steps.  */
1633      if (TREE_CODE (scev_step) != INTEGER_CST)
1634	return res;
1635
1636      enum ev_direction access_dir = scev_direction (scev_fn);
1637      if (access_dir == EV_DIR_UNKNOWN)
1638	return res;
1639
1640      if (steps != NULL)
1641	steps->safe_push (scev_step);
1642
1643      scev_step = fold_convert_loc (loc, sizetype, scev_step);
1644      /* Compute absolute value of scev step.  */
1645      if (access_dir == EV_DIR_DECREASES)
1646	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1647
1648      /* At each level of the loop, the scev step must equal the access
1649	 size, i.e. DR must access consecutive memory between iterations.  */
1650      if (!operand_equal_p (scev_step, access_size, 0))
1651	return res;
1652
1653      /* The access stride can be computed for the data reference at least
1654	 for the innermost loop.  */
1655      res = 1;
1656
1657      /* Compute DR's execution times in loop.  */
1658      tree niters = number_of_latch_executions (loop);
1659      niters = fold_convert_loc (loc, sizetype, niters);
1660      if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1661	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1662
1663      /* Compute DR's overall access size in loop.  */
1664      access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1665				     niters, scev_step);
1666      /* Adjust base address in case of negative step.  */
1667      if (access_dir == EV_DIR_DECREASES)
1668	{
1669	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1670				      scev_step, access_size);
1671	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1672	}
1673  } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1674
1675  *base = access_base;
1676  *size = access_size;
1677  /* Access stride can be computed for the data reference at each loop level.  */
1678  return 2;
1679}
1680
1681/* Allocate and return builtin struct.  Record information like DST_DR,
1682   SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1683
1684static struct builtin_info *
1685alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1686	       tree dst_base, tree src_base, tree size)
1687{
1688  struct builtin_info *builtin = XNEW (struct builtin_info);
1689  builtin->dst_dr = dst_dr;
1690  builtin->src_dr = src_dr;
1691  builtin->dst_base = dst_base;
1692  builtin->src_base = src_base;
1693  builtin->size = size;
1694  return builtin;
1695}
1696
1697/* Given data reference DR in loop nest LOOP, classify whether it forms a
1698   builtin memset call.  */
1699
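/* For instance, a simple loop like

     for (i = 0; i < n; i++)
       a[i] = 0;

   is the typical case classified as a memset here, provided the stored
   value can serve as a memset byte and the accesses are proven to cover
   consecutive memory over the whole range.  */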
1700static void
1701classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1702{
1703  gimple *stmt = DR_STMT (dr);
1704  tree base, size, rhs = gimple_assign_rhs1 (stmt);
1705
1706  if (const_with_all_bytes_same (rhs) == -1
1707      && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1708	  || (TYPE_MODE (TREE_TYPE (rhs))
1709	      != TYPE_MODE (unsigned_char_type_node))))
1710    return;
1711
1712  if (TREE_CODE (rhs) == SSA_NAME
1713      && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1714      && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1715    return;
1716
1717  int res = compute_access_range (loop, dr, &base, &size);
1718  if (res == 0)
1719    return;
1720  if (res == 1)
1721    {
1722      partition->kind = PKIND_PARTIAL_MEMSET;
1723      return;
1724    }
1725
1726  poly_uint64 base_offset;
1727  unsigned HOST_WIDE_INT const_base_offset;
1728  tree base_base = strip_offset (base, &base_offset);
1729  if (!base_offset.is_constant (&const_base_offset))
1730    return;
1731
1732  struct builtin_info *builtin;
1733  builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1734  builtin->dst_base_base = base_base;
1735  builtin->dst_base_offset = const_base_offset;
1736  partition->builtin = builtin;
1737  partition->kind = PKIND_MEMSET;
1738}
1739
1740/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1741   whether they form a builtin memcpy or memmove call.  */
1742
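/* For instance, a copy loop like

     for (i = 0; i < n; i++)
       a[i] = b[i];

   is classified as a memcpy when the load provably does not depend on the
   store, or as a memmove when the dependence between them still permits a
   bulk copy.  */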
1743void
1744loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
1745					  partition *partition,
1746					  data_reference_p dst_dr,
1747					  data_reference_p src_dr)
1748{
1749  tree base, size, src_base, src_size;
1750  auto_vec<tree> dst_steps, src_steps;
1751
1752  /* Compute access range of both load and store.  */
1753  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1754  if (res != 2)
1755    return;
1756  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1757  if (res != 2)
1758    return;
1759
1760  /* They must have the same access size.  */
1761  if (!operand_equal_p (size, src_size, 0))
1762    return;
1763
1764  /* Load and store in the loop nest must access memory in the same way, i.e.,
1765     they must have the same steps in each loop of the nest.  */
1766  if (dst_steps.length () != src_steps.length ())
1767    return;
1768  for (unsigned i = 0; i < dst_steps.length (); ++i)
1769    if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1770      return;
1771
1772  /* Now check whether there is a dependence between the load and the store.  */
1773  ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1774
1775  /* Classify as memcpy if there is no dependence between load and store.  */
1776  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1777    {
1778      partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1779      partition->kind = PKIND_MEMCPY;
1780      return;
1781    }
1782
1783  /* Can't do memmove in case of unknown dependence or dependence without
1784     classical distance vector.  */
1785  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1786      || DDR_NUM_DIST_VECTS (ddr) == 0)
1787    return;
1788
1789  unsigned i;
1790  lambda_vector dist_v;
1791  int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1792  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1793    {
1794      unsigned dep_lev = dependence_level (dist_v, num_lev);
1795      /* Can't do memmove if load depends on store.  */
1796      if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1797	return;
1798    }
1799
1800  partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1801  partition->kind = PKIND_MEMMOVE;
1802  return;
1803}
1804
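/* Classify partition PARTITION of LOOP given its reduced dependence graph
   RDG: detect statements with uses outside of the loop (reductions) and,
   when the partition consists of a single store and at most a single load,
   classify it as a builtin memset/memcpy/memmove kind.
   STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
   Return true if a reduction statement that belongs to all partitions was
   found.  */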
1805bool
1806loop_distribution::classify_partition (loop_p loop,
1807				       struct graph *rdg, partition *partition,
1808				       bitmap stmt_in_all_partitions)
1809{
1810  bitmap_iterator bi;
1811  unsigned i;
1812  data_reference_p single_ld = NULL, single_st = NULL;
1813  bool volatiles_p = false, has_reduction = false;
1814
1815  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1816    {
1817      gimple *stmt = RDG_STMT (rdg, i);
1818
1819      if (gimple_has_volatile_ops (stmt))
1820	volatiles_p = true;
1821
1822      /* If the stmt is not included by all partitions and there are uses
1823	 outside of the loop, then mark the partition as a reduction.  */
1824      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1825	{
1826	  /* Due to a limitation in the transform phase we have to fuse all
1827	     reduction partitions.  As a result, this could cancel valid
1828	     loop distribution, especially for a loop whose induction
1829	     variable is used outside of it.  To work around this issue, we
1830	     skip marking the partition as a reduction if the reduction stmt
1831	     belongs to all partitions.  In that case, the reduction will be
1832	     computed correctly no matter how partitions are fused/distributed.  */
1833	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1834	    partition->reduction_p = true;
1835	  else
1836	    has_reduction = true;
1837	}
1838    }
1839
1840  /* Simple workaround to prevent classifying the partition as a builtin
1841     if it contains any use outside of the loop.  For the case where all
1842     partitions have the reduction, this simple workaround is delayed
1843     so that it only affects the last partition.  */
1844  if (partition->reduction_p)
1845     return has_reduction;
1846
1847  /* Perform general partition disqualification for builtins.  */
1848  if (volatiles_p
1849      || !flag_tree_loop_distribute_patterns)
1850    return has_reduction;
1851
1852  /* Find single load/store data references for builtin partition.  */
1853  if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1854    return has_reduction;
1855
1856  partition->loc = gimple_location (DR_STMT (single_st));
1857
1858  /* Classify the builtin kind.  */
1859  if (single_ld == NULL)
1860    classify_builtin_st (loop, partition, single_st);
1861  else
1862    classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1863  return has_reduction;
1864}
1865
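/* Return true if partitions PARTITION1 and PARTITION2 of RDG access the
   same memory: either they share a load or store statement, or each
   contains a well-analyzed data reference with the same base address,
   offset, init and step as one in the other partition.  */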
1866bool
1867loop_distribution::share_memory_accesses (struct graph *rdg,
1868		       partition *partition1, partition *partition2)
1869{
1870  unsigned i, j;
1871  bitmap_iterator bi, bj;
1872  data_reference_p dr1, dr2;
1873
1874  /* First check whether the intersection of the two partitions contains
1875     any loads or stores.  Common loads are the situation that happens
1876     most often.  */
1877  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1878    if (RDG_MEM_WRITE_STMT (rdg, i)
1879	|| RDG_MEM_READS_STMT (rdg, i))
1880      return true;
1881
1882  /* Then check whether the two partitions access the same memory object.  */
1883  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1884    {
1885      dr1 = datarefs_vec[i];
1886
1887      if (!DR_BASE_ADDRESS (dr1)
1888	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1889	continue;
1890
1891      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1892	{
1893	  dr2 = datarefs_vec[j];
1894
1895	  if (!DR_BASE_ADDRESS (dr2)
1896	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1897	    continue;
1898
1899	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1900	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1901	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1902	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1903	    return true;
1904	}
1905    }
1906
1907  return false;
1908}
1909
1910/* For each seed statement in STARTING_STMTS, this function builds a
1911   partition for it by adding dependent statements according to RDG.
1912   All partitions are recorded in PARTITIONS.  */
1913
1914void
1915loop_distribution::rdg_build_partitions (struct graph *rdg,
1916					 vec<gimple *> starting_stmts,
1917					 vec<partition *> *partitions)
1918{
1919  auto_bitmap processed;
1920  int i;
1921  gimple *stmt;
1922
1923  FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1924    {
1925      int v = rdg_vertex_for_stmt (rdg, stmt);
1926
1927      if (dump_file && (dump_flags & TDF_DETAILS))
1928	fprintf (dump_file,
1929		 "ldist asked to generate code for vertex %d\n", v);
1930
1931      /* If the vertex is already contained in another partition, so
1932         is the partition rooted at it.  */
1933      if (bitmap_bit_p (processed, v))
1934	continue;
1935
1936      partition *partition = build_rdg_partition_for_vertex (rdg, v);
1937      bitmap_ior_into (processed, partition->stmts);
1938
1939      if (dump_file && (dump_flags & TDF_DETAILS))
1940	{
1941	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1942		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1943	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1944	}
1945
1946      partitions->safe_push (partition);
1947    }
1948
1949  /* All vertices should have been assigned to at least one partition now,
1950     other than vertices belonging to dead code.  */
1951}
1952
1953/* Dump to FILE the PARTITIONS.  */
1954
1955static void
1956dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1957{
1958  int i;
1959  partition *partition;
1960
1961  FOR_EACH_VEC_ELT (partitions, i, partition)
1962    debug_bitmap_file (file, partition->stmts);
1963}
1964
1965/* Debug PARTITIONS.  */
1966extern void debug_rdg_partitions (vec<partition *> );
1967
1968DEBUG_FUNCTION void
1969debug_rdg_partitions (vec<partition *> partitions)
1970{
1971  dump_rdg_partitions (stderr, partitions);
1972}
1973
1974/* Returns the number of read and write operations in the RDG.  */
1975
1976static int
1977number_of_rw_in_rdg (struct graph *rdg)
1978{
1979  int i, res = 0;
1980
1981  for (i = 0; i < rdg->n_vertices; i++)
1982    {
1983      if (RDG_MEM_WRITE_STMT (rdg, i))
1984	++res;
1985
1986      if (RDG_MEM_READS_STMT (rdg, i))
1987	++res;
1988    }
1989
1990  return res;
1991}
1992
1993/* Returns the number of read and write operations in a PARTITION of
1994   the RDG.  */
1995
1996static int
1997number_of_rw_in_partition (struct graph *rdg, partition *partition)
1998{
1999  int res = 0;
2000  unsigned i;
2001  bitmap_iterator ii;
2002
2003  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2004    {
2005      if (RDG_MEM_WRITE_STMT (rdg, i))
2006	++res;
2007
2008      if (RDG_MEM_READS_STMT (rdg, i))
2009	++res;
2010    }
2011
2012  return res;
2013}
2014
2015/* Returns true when one of the PARTITIONS contains all the read or
2016   write operations of RDG.  */
2017
2018static bool
2019partition_contains_all_rw (struct graph *rdg,
2020			   vec<partition *> partitions)
2021{
2022  int i;
2023  partition *partition;
2024  int nrw = number_of_rw_in_rdg (rdg);
2025
2026  FOR_EACH_VEC_ELT (partitions, i, partition)
2027    if (nrw == number_of_rw_in_partition (rdg, partition))
2028      return true;
2029
2030  return false;
2031}
2032
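/* Check data dependences between all data references in DRS1 and DRS2 of
   RDG and combine the result with the incoming direction DIR: 0 means no
   dependence, -1 backward, 1 forward, 2 both directions.  Return the
   combined direction; return 2 as soon as opposite directions (a cycle)
   are found.  Dependences that could be resolved by a runtime alias check
   are pushed onto ALIAS_DDRS if it is not NULL.  */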
2033int
2034loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2035			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
2036{
2037  unsigned i, j;
2038  bitmap_iterator bi, bj;
2039  data_reference_p dr1, dr2, saved_dr1;
2040
2041  /* Dependence direction: 0 is no dependence, -1 is backward,
2042     1 is forward, 2 is both (we can stop then, merging will occur).  */
2043  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
2044    {
2045      dr1 = datarefs_vec[i];
2046
2047      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2048	{
2049	  int res, this_dir = 1;
2050	  ddr_p ddr;
2051
2052	  dr2 = datarefs_vec[j];
2053
2054	  /* Skip all <read, read> data dependence.  */
2055	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2056	    continue;
2057
2058	  saved_dr1 = dr1;
2059	  /* Re-shuffle data-refs to be in topological order.  */
2060	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
2061	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
2062	    {
2063	      std::swap (dr1, dr2);
2064	      this_dir = -this_dir;
2065	    }
2066	  ddr = get_data_dependence (rdg, dr1, dr2);
2067	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2068	    {
2069	      this_dir = 0;
2070	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2071					   DR_BASE_ADDRESS (dr2));
2072	      /* Be conservative.  If the data references are not well analyzed,
2073		 or the two data references have the same base address and
2074		 offset, add the dependence and consider them as aliasing each
2075		 other.  In other words, the dependence cannot be resolved by
2076		 a runtime alias check.  */
2077	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2078		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2079		  || !DR_INIT (dr1) || !DR_INIT (dr2)
2080		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2081		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2082		  || res == 0)
2083		this_dir = 2;
2084	      /* Data dependence could be resolved by runtime alias check,
2085		 record it in ALIAS_DDRS.  */
2086	      else if (alias_ddrs != NULL)
2087		alias_ddrs->safe_push (ddr);
2088	      /* Or simply ignore it.  */
2089	    }
2090	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
2091	    {
2092	      if (DDR_REVERSED_P (ddr))
2093		this_dir = -this_dir;
2094
2095	      /* Known dependences can still be unordered throughout the
2096		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
2097		 gcc.dg/tree-ssa/pr94969.c.  */
2098	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
2099		this_dir = 2;
2100	      /* If the overlap is exact preserve stmt order.  */
2101	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2102					    DDR_NB_LOOPS (ddr)))
2103		;
2104	      /* Else, as the distance vector is lexicographically positive,
2105		 swap the dependence direction.  */
2106	      else
2107		this_dir = -this_dir;
2108	    }
2109	  else
2110	    this_dir = 0;
2111	  if (this_dir == 2)
2112	    return 2;
2113	  else if (dir == 0)
2114	    dir = this_dir;
2115	  else if (this_dir != 0 && dir != this_dir)
2116	    return 2;
2117	  /* Shuffle "back" dr1.  */
2118	  dr1 = saved_dr1;
2119	}
2120    }
2121  return dir;
2122}
2123
2124/* Compare postorder number of the partition graph vertices V1 and V2.  */
2125
2126static int
2127pgcmp (const void *v1_, const void *v2_)
2128{
2129  const vertex *v1 = (const vertex *)v1_;
2130  const vertex *v2 = (const vertex *)v2_;
2131  return v2->post - v1->post;
2132}
2133
2134/* Data attached to vertices of partition dependence graph.  */
2135struct pg_vdata
2136{
2137  /* ID of the corresponding partition.  */
2138  int id;
2139  /* The partition.  */
2140  struct partition *partition;
2141};
2142
2143/* Data attached to edges of partition dependence graph.  */
2144struct pg_edata
2145{
2146  /* If the dependence edge can be resolved by runtime alias check,
2147     this vector contains data dependence relations for runtime alias
2148     check.  On the other hand, if the dependence edge is introduced
2149     because of compilation time known data dependence, this vector
2150     contains nothing.  */
2151  vec<ddr_p> alias_ddrs;
2152};
2153
2154/* Callback data for traversing edges in graph.  */
2155struct pg_edge_callback_data
2156{
2157  /* Bitmap of strongly connected components that should be merged.  */
2158  bitmap sccs_to_merge;
2159  /* Array containing component information for all vertices.  */
2160  int *vertices_component;
2161  /* Vector to record all data dependence relations which are needed
2162     to break strong connected components by runtime alias checks.  */
2163  vec<ddr_p> *alias_ddrs;
2164};
2165
2166/* Initialize vertex data for partition dependence graph PG with
2167   PARTITIONS.  */
2168
2169static void
2170init_partition_graph_vertices (struct graph *pg,
2171			       vec<struct partition *> *partitions)
2172{
2173  int i;
2174  partition *partition;
2175  struct pg_vdata *data;
2176
2177  for (i = 0; partitions->iterate (i, &partition); ++i)
2178    {
2179      data = new pg_vdata;
2180      pg->vertices[i].data = data;
2181      data->id = i;
2182      data->partition = partition;
2183    }
2184}
2185
2186/* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2187   dependence relations to the EDGE if DDRS isn't NULL.  */
2188
2189static void
2190add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2191{
2192  struct graph_edge *e = add_edge (pg, i, j);
2193
2194  /* If the edge has data dependence relations attached, it means this
2195     dependence edge can be resolved by runtime alias checks.  */
2196  if (ddrs != NULL)
2197    {
2198      struct pg_edata *data = new pg_edata;
2199
2200      gcc_assert (ddrs->length () > 0);
2201      e->data = data;
2202      data->alias_ddrs = vNULL;
2203      data->alias_ddrs.safe_splice (*ddrs);
2204    }
2205}
2206
2207/* Callback function for the graph traversal algorithm.  It returns true
2208   if edge E should be skipped when traversing the graph.  */
2209
2210static bool
2211pg_skip_alias_edge (struct graph_edge *e)
2212{
2213  struct pg_edata *data = (struct pg_edata *)e->data;
2214  return (data != NULL && data->alias_ddrs.length () > 0);
2215}
2216
2217/* Callback function freeing data attached to edge E of graph.  */
2218
2219static void
2220free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2221{
2222  if (e->data != NULL)
2223    {
2224      struct pg_edata *data = (struct pg_edata *)e->data;
2225      data->alias_ddrs.release ();
2226      delete data;
2227    }
2228}
2229
2230/* Free data attached to vertices of partition dependence graph PG.  */
2231
2232static void
2233free_partition_graph_vdata (struct graph *pg)
2234{
2235  int i;
2236  struct pg_vdata *data;
2237
2238  for (i = 0; i < pg->n_vertices; ++i)
2239    {
2240      data = (struct pg_vdata *)pg->vertices[i].data;
2241      delete data;
2242    }
2243}
2244
2245/* Build and return partition dependence graph for PARTITIONS.  RDG is
2246   reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
2247   is true, data dependence caused by possible alias between references
2248   is ignored, as if it doesn't exist at all; otherwise all dependences
2249   are considered.  */
2250
2251struct graph *
2252loop_distribution::build_partition_graph (struct graph *rdg,
2253					  vec<struct partition *> *partitions,
2254					  bool ignore_alias_p)
2255{
2256  int i, j;
2257  struct partition *partition1, *partition2;
2258  graph *pg = new_graph (partitions->length ());
2259  auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2260
2261  alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2262
2263  init_partition_graph_vertices (pg, partitions);
2264
2265  for (i = 0; partitions->iterate (i, &partition1); ++i)
2266    {
2267      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2268	{
2269	  /* Dependence direction: 0 is no dependence, -1 is backward,
2270	     1 is forward, 2 is both (we can stop then, merging will occur).  */
2271	  int dir = 0;
2272
2273	  /* If the first partition has a reduction, add a backward edge; if
2274	     the second partition has a reduction, add a forward edge.  This
2275	     makes sure that the reduction partition will be sorted last.  */
2276	  if (partition_reduction_p (partition1))
2277	    dir = -1;
2278	  else if (partition_reduction_p (partition2))
2279	    dir = 1;
2280
2281	  /* Cleanup the temporary vector.  */
2282	  alias_ddrs.truncate (0);
2283
2284	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2285					 partition2->datarefs, alias_ddrs_p);
2286
2287	  /* Add an edge to the partition graph if a dependence exists.  There
2288	     are two types of edges.  One is caused by a compilation time
2289	     known dependence and cannot be resolved by a runtime alias
2290	     check.  The other type can be resolved by a runtime alias
2291	     check.  */
2292	  if (dir == 1 || dir == 2
2293	      || alias_ddrs.length () > 0)
2294	    {
2295	      /* Attach data dependence relations to edge that can be resolved
2296		 by runtime alias check.  */
2297	      bool alias_edge_p = (dir != 1 && dir != 2);
2298	      add_partition_graph_edge (pg, i, j,
2299					(alias_edge_p) ? &alias_ddrs : NULL);
2300	    }
2301	  if (dir == -1 || dir == 2
2302	      || alias_ddrs.length () > 0)
2303	    {
2304	      /* Attach data dependence relations to edge that can be resolved
2305		 by runtime alias check.  */
2306	      bool alias_edge_p = (dir != -1 && dir != 2);
2307	      add_partition_graph_edge (pg, j, i,
2308					(alias_edge_p) ? &alias_ddrs : NULL);
2309	    }
2310	}
2311    }
2312  return pg;
2313}
2314
2315/* Sort partitions in PG in descending post order and store them in
2316   PARTITIONS.  */
2317
2318static void
2319sort_partitions_by_post_order (struct graph *pg,
2320			       vec<struct partition *> *partitions)
2321{
2322  int i;
2323  struct pg_vdata *data;
2324
2325  /* Now order the remaining nodes in descending postorder.  */
2326  qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2327  partitions->truncate (0);
2328  for (i = 0; i < pg->n_vertices; ++i)
2329    {
2330      data = (struct pg_vdata *)pg->vertices[i].data;
2331      if (data->partition)
2332	partitions->safe_push (data->partition);
2333    }
2334}
2335
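/* Fuse PARTITIONS that form dependence cycles.  Build the partition
   dependence graph from RDG, compute its strongly connected components
   and merge all partitions within the same component into one sequential
   partition.  If IGNORE_ALIAS_P is true, dependences caused only by
   possible aliasing between references are ignored when building the
   graph.  */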
2336void
2337loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2338					     vec<struct partition *> *partitions,
2339					     bool ignore_alias_p)
2340{
2341  struct partition *partition1, *partition2;
2342  struct pg_vdata *data;
2343  graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2344  int i, j, num_sccs = graphds_scc (pg, NULL);
2345
2346  /* A strongly connected component means a dependence cycle, which we
2347     cannot distribute.  So fuse its partitions together.  */
2348  if ((unsigned) num_sccs < partitions->length ())
2349    {
2350      for (i = 0; i < num_sccs; ++i)
2351	{
2352	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2353	    if (pg->vertices[j].component == i)
2354	      break;
2355	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2356	    if (pg->vertices[j].component == i)
2357	      {
2358		partition_merge_into (NULL, partition1,
2359				      partition2, FUSE_SAME_SCC);
2360		partition1->type = PTYPE_SEQUENTIAL;
2361		(*partitions)[j] = NULL;
2362		partition_free (partition2);
2363		data = (struct pg_vdata *)pg->vertices[j].data;
2364		data->partition = NULL;
2365	      }
2366	}
2367    }
2368
2369  sort_partitions_by_post_order (pg, partitions);
2370  gcc_assert (partitions->length () == (unsigned)num_sccs);
2371  free_partition_graph_vdata (pg);
2372  free_graph (pg);
2373}
2374
2375/* Callback function for traversing edge E in graph G.  DATA is private
2376   callback data.  */
2377
2378static void
2379pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2380{
2381  int i, j, component;
2382  struct pg_edge_callback_data *cbdata;
2383  struct pg_edata *edata = (struct pg_edata *) e->data;
2384
2385  /* If the edge doesn't have attached data dependence, it represents
2386     compilation time known dependences.  This type dependence cannot
2387     be resolved by runtime alias check.  */
2388  if (edata == NULL || edata->alias_ddrs.length () == 0)
2389    return;
2390
2391  cbdata = (struct pg_edge_callback_data *) data;
2392  i = e->src;
2393  j = e->dest;
2394  component = cbdata->vertices_component[i];
2395  /* Vertices are topologically sorted according to compilation time
2396     known dependences, so we can break strongly connected components
2397     by removing edges of the opposite direction, i.e., edges pointing
2398     from a vertex with a smaller post number to a vertex with a bigger
2399     post number.  */
2400  if (g->vertices[i].post < g->vertices[j].post
2401      /* We only need to remove edges connecting vertices in the same
2402	 strong connected component to break it.  */
2403      && component == cbdata->vertices_component[j]
2404      /* Check if we want to break the strong connected component or not.  */
2405      && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2406    cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2407}
2408
2409/* Callback function for traversing edge E.  DATA is private
2410   callback data.  */
2411
2412static void
2413pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2414{
2415  int i, j, component;
2416  struct pg_edge_callback_data *cbdata;
2417  struct pg_edata *edata = (struct pg_edata *) e->data;
2418
2419  if (edata == NULL || edata->alias_ddrs.length () == 0)
2420    return;
2421
2422  cbdata = (struct pg_edge_callback_data *) data;
2423  i = e->src;
2424  j = e->dest;
2425  component = cbdata->vertices_component[i];
2426  /* Make sure to not skip vertices inside SCCs we are going to merge.  */
2427  if (component == cbdata->vertices_component[j]
2428      && bitmap_bit_p (cbdata->sccs_to_merge, component))
2429    {
2430      edata->alias_ddrs.release ();
2431      delete edata;
2432      e->data = NULL;
2433    }
2434}
2435
2436/* This is the main function breaking strongly connected components in
2437   PARTITIONS given the reduced dependence graph RDG.  Store data dependence
2438   relations for runtime alias check in ALIAS_DDRS.  */
2439void
2440loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2441					       vec<struct partition *> *partitions,
2442					       vec<ddr_p> *alias_ddrs)
2443{
2444  int i, j, k, num_sccs, num_sccs_no_alias = 0;
2445  /* Build partition dependence graph.  */
2446  graph *pg = build_partition_graph (rdg, partitions, false);
2447
2448  alias_ddrs->truncate (0);
2449  /* Find strongly connected components in the graph, with all dependence edges
2450     considered.  */
2451  num_sccs = graphds_scc (pg, NULL);
2452  /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2453     compilation time known dependences are merged before this function.  */
2454  if ((unsigned) num_sccs < partitions->length ())
2455    {
2456      struct pg_edge_callback_data cbdata;
2457      auto_bitmap sccs_to_merge;
2458      auto_vec<enum partition_type> scc_types;
2459      struct partition *partition, *first;
2460
2461      /* If all partitions in an SCC have the same type, we can simply merge
2462	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
2463      bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2464      for (i = 0; i < num_sccs; ++i)
2465	{
2466	  for (j = 0; partitions->iterate (j, &first); ++j)
2467	    if (pg->vertices[j].component == i)
2468	      break;
2469
2470	  bool same_type = true, all_builtins = partition_builtin_p (first);
2471	  for (++j; partitions->iterate (j, &partition); ++j)
2472	    {
2473	      if (pg->vertices[j].component != i)
2474		continue;
2475
2476	      if (first->type != partition->type)
2477		{
2478		  same_type = false;
2479		  break;
2480		}
2481	      all_builtins &= partition_builtin_p (partition);
2482	    }
2483	  /* Merge the SCC if all partitions in it have the same type, even
2484	     though the resulting partition is sequential, because the
2485	     vectorizer can do a better runtime alias check.  One exception
2486	     is when all partitions in the SCC are builtins.  */
2487	  if (!same_type || all_builtins)
2488	    bitmap_clear_bit (sccs_to_merge, i);
2489	}
2490
2491      /* Initialize callback data for traversing.  */
2492      cbdata.sccs_to_merge = sccs_to_merge;
2493      cbdata.alias_ddrs = alias_ddrs;
2494      cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2495      /* Record the component information which will be corrupted by next
2496	 graph scc finding call.  */
2497      for (i = 0; i < pg->n_vertices; ++i)
2498	cbdata.vertices_component[i] = pg->vertices[i].component;
2499
2500      /* Collect data dependences for runtime alias checks to break SCCs.  */
2501      if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2502	{
2503	  /* For SCCs we want to merge, clear all alias_ddrs for edges
2504	     inside the component.  */
2505	  for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
2506
2507	  /* Run SCC finding algorithm again, with alias dependence edges
2508	     skipped.  This is to topologically sort partitions according to
2509	     compilation time known dependence.  Note the topological order
2510	     is stored in the form of pg's post order number.  */
2511	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2512	  /* We cannot assert partitions->length () == num_sccs_no_alias
2513	     since we are not ignoring alias edges in cycles we are
2514	     going to merge.  That's required to compute correct postorder.  */
2515	  /* With topological order, we can construct two subgraphs L and R.
2516	     L contains edge <x, y> where x < y in terms of post order, while
2517	     R contains edge <x, y> where x > y.  Edges for compilation time
2518	     known dependence all fall in R, so we break SCCs by removing all
2519	     (alias) edges in subgraph L.  */
2520	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2521	}
2522
2523      /* For each SCC that doesn't need to be broken, merge it.  */
2524      for (i = 0; i < num_sccs; ++i)
2525	{
2526	  if (!bitmap_bit_p (sccs_to_merge, i))
2527	    continue;
2528
2529	  for (j = 0; partitions->iterate (j, &first); ++j)
2530	    if (cbdata.vertices_component[j] == i)
2531	      break;
2532	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2533	    {
2534	      struct pg_vdata *data;
2535
2536	      if (cbdata.vertices_component[k] != i)
2537		continue;
2538
2539	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2540	      (*partitions)[k] = NULL;
2541	      partition_free (partition);
2542	      data = (struct pg_vdata *)pg->vertices[k].data;
2543	      gcc_assert (data->id == k);
2544	      data->partition = NULL;
2545	      /* The result partition of merged SCC must be sequential.  */
2546	      first->type = PTYPE_SEQUENTIAL;
2547	    }
2548	}
2549      /* If the reduction partition's SCC is broken by runtime alias checks,
2550	 we force a negative post order on it, making sure it will be
2551	 scheduled last.  */
2552      if (num_sccs_no_alias > 0)
2553	{
2554	  j = -1;
2555	  for (i = 0; i < pg->n_vertices; ++i)
2556	    {
2557	      struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2558	      if (data->partition && partition_reduction_p (data->partition))
2559		{
2560		  gcc_assert (j == -1);
2561		  j = i;
2562		}
2563	    }
2564	  if (j >= 0)
2565	    pg->vertices[j].post = -1;
2566	}
2567
2568      free (cbdata.vertices_component);
2569    }
2570
2571  sort_partitions_by_post_order (pg, partitions);
2572  free_partition_graph_vdata (pg);
2573  for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2574  free_graph (pg);
2575
2576  if (dump_file && (dump_flags & TDF_DETAILS))
2577    {
2578      fprintf (dump_file, "Possible alias data dependence to break:\n");
2579      dump_data_dependence_relations (dump_file, *alias_ddrs);
2580    }
2581}
2582
2583/* Compute and return an expression whose value is the segment length which
2584   will be accessed by DR in NITERS iterations.  */
2585
2586static tree
2587data_ref_segment_size (struct data_reference *dr, tree niters)
2588{
2589  niters = size_binop (MINUS_EXPR,
2590		       fold_convert (sizetype, niters),
2591		       size_one_node);
2592  return size_binop (MULT_EXPR,
2593		     fold_convert (sizetype, DR_STEP (dr)),
2594		     fold_convert (sizetype, niters));
2595}
2596
2597/* Return true if LOOP's latch is dominated by the statement of data
2598   reference DR.  */
2599
2600static inline bool
2601latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2602{
2603  return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2604			 gimple_bb (DR_STMT (dr)));
2605}
2606
2607/* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2608   data dependence relations ALIAS_DDRS.  */
2609
2610static void
2611compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2612			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2613{
2614  unsigned int i;
2615  unsigned HOST_WIDE_INT factor = 1;
2616  tree niters_plus_one, niters = number_of_latch_executions (loop);
2617
2618  gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2619  niters = fold_convert (sizetype, niters);
2620  niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2621
2622  if (dump_file && (dump_flags & TDF_DETAILS))
2623    fprintf (dump_file, "Creating alias check pairs:\n");
2624
2625  /* Iterate all data dependence relations and compute alias check pairs.  */
2626  for (i = 0; i < alias_ddrs->length (); i++)
2627    {
2628      ddr_p ddr = (*alias_ddrs)[i];
2629      struct data_reference *dr_a = DDR_A (ddr);
2630      struct data_reference *dr_b = DDR_B (ddr);
2631      tree seg_length_a, seg_length_b;
2632
2633      if (latch_dominated_by_data_ref (loop, dr_a))
2634	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2635      else
2636	seg_length_a = data_ref_segment_size (dr_a, niters);
2637
2638      if (latch_dominated_by_data_ref (loop, dr_b))
2639	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2640      else
2641	seg_length_b = data_ref_segment_size (dr_b, niters);
2642
2643      unsigned HOST_WIDE_INT access_size_a
2644	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2645      unsigned HOST_WIDE_INT access_size_b
2646	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2647      unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2648      unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2649
2650      dr_with_seg_len_pair_t dr_with_seg_len_pair
2651	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2652	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2653	 /* ??? Would WELL_ORDERED be safe?  */
2654	 dr_with_seg_len_pair_t::REORDERED);
2655
2656      comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2657    }
2658
2659  if (tree_fits_uhwi_p (niters))
2660    factor = tree_to_uhwi (niters);
2661
2662  /* Prune alias check pairs.  */
2663  prune_runtime_alias_test_list (comp_alias_pairs, factor);
2664  if (dump_file && (dump_flags & TDF_DETAILS))
2665    fprintf (dump_file,
2666	     "Improved number of alias checks from %d to %d\n",
2667	     alias_ddrs->length (), comp_alias_pairs->length ());
2668}
2669
2670/* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2671   checks and version LOOP under condition of these runtime alias checks.  */
2672
2673static void
2674version_loop_by_alias_check (vec<struct partition *> *partitions,
2675			     class loop *loop, vec<ddr_p> *alias_ddrs)
2676{
2677  profile_probability prob;
2678  basic_block cond_bb;
2679  class loop *nloop;
2680  tree lhs, arg0, cond_expr = NULL_TREE;
2681  gimple_seq cond_stmts = NULL;
2682  gimple *call_stmt = NULL;
2683  auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2684
2685  /* Generate code for runtime alias checks if necessary.  */
2686  gcc_assert (alias_ddrs->length () > 0);
2687
2688  if (dump_file && (dump_flags & TDF_DETAILS))
2689    fprintf (dump_file,
2690	     "Version loop <%d> with runtime alias check\n", loop->num);
2691
2692  compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2693  create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2694  cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2695				      is_gimple_val, NULL_TREE);
2696
2697  /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2698  bool cancelable_p = flag_tree_loop_vectorize;
2699  if (cancelable_p)
2700    {
2701      unsigned i = 0;
2702      struct partition *partition;
2703      for (; partitions->iterate (i, &partition); ++i)
2704	if (!partition_builtin_p (partition))
2705	  break;
2706
2707     /* If all partitions are builtins, distributing it would be profitable and
2708	we don't want to cancel the runtime alias checks.  */
2709      if (i == partitions->length ())
2710	cancelable_p = false;
2711    }
2712
2713  /* Generate internal function call for loop distribution alias check if the
2714     runtime alias check should be cancelable.  */
2715  if (cancelable_p)
2716    {
2717      call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2718					      2, NULL_TREE, cond_expr);
2719      lhs = make_ssa_name (boolean_type_node);
2720      gimple_call_set_lhs (call_stmt, lhs);
2721    }
2722  else
2723    lhs = cond_expr;
2724
2725  prob = profile_probability::guessed_always ().apply_scale (9, 10);
2726  initialize_original_copy_tables ();
2727  nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2728			prob, prob.invert (), true);
2729  free_original_copy_tables ();
2730  /* Record the original loop number in newly generated loops.  In case of
2731     distribution, the original loop will be distributed and the new loop
2732     is kept.  */
2733  loop->orig_loop_num = nloop->num;
2734  nloop->orig_loop_num = nloop->num;
2735  nloop->dont_vectorize = true;
2736  nloop->force_vectorize = false;
2737
2738  if (call_stmt)
2739    {
2740      /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2741	 loop could be destroyed.  */
2742      arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2743      gimple_call_set_arg (call_stmt, 0, arg0);
2744      gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2745    }
2746
2747  if (cond_stmts)
2748    {
2749      gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2750      gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2751    }
2752  update_ssa (TODO_update_ssa);
2753}
2754
2755/* Return true if loop versioning is needed to distribute PARTITIONS.
2756   ALIAS_DDRS are data dependence relations for runtime alias check.  */
2757
2758static inline bool
2759version_for_distribution_p (vec<struct partition *> *partitions,
2760			    vec<ddr_p> *alias_ddrs)
2761{
2762  /* No need to version loop if we have only one partition.  */
2763  if (partitions->length () == 1)
2764    return false;
2765
2766  /* Need to version loop if runtime alias check is necessary.  */
2767  return (alias_ddrs->length () > 0);
2768}
2769
2770/* Compare base offset of builtin mem* partitions P1 and P2.  */
2771
2772static int
2773offset_cmp (const void *vp1, const void *vp2)
2774{
2775  struct partition *p1 = *(struct partition *const *) vp1;
2776  struct partition *p2 = *(struct partition *const *) vp2;
2777  unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2778  unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2779  return (o2 < o1) - (o1 < o2);
2780}
2781
2782/* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2783   case optimization transforming the code below:
2784
2785     __builtin_memset (&obj, 0, 100);
2786     _1 = &obj + 100;
2787     __builtin_memset (_1, 0, 200);
2788     _2 = &obj + 300;
2789     __builtin_memset (_2, 0, 100);
2790
2791   into:
2792
2793     __builtin_memset (&obj, 0, 400);
2794
2795   Note we don't have dependence information between different partitions
2796   at this point; as a result, we can't handle nonadjacent memset builtin
2797   partitions since dependences might be broken.  */
2798
2799static void
2800fuse_memset_builtins (vec<struct partition *> *partitions)
2801{
2802  unsigned i, j;
2803  struct partition *part1, *part2;
2804  tree rhs1, rhs2;
2805
2806  for (i = 0; partitions->iterate (i, &part1);)
2807    {
2808      if (part1->kind != PKIND_MEMSET)
2809	{
2810	  i++;
2811	  continue;
2812	}
2813
2814      /* Find sub-array of memset builtins of the same base.  Index range
2815	 of the sub-array is [i, j) with "j > i".  */
2816      for (j = i + 1; partitions->iterate (j, &part2); ++j)
2817	{
2818	  if (part2->kind != PKIND_MEMSET
2819	      || !operand_equal_p (part1->builtin->dst_base_base,
2820				   part2->builtin->dst_base_base, 0))
2821	    break;
2822
2823	  /* Memset calls setting different values can't be merged.  */
2824	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2825	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2826	  if (!operand_equal_p (rhs1, rhs2, 0))
2827	    break;
2828	}
2829
2830      /* Stable sort is required in order to avoid breaking dependence.  */
2831      gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2832		      offset_cmp);
2833      /* Continue with next partition.  */
2834      i = j;
2835    }
2836
2837  /* Merge all consecutive memset builtin partitions.  */
2838  for (i = 0; i < partitions->length () - 1;)
2839    {
2840      part1 = (*partitions)[i];
2841      if (part1->kind != PKIND_MEMSET)
2842	{
2843	  i++;
2844	  continue;
2845	}
2846
2847      part2 = (*partitions)[i + 1];
2848      /* Only merge memset partitions of the same base and with constant
2849	 access sizes.  */
2850      if (part2->kind != PKIND_MEMSET
2851	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2852	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2853	  || !operand_equal_p (part1->builtin->dst_base_base,
2854			       part2->builtin->dst_base_base, 0))
2855	{
2856	  i++;
2857	  continue;
2858	}
2859      rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2860      rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2861      int bytev1 = const_with_all_bytes_same (rhs1);
2862      int bytev2 = const_with_all_bytes_same (rhs2);
2863      /* Only merge memset partitions of the same value.  */
2864      if (bytev1 != bytev2 || bytev1 == -1)
2865	{
2866	  i++;
2867	  continue;
2868	}
2869      wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2870			       wi::to_wide (part1->builtin->size));
2871      /* Only merge adjacent memset partitions.  */
2872      if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2873	{
2874	  i++;
2875	  continue;
2876	}
2877      /* Merge partitions[i] and partitions[i+1].  */
2878      part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2879					  part1->builtin->size,
2880					  part2->builtin->size);
2881      partition_free (part2);
2882      partitions->ordered_remove (i + 1);
2883    }
2884}
2885
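/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution:
   merge all partitions back into a single one when distribution is
   unlikely to be profitable, and fuse adjacent memset builtin partitions.
   ALIAS_DDRS contains data dependence relations for runtime alias
   checks.  */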
2886void
2887loop_distribution::finalize_partitions (class loop *loop,
2888					vec<struct partition *> *partitions,
2889					vec<ddr_p> *alias_ddrs)
2890{
2891  unsigned i;
2892  struct partition *partition, *a;
2893
2894  if (partitions->length () == 1
2895      || alias_ddrs->length () > 0)
2896    return;
2897
2898  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2899  bool same_type_p = true;
2900  enum partition_type type = ((*partitions)[0])->type;
2901  for (i = 0; partitions->iterate (i, &partition); ++i)
2902    {
2903      same_type_p &= (type == partition->type);
2904      if (partition_builtin_p (partition))
2905	{
2906	  num_builtin++;
2907	  continue;
2908	}
2909      num_normal++;
2910      if (partition->kind == PKIND_PARTIAL_MEMSET)
2911	num_partial_memset++;
2912    }
2913
2914  /* Don't distribute the current loop into too many loops given that we
2915     don't have a memory stream cost model.  Be even more conservative in
2916     the case of loop nest distribution.  */
2917  if ((same_type_p && num_builtin == 0
2918       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2919      || (loop->inner != NULL
2920	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2921      || (loop->inner == NULL
2922	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2923    {
2924      a = (*partitions)[0];
2925      for (i = 1; partitions->iterate (i, &partition); ++i)
2926	{
2927	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2928	  partition_free (partition);
2929	}
2930      partitions->truncate (1);
2931    }
2932
2933  /* Fuse memset builtins if possible.  */
2934  if (partitions->length () > 1)
2935    fuse_memset_builtins (partitions);
2936}
2937
2938/* Distributes the code from LOOP in such a way that producer statements
2939   are placed before consumer statements.  Tries to separate only the
2940   statements from STMTS into separate loops.  Returns the number of
2941   distributed loops.  Set NB_CALLS to number of generated builtin calls.
2942   Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2943
2944int
2945loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
2946		 control_dependences *cd, int *nb_calls, bool *destroy_p,
2947		 bool only_patterns_p)
2948{
2949  ddrs_table = new hash_table<ddr_hasher> (389);
2950  struct graph *rdg;
2951  partition *partition;
2952  int i, nbp;
2953
2954  *destroy_p = false;
2955  *nb_calls = 0;
2956  loop_nest.create (0);
2957  if (!find_loop_nest (loop, &loop_nest))
2958    {
2959      loop_nest.release ();
2960      delete ddrs_table;
2961      return 0;
2962    }
2963
2964  datarefs_vec.create (20);
2965  has_nonaddressable_dataref_p = false;
2966  rdg = build_rdg (loop, cd);
2967  if (!rdg)
2968    {
2969      if (dump_file && (dump_flags & TDF_DETAILS))
2970	fprintf (dump_file,
2971		 "Loop %d not distributed: failed to build the RDG.\n",
2972		 loop->num);
2973
2974      loop_nest.release ();
2975      free_data_refs (datarefs_vec);
2976      delete ddrs_table;
2977      return 0;
2978    }
2979
2980  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2981    {
2982      if (dump_file && (dump_flags & TDF_DETAILS))
2983	fprintf (dump_file,
2984		 "Loop %d not distributed: too many memory references.\n",
2985		 loop->num);
2986
2987      free_rdg (rdg);
2988      loop_nest.release ();
2989      free_data_refs (datarefs_vec);
2990      delete ddrs_table;
2991      return 0;
2992    }
2993
2994  data_reference_p dref;
2995  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2996    dref->aux = (void *) (uintptr_t) i;
2997
2998  if (dump_file && (dump_flags & TDF_DETAILS))
2999    dump_rdg (dump_file, rdg);
3000
3001  auto_vec<struct partition *, 3> partitions;
3002  rdg_build_partitions (rdg, stmts, &partitions);
3003
3004  auto_vec<ddr_p> alias_ddrs;
3005
3006  auto_bitmap stmt_in_all_partitions;
3007  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
3008  for (i = 1; partitions.iterate (i, &partition); ++i)
3009    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
3010
3011  bool any_builtin = false;
3012  bool reduction_in_all = false;
3013  FOR_EACH_VEC_ELT (partitions, i, partition)
3014    {
3015      reduction_in_all
3016	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
3017      any_builtin |= partition_builtin_p (partition);
3018    }
3019
3020  /* If we are only distributing patterns but did not detect any,
3021     simply bail out.  */
3022  if (only_patterns_p
3023      && !any_builtin)
3024    {
3025      nbp = 0;
3026      goto ldist_done;
3027    }
3028
3029  /* If we are only distributing patterns, fuse all partitions that
3030     were not classified as builtins.  This also avoids chopping
3031     a loop into pieces, separated by builtin calls.  That is, we
3032     only want no or a single loop body remaining.  */
3033  struct partition *into;
3034  if (only_patterns_p)
3035    {
3036      for (i = 0; partitions.iterate (i, &into); ++i)
3037	if (!partition_builtin_p (into))
3038	  break;
3039      for (++i; partitions.iterate (i, &partition); ++i)
3040	if (!partition_builtin_p (partition))
3041	  {
3042	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3043	    partitions.unordered_remove (i);
3044	    partition_free (partition);
3045	    i--;
3046	  }
3047    }
3048
3049  /* Due to limitations in the transform phase we have to fuse all
3050     reduction partitions into the last partition so the existing
3051     loop will contain all loop-closed PHI nodes.  */
3052  for (i = 0; partitions.iterate (i, &into); ++i)
3053    if (partition_reduction_p (into))
3054      break;
3055  for (i = i + 1; partitions.iterate (i, &partition); ++i)
3056    if (partition_reduction_p (partition))
3057      {
3058	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3059	partitions.unordered_remove (i);
3060	partition_free (partition);
3061	i--;
3062      }
3063
3064  /* Apply our simple cost model - fuse partitions with similar
3065     memory accesses.  */
3066  for (i = 0; partitions.iterate (i, &into); ++i)
3067    {
3068      bool changed = false;
3069      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
3070	continue;
3071      for (int j = i + 1;
3072	   partitions.iterate (j, &partition); ++j)
3073	{
3074	  if (share_memory_accesses (rdg, into, partition))
3075	    {
3076	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3077	      partitions.unordered_remove (j);
3078	      partition_free (partition);
3079	      j--;
3080	      changed = true;
3081	    }
3082	}
3083      /* If in step 1 we fused partitions 0 1 2 into 0,2 and 1 because 0 and
3084	 2 have similar accesses, while 1 and 2 have similar accesses but 0
3085	 and 1 do not, then in the next iteration we would fail to consider
3086	 merging 1 into 0,2.  So try again if we did any merging into 0.  */
3087      if (changed)
3088	i--;
3089    }
3090
3091  /* Put a non-builtin partition last if we need to preserve a reduction.
3092     ???  This is a workaround that makes sort_partitions_by_post_order do
3093     the correct thing while in reality it should sort each component
3094     separately and then put the component with a reduction or a non-builtin
3095     last.  */
3096  if (reduction_in_all
3097      && partition_builtin_p (partitions.last()))
3098    FOR_EACH_VEC_ELT (partitions, i, partition)
3099      if (!partition_builtin_p (partition))
3100	{
3101	  partitions.unordered_remove (i);
3102	  partitions.quick_push (partition);
3103	  break;
3104	}
3105
3106  /* Build the partition dependence graph and fuse partitions in strongly
3107     connected components.  */
3108  if (partitions.length () > 1)
3109    {
3110      /* Don't support loop nest distribution under runtime alias checks
3111	 since it's not likely to enable many vectorization opportunities.
3112	 Likewise if the loop has any data reference which may not be
3113	 addressable, as the alias check needs the address of the object.  */
3114      if (loop->inner || has_nonaddressable_dataref_p)
3115	merge_dep_scc_partitions (rdg, &partitions, false);
3116      else
3117	{
3118	  merge_dep_scc_partitions (rdg, &partitions, true);
3119	  if (partitions.length () > 1)
3120	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3121	}
3122    }
3123
3124  finalize_partitions (loop, &partitions, &alias_ddrs);
3125
3126  /* If there is a reduction in all partitions make sure the last one
3127     is not classified for builtin code generation.  */
3128  if (reduction_in_all)
3129    {
3130      partition = partitions.last ();
3131      if (only_patterns_p
3132	  && partition_builtin_p (partition)
3133	  && !partition_builtin_p (partitions[0]))
3134	{
3135	  nbp = 0;
3136	  goto ldist_done;
3137	}
3138      partition->kind = PKIND_NORMAL;
3139    }
3140
3141  nbp = partitions.length ();
3142  if (nbp == 0
3143      || (nbp == 1 && !partition_builtin_p (partitions[0]))
3144      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3145    {
3146      nbp = 0;
3147      goto ldist_done;
3148    }
3149
  if (version_for_distribution_p (&partitions, &alias_ddrs))
    version_loop_by_alias_check (&partitions, loop, &alias_ddrs);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "distribute loop <%d> into partitions:\n", loop->num);
      dump_rdg_partitions (dump_file, partitions);
    }

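  /* Generate code for each partition; partitions classified as builtins
     are emitted as library calls and counted in *NB_CALLS.  */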
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      if (partition_builtin_p (partition))
	(*nb_calls)++;
      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
    }

 ldist_done:
  loop_nest.release ();
  free_data_refs (datarefs_vec);
  for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
       iter != ddrs_table->end (); ++iter)
    {
      free_dependence_relation (*iter);
      *iter = NULL;
    }
  delete ddrs_table;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  free_rdg (rdg);
  return nbp - *nb_calls;
}


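/* Compute a topological order (reverse post-order) index for each basic
   block of the current function and record it in bb_top_order_index.  */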
void loop_distribution::bb_top_order_init (void)
{
  int rpo_num;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
  edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
  bitmap exit_bbs = BITMAP_ALLOC (NULL);

  bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
  bb_top_order_index_size = last_basic_block_for_fn (cfun);

  entry->flags &= ~EDGE_DFS_BACK;
  bitmap_set_bit (exit_bbs, EXIT_BLOCK);
  rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
						   rpo, NULL);
  BITMAP_FREE (exit_bbs);

  for (int i = 0; i < rpo_num; i++)
    bb_top_order_index[rpo[i]] = i;

  free (rpo);
}

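/* Free the topological order information computed by bb_top_order_init.  */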
void loop_distribution::bb_top_order_destroy ()
{
  free (bb_top_order_index);
  bb_top_order_index = NULL;
  bb_top_order_index_size = 0;
}


/* Given LOOP, this function records seed statements for distribution in
   WORK_LIST.  Return false if there is nothing to distribute.  */

static bool
find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
{
  basic_block *bbs = get_loop_body_in_dom_order (loop);

  /* Initialize the worklist with stmts we seed the partitions with.  */
  for (unsigned i = 0; i < loop->num_nodes; ++i)
    {
      /* In irreducible sub-regions we don't know how to redirect
	 conditions, so fail.  See PR100492.  */
      if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "loop %d contains an irreducible region.\n",
		     loop->num);
	  work_list->truncate (0);
	  break;
	}
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
	    continue;
	  work_list->safe_push (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Ignore clobbers; they do not have true side effects.  */
	  if (gimple_clobber_p (stmt))
	    continue;

	  /* If there is a stmt with side-effects bail out - we
	     cannot and should not distribute this loop.  */
	  if (gimple_has_side_effects (stmt))
	    {
	      free (bbs);
	      return false;
	    }

	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	    ;
	  /* Otherwise only distribute stores for now.  */
	  else if (!gimple_vdef (stmt))
	    continue;

	  work_list->safe_push (stmt);
	}
    }
  free (bbs);
  return work_list->length () > 0;
}

/* Given innermost LOOP, return the outermost enclosing loop that forms a
   perfect loop nest.  */

static class loop *
prepare_perfect_loop_nest (class loop *loop)
{
  class loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support distribution of innermost loop nests with at
     most 3 levels for now because of compile-time issues.  This should be
     relaxed in the future.  Note we only allow a 3-level loop nest
     distribution when parallelizing loops.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}


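/* Distribute loops in function FUN; only innermost loops and perfect loop
   nests built around them are considered.  Return TODO_cleanup_cfg if any
   loop was distributed, zero otherwise.  */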
unsigned int
loop_distribution::execute (function *fun)
{
  class loop *loop;
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  if (number_of_loops (fun) <= 1)
    return 0;

  bb_top_order_init ();

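  /* Invalidate statement UIDs; fresh ones are assigned when the RDG is
     built for each candidate loop.  */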
  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* At the moment we can only distribute non-nested loops, so restrict
     the walk to innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      /* Don't distribute loops with multiple exit edges, or cold loops
	 when not doing pattern detection.  */
      if (!single_exit (loop)
	  || (!flag_tree_loop_distribute_patterns
	      && !optimize_loop_for_speed_p (loop)))
	continue;

      /* Don't distribute loop if niters is unknown.  */
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	continue;

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    break;

	  const char *str = loop->inner ? " nest" : "";
	  dump_user_location_t loc = find_loop_location (loop);
	  if (!cd)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      calculate_dominance_info (CDI_POST_DOMINATORS);
	      cd = new control_dependences ();
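	      /* Post-dominance information is only needed while computing
		 the control dependences and can be freed right away.  */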
	      free_dominance_info (CDI_POST_DOMINATORS);
	    }

	  bool destroy_p;
	  int nb_generated_loops, nb_generated_calls;
	  nb_generated_loops
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, (!optimize_loop_for_speed_p (loop)
					    || !flag_tree_loop_distribution));
	  if (destroy_p)
	    loops_to_be_destroyed.safe_push (loop);

	  if (nb_generated_loops + nb_generated_calls > 0)
	    {
	      changed = true;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				 loc, "Loop%s %d distributed: split to %d loops "
				 "and %d library calls.\n", str, loop->num,
				 nb_generated_loops, nb_generated_calls);

	      break;
	    }

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
	}
    }

  if (cd)
    delete cd;

  if (bb_top_order_index != NULL)
    bb_top_order_destroy ();

  if (changed)
    {
      /* Destroy loop bodies that could not be reused.  Do this late as we
	 otherwise can end up referring to stale data in control dependences.  */
      unsigned i;
      FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
	destroy_loop (loop);

      /* Cached scalar evolutions now may refer to wrong or non-existing
	 loops.  */
      scev_reset_htab ();
      mark_virtual_operands_for_renaming (fun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

  checking_verify_loop_structure ();

  return changed ? TODO_cleanup_cfg : 0;
}


/* Distribute all loops in the current function.  */

namespace {

const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_distribution : public gimple_opt_pass
{
public:
  pass_loop_distribution (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_distribution, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_tree_loop_distribution
	|| flag_tree_loop_distribute_patterns;
    }

  virtual unsigned int execute (function *);

}; // class pass_loop_distribution

unsigned int
pass_loop_distribution::execute (function *fun)
{
  return loop_distribution ().execute (fun);
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_distribution (gcc::context *ctxt)
{
  return new pass_loop_distribution (ctxt);
}
