1169689Skan/* Linear Loop transforms
2169689Skan   Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
3169689Skan   Contributed by Daniel Berlin <dberlin@dberlin.org>.
4169689Skan
5169689SkanThis file is part of GCC.
6169689Skan
7169689SkanGCC is free software; you can redistribute it and/or modify it under
8169689Skanthe terms of the GNU General Public License as published by the Free
9169689SkanSoftware Foundation; either version 2, or (at your option) any later
10169689Skanversion.
11169689Skan
12169689SkanGCC is distributed in the hope that it will be useful, but WITHOUT ANY
13169689SkanWARRANTY; without even the implied warranty of MERCHANTABILITY or
14169689SkanFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15169689Skanfor more details.
16169689Skan
17169689SkanYou should have received a copy of the GNU General Public License
18169689Skanalong with GCC; see the file COPYING.  If not, write to the Free
19169689SkanSoftware Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
20169689Skan02110-1301, USA.  */
21169689Skan
22169689Skan
23169689Skan#include "config.h"
24169689Skan#include "system.h"
25169689Skan#include "coretypes.h"
26169689Skan#include "tm.h"
27169689Skan#include "ggc.h"
28169689Skan#include "tree.h"
29169689Skan#include "target.h"
30169689Skan
31169689Skan#include "rtl.h"
32169689Skan#include "basic-block.h"
33169689Skan#include "diagnostic.h"
34169689Skan#include "tree-flow.h"
35169689Skan#include "tree-dump.h"
36169689Skan#include "timevar.h"
37169689Skan#include "cfgloop.h"
38169689Skan#include "expr.h"
39169689Skan#include "optabs.h"
40169689Skan#include "tree-chrec.h"
41169689Skan#include "tree-data-ref.h"
42169689Skan#include "tree-scalar-evolution.h"
43169689Skan#include "tree-pass.h"
44169689Skan#include "lambda.h"
45169689Skan
46169689Skan/* Linear loop transforms include any composition of interchange,
47169689Skan   scaling, skewing, and reversal.  They are used to change the
48169689Skan   iteration order of loop nests in order to optimize data locality of
49169689Skan   traversals, or remove dependences that prevent
50169689Skan   parallelization/vectorization/etc.
51169689Skan
52169689Skan   TODO: Determine reuse vectors/matrix and use it to determine optimal
53169689Skan   transform matrix for locality purposes.
54169689Skan   TODO: Completion of partial transforms.  */
55169689Skan
56169689Skan/* Gather statistics for loop interchange.  LOOP is the loop being
57169689Skan   considered. The first loop in the considered loop nest is
58169689Skan   FIRST_LOOP, and consequently, the index of the considered loop is
59169689Skan   obtained by LOOP->DEPTH - FIRST_LOOP->DEPTH
60169689Skan
61169689Skan   Initializes:
62169689Skan   - DEPENDENCE_STEPS the sum of all the data dependence distances
63169689Skan   carried by loop LOOP,
64169689Skan
65169689Skan   - NB_DEPS_NOT_CARRIED_BY_LOOP the number of dependence relations
66169689Skan   for which the loop LOOP is not carrying any dependence,
67169689Skan
68169689Skan   - ACCESS_STRIDES the sum of all the strides in LOOP.
69169689Skan
70169689Skan   Example: for the following loop,
71169689Skan
72169689Skan   | loop_1 runs 1335 times
73169689Skan   |   loop_2 runs 1335 times
74169689Skan   |     A[{{0, +, 1}_1, +, 1335}_2]
75169689Skan   |     B[{{0, +, 1}_1, +, 1335}_2]
76169689Skan   |   endloop_2
77169689Skan   |   A[{0, +, 1336}_1]
78169689Skan   | endloop_1
79169689Skan
80169689Skan   gather_interchange_stats (in loop_1) will return
81169689Skan   DEPENDENCE_STEPS = 3002
82169689Skan   NB_DEPS_NOT_CARRIED_BY_LOOP = 5
83169689Skan   ACCESS_STRIDES = 10694
84169689Skan
85169689Skan   gather_interchange_stats (in loop_2) will return
86169689Skan   DEPENDENCE_STEPS = 3000
87169689Skan   NB_DEPS_NOT_CARRIED_BY_LOOP = 7
88169689Skan   ACCESS_STRIDES = 8010
89169689Skan*/
90169689Skan
91169689Skanstatic void
92169689Skangather_interchange_stats (VEC (ddr_p, heap) *dependence_relations,
93169689Skan			  VEC (data_reference_p, heap) *datarefs,
94169689Skan			  struct loop *loop,
95169689Skan			  struct loop *first_loop,
96169689Skan			  unsigned int *dependence_steps,
97169689Skan			  unsigned int *nb_deps_not_carried_by_loop,
98169689Skan			  unsigned int *access_strides)
99169689Skan{
100169689Skan  unsigned int i, j;
101169689Skan  struct data_dependence_relation *ddr;
102169689Skan  struct data_reference *dr;
103169689Skan
104169689Skan  *dependence_steps = 0;
105169689Skan  *nb_deps_not_carried_by_loop = 0;
106169689Skan  *access_strides = 0;
107169689Skan
108169689Skan  for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++)
109169689Skan    {
110169689Skan      /* If we don't know anything about this dependence, or the distance
111169689Skan	 vector is NULL, or there is no dependence, then there is no reuse of
112169689Skan	 data.  */
113169689Skan      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
114169689Skan	  || DDR_ARE_DEPENDENT (ddr) == chrec_known
115169689Skan	  || DDR_NUM_DIST_VECTS (ddr) == 0)
116169689Skan	continue;
117169689Skan
118169689Skan      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
119169689Skan	{
120169689Skan	  int dist = DDR_DIST_VECT (ddr, j)[loop->depth - first_loop->depth];
121169689Skan
122169689Skan	  if (dist == 0)
123169689Skan	    (*nb_deps_not_carried_by_loop) += 1;
124169689Skan
125169689Skan	  else if (dist < 0)
126169689Skan	    (*dependence_steps) += -dist;
127169689Skan
128169689Skan	  else
129169689Skan	    (*dependence_steps) += dist;
130169689Skan	}
131169689Skan    }
132169689Skan
133169689Skan  /* Compute the access strides.  */
134169689Skan  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
135169689Skan    {
136169689Skan      unsigned int it;
137169689Skan      tree stmt = DR_STMT (dr);
138169689Skan      struct loop *stmt_loop = loop_containing_stmt (stmt);
139169689Skan      struct loop *inner_loop = first_loop->inner;
140169689Skan
141169689Skan      if (inner_loop != stmt_loop
142169689Skan	  && !flow_loop_nested_p (inner_loop, stmt_loop))
143169689Skan	continue;
144169689Skan      for (it = 0; it < DR_NUM_DIMENSIONS (dr); it++)
145169689Skan	{
146169689Skan	  tree chrec = DR_ACCESS_FN (dr, it);
147169689Skan	  tree tstride = evolution_part_in_loop_num
148169689Skan	    (chrec, loop->num);
149169689Skan
150169689Skan	  if (tstride == NULL_TREE
151169689Skan	      || TREE_CODE (tstride) != INTEGER_CST)
152169689Skan	    continue;
153169689Skan
154169689Skan	  (*access_strides) += int_cst_value (tstride);
155169689Skan	}
156169689Skan    }
157169689Skan}
158169689Skan
159169689Skan/* Attempt to apply interchange transformations to TRANS to maximize the
160169689Skan   spatial and temporal locality of the loop.
161169689Skan   Returns the new transform matrix.  The smaller the reuse vector
162169689Skan   distances in the inner loops, the fewer the cache misses.
163169689Skan   FIRST_LOOP is the loop->num of the first loop in the analyzed loop
164169689Skan   nest.  */
165169689Skan
166169689Skan
167169689Skanstatic lambda_trans_matrix
168169689Skantry_interchange_loops (lambda_trans_matrix trans,
169169689Skan		       unsigned int depth,
170169689Skan		       VEC (ddr_p, heap) *dependence_relations,
171169689Skan		       VEC (data_reference_p, heap) *datarefs,
172169689Skan		       struct loop *first_loop)
173169689Skan{
174169689Skan  struct loop *loop_i;
175169689Skan  struct loop *loop_j;
176169689Skan  unsigned int dependence_steps_i, dependence_steps_j;
177169689Skan  unsigned int access_strides_i, access_strides_j;
178169689Skan  unsigned int nb_deps_not_carried_by_i, nb_deps_not_carried_by_j;
179169689Skan  struct data_dependence_relation *ddr;
180169689Skan
181169689Skan  if (VEC_length (ddr_p, dependence_relations) == 0)
182169689Skan    return trans;
183169689Skan
184169689Skan  /* When there is an unknown relation in the dependence_relations, we
185169689Skan     know that it is no worth looking at this loop nest: give up.  */
186169689Skan  ddr = VEC_index (ddr_p, dependence_relations, 0);
187169689Skan  if (ddr == NULL || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
188169689Skan    return trans;
189169689Skan
190169689Skan  /* LOOP_I is always the outer loop.  */
191169689Skan  for (loop_j = first_loop->inner;
192169689Skan       loop_j;
193169689Skan       loop_j = loop_j->inner)
194169689Skan    for (loop_i = first_loop;
195169689Skan	 loop_i->depth < loop_j->depth;
196169689Skan	 loop_i = loop_i->inner)
197169689Skan      {
198169689Skan	gather_interchange_stats (dependence_relations, datarefs,
199169689Skan				  loop_i, first_loop,
200169689Skan				  &dependence_steps_i,
201169689Skan				  &nb_deps_not_carried_by_i,
202169689Skan				  &access_strides_i);
203169689Skan	gather_interchange_stats (dependence_relations, datarefs,
204169689Skan				  loop_j, first_loop,
205169689Skan				  &dependence_steps_j,
206169689Skan				  &nb_deps_not_carried_by_j,
207169689Skan				  &access_strides_j);
208169689Skan
209169689Skan	/* Heuristics for loop interchange profitability:
210169689Skan
211169689Skan	   1. (spatial locality) Inner loops should have smallest
212169689Skan              dependence steps.
213169689Skan
214169689Skan	   2. (spatial locality) Inner loops should contain more
215169689Skan	   dependence relations not carried by the loop.
216169689Skan
217169689Skan	   3. (temporal locality) Inner loops should have smallest
218169689Skan	      array access strides.
219169689Skan	*/
220169689Skan	if (dependence_steps_i < dependence_steps_j
221169689Skan	    || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
222169689Skan	    || access_strides_i < access_strides_j)
223169689Skan	  {
224169689Skan	    lambda_matrix_row_exchange (LTM_MATRIX (trans),
225169689Skan					loop_i->depth - first_loop->depth,
226169689Skan					loop_j->depth - first_loop->depth);
227169689Skan	    /* Validate the resulting matrix.  When the transformation
228169689Skan	       is not valid, reverse to the previous transformation.  */
229169689Skan	    if (!lambda_transform_legal_p (trans, depth, dependence_relations))
230169689Skan	      lambda_matrix_row_exchange (LTM_MATRIX (trans),
231169689Skan					  loop_i->depth - first_loop->depth,
232169689Skan					  loop_j->depth - first_loop->depth);
233169689Skan	  }
234169689Skan      }
235169689Skan
236169689Skan  return trans;
237169689Skan}
238169689Skan
239169689Skan/* Perform a set of linear transforms on LOOPS.  */
240169689Skan
241169689Skanvoid
242169689Skanlinear_transform_loops (struct loops *loops)
243169689Skan{
244169689Skan  bool modified = false;
245169689Skan  unsigned int i;
246169689Skan  VEC(tree,heap) *oldivs = NULL;
247169689Skan  VEC(tree,heap) *invariants = NULL;
248169689Skan
249169689Skan  for (i = 1; i < loops->num; i++)
250169689Skan    {
251169689Skan      unsigned int depth = 0;
252169689Skan      VEC (ddr_p, heap) *dependence_relations;
253169689Skan      VEC (data_reference_p, heap) *datarefs;
254169689Skan      struct loop *loop_nest = loops->parray[i];
255169689Skan      struct loop *temp;
256169689Skan      lambda_loopnest before, after;
257169689Skan      lambda_trans_matrix trans;
258169689Skan      bool problem = false;
259169689Skan      /* If it's not a loop nest, we don't want it.
260169689Skan         We also don't handle sibling loops properly,
261169689Skan         which are loops of the following form:
262169689Skan         for (i = 0; i < 50; i++)
263169689Skan           {
264169689Skan             for (j = 0; j < 50; j++)
265169689Skan               {
266169689Skan	        ...
267169689Skan               }
268169689Skan             for (j = 0; j < 50; j++)
269169689Skan               {
270169689Skan                ...
271169689Skan               }
272169689Skan           } */
273169689Skan      if (!loop_nest || !loop_nest->inner || !loop_nest->single_exit)
274169689Skan	continue;
275169689Skan      VEC_truncate (tree, oldivs, 0);
276169689Skan      VEC_truncate (tree, invariants, 0);
277169689Skan      depth = 1;
278169689Skan      for (temp = loop_nest->inner; temp; temp = temp->inner)
279169689Skan	{
280169689Skan	  /* If we have a sibling loop or multiple exit edges, jump ship.  */
281169689Skan	  if (temp->next || !temp->single_exit)
282169689Skan	    {
283169689Skan	      problem = true;
284169689Skan	      break;
285169689Skan	    }
286169689Skan	  depth ++;
287169689Skan	}
288169689Skan      if (problem)
289169689Skan	continue;
290169689Skan
291169689Skan      /* Analyze data references and dependence relations using scev.  */
292169689Skan
293169689Skan      datarefs = VEC_alloc (data_reference_p, heap, 10);
294169689Skan      dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
295169689Skan      compute_data_dependences_for_loop (loop_nest, true, &datarefs,
296169689Skan					 &dependence_relations);
297169689Skan
298169689Skan      if (dump_file && (dump_flags & TDF_DETAILS))
299169689Skan	dump_ddrs (dump_file, dependence_relations);
300169689Skan
301169689Skan      /* Build the transformation matrix.  */
302169689Skan      trans = lambda_trans_matrix_new (depth, depth);
303169689Skan      lambda_matrix_id (LTM_MATRIX (trans), depth);
304169689Skan      trans = try_interchange_loops (trans, depth, dependence_relations,
305169689Skan				     datarefs, loop_nest);
306169689Skan
307169689Skan      if (lambda_trans_matrix_id_p (trans))
308169689Skan	{
309169689Skan	  if (dump_file)
310169689Skan	   fprintf (dump_file, "Won't transform loop. Optimal transform is the identity transform\n");
311169689Skan	  goto free_and_continue;
312169689Skan	}
313169689Skan
314169689Skan      /* Check whether the transformation is legal.  */
315169689Skan      if (!lambda_transform_legal_p (trans, depth, dependence_relations))
316169689Skan	{
317169689Skan	  if (dump_file)
318169689Skan	    fprintf (dump_file, "Can't transform loop, transform is illegal:\n");
319169689Skan	  goto free_and_continue;
320169689Skan	}
321169689Skan
322169689Skan      before = gcc_loopnest_to_lambda_loopnest (loops, loop_nest, &oldivs,
323169689Skan						&invariants);
324169689Skan
325169689Skan      if (!before)
326169689Skan	goto free_and_continue;
327169689Skan
328169689Skan      if (dump_file)
329169689Skan	{
330169689Skan	  fprintf (dump_file, "Before:\n");
331169689Skan	  print_lambda_loopnest (dump_file, before, 'i');
332169689Skan	}
333169689Skan
334169689Skan      after = lambda_loopnest_transform (before, trans);
335169689Skan
336169689Skan      if (dump_file)
337169689Skan	{
338169689Skan	  fprintf (dump_file, "After:\n");
339169689Skan	  print_lambda_loopnest (dump_file, after, 'u');
340169689Skan	}
341169689Skan
342169689Skan      lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
343169689Skan				       after, trans);
344169689Skan      modified = true;
345169689Skan
346169689Skan      if (dump_file)
347169689Skan	fprintf (dump_file, "Successfully transformed loop.\n");
348169689Skan
349169689Skan    free_and_continue:
350169689Skan      free_dependence_relations (dependence_relations);
351169689Skan      free_data_refs (datarefs);
352169689Skan    }
353169689Skan
354169689Skan  VEC_free (tree, heap, oldivs);
355169689Skan  VEC_free (tree, heap, invariants);
356169689Skan  scev_reset ();
357169689Skan
358169689Skan  if (modified)
359169689Skan    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_full_phi);
360169689Skan}
361