1/* Linear Loop transforms
2   Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009, 2010
3   Free Software Foundation, Inc.
4   Contributed by Daniel Berlin <dberlin@dberlin.org>.
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "tm.h"
27#include "ggc.h"
28#include "tree.h"
29#include "target.h"
30
31#include "rtl.h"
32#include "basic-block.h"
33#include "diagnostic.h"
34#include "obstack.h"
35#include "tree-flow.h"
36#include "tree-dump.h"
37#include "timevar.h"
38#include "cfgloop.h"
39#include "expr.h"
40#include "optabs.h"
41#include "tree-chrec.h"
42#include "tree-data-ref.h"
43#include "tree-scalar-evolution.h"
44#include "tree-pass.h"
45#include "lambda.h"
46
47/* Linear loop transforms include any composition of interchange,
48   scaling, skewing, and reversal.  They are used to change the
49   iteration order of loop nests in order to optimize data locality of
50   traversals, or remove dependences that prevent
51   parallelization/vectorization/etc.
52
53   TODO: Determine reuse vectors/matrix and use it to determine optimal
54   transform matrix for locality purposes.
55   TODO: Completion of partial transforms.  */
56
57/* Gather statistics for loop interchange.  LOOP is the loop being
58   considered. The first loop in the considered loop nest is
59   FIRST_LOOP, and consequently, the index of the considered loop is
60   obtained by LOOP->DEPTH - FIRST_LOOP->DEPTH
61
62   Initializes:
63   - DEPENDENCE_STEPS the sum of all the data dependence distances
64   carried by loop LOOP,
65
66   - NB_DEPS_NOT_CARRIED_BY_LOOP the number of dependence relations
67   for which the loop LOOP is not carrying any dependence,
68
69   - ACCESS_STRIDES the sum of all the strides in LOOP.
70
71   Example: for the following loop,
72
73   | loop_1 runs 1335 times
74   |   loop_2 runs 1335 times
75   |     A[{{0, +, 1}_1, +, 1335}_2]
76   |     B[{{0, +, 1}_1, +, 1335}_2]
77   |   endloop_2
78   |   A[{0, +, 1336}_1]
79   | endloop_1
80
81   gather_interchange_stats (in loop_1) will return
82   DEPENDENCE_STEPS = 3002
83   NB_DEPS_NOT_CARRIED_BY_LOOP = 5
84   ACCESS_STRIDES = 10694
85
86   gather_interchange_stats (in loop_2) will return
87   DEPENDENCE_STEPS = 3000
88   NB_DEPS_NOT_CARRIED_BY_LOOP = 7
89   ACCESS_STRIDES = 8010
90*/
91
92static void
93gather_interchange_stats (VEC (ddr_p, heap) *dependence_relations ATTRIBUTE_UNUSED,
94			  VEC (data_reference_p, heap) *datarefs ATTRIBUTE_UNUSED,
95			  struct loop *loop ATTRIBUTE_UNUSED,
96			  struct loop *first_loop ATTRIBUTE_UNUSED,
97			  unsigned int *dependence_steps ATTRIBUTE_UNUSED,
98			  unsigned int *nb_deps_not_carried_by_loop ATTRIBUTE_UNUSED,
99			  double_int *access_strides ATTRIBUTE_UNUSED)
100{
101  unsigned int i, j;
102  struct data_dependence_relation *ddr;
103  struct data_reference *dr;
104
105  *dependence_steps = 0;
106  *nb_deps_not_carried_by_loop = 0;
107  *access_strides = double_int_zero;
108
109  for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++)
110    {
111      /* If we don't know anything about this dependence, or the distance
112	 vector is NULL, or there is no dependence, then there is no reuse of
113	 data.  */
114      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
115	  || DDR_ARE_DEPENDENT (ddr) == chrec_known
116	  || DDR_NUM_DIST_VECTS (ddr) == 0)
117	continue;
118
119      for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
120	{
121	  int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth (first_loop)];
122
123	  if (dist == 0)
124	    (*nb_deps_not_carried_by_loop) += 1;
125
126	  else if (dist < 0)
127	    (*dependence_steps) += -dist;
128
129	  else
130	    (*dependence_steps) += dist;
131	}
132    }
133
134  /* Compute the access strides.  */
135  for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
136    {
137      unsigned int it;
138      tree ref = DR_REF (dr);
139      gimple stmt = DR_STMT (dr);
140      struct loop *stmt_loop = loop_containing_stmt (stmt);
141      struct loop *inner_loop = first_loop->inner;
142
143      if (inner_loop != stmt_loop
144	  && !flow_loop_nested_p (inner_loop, stmt_loop))
145	continue;
146
147      for (it = 0; it < DR_NUM_DIMENSIONS (dr);
148	   it++, ref = TREE_OPERAND (ref, 0))
149	{
150	  int num = am_vector_index_for_loop (DR_ACCESS_MATRIX (dr), loop->num);
151	  int istride = AM_GET_ACCESS_MATRIX_ELEMENT (DR_ACCESS_MATRIX (dr), it, num);
152	  tree array_size = TYPE_SIZE (TREE_TYPE (ref));
153	  double_int dstride;
154
155	  if (array_size == NULL_TREE
156	      || TREE_CODE (array_size) != INTEGER_CST)
157	    continue;
158
159	  dstride = double_int_mul (tree_to_double_int (array_size),
160				    shwi_to_double_int (istride));
161	  (*access_strides) = double_int_add (*access_strides, dstride);
162	}
163    }
164}
165
166/* Attempt to apply interchange transformations to TRANS to maximize the
167   spatial and temporal locality of the loop.
168   Returns the new transform matrix.  The smaller the reuse vector
169   distances in the inner loops, the fewer the cache misses.
170   FIRST_LOOP is the loop->num of the first loop in the analyzed loop
171   nest.  */
172
173
174static lambda_trans_matrix
175try_interchange_loops (lambda_trans_matrix trans,
176		       unsigned int depth,
177		       VEC (ddr_p, heap) *dependence_relations,
178		       VEC (data_reference_p, heap) *datarefs,
179		       struct loop *first_loop)
180{
181  bool res;
182  struct loop *loop_i;
183  struct loop *loop_j;
184  unsigned int dependence_steps_i, dependence_steps_j;
185  double_int access_strides_i, access_strides_j;
186  double_int small, large, nb_iter;
187  double_int l1_cache_size, l2_cache_size;
188  int cmp;
189  unsigned int nb_deps_not_carried_by_i, nb_deps_not_carried_by_j;
190  struct data_dependence_relation *ddr;
191
192  if (VEC_length (ddr_p, dependence_relations) == 0)
193    return trans;
194
195  /* When there is an unknown relation in the dependence_relations, we
196     know that it is no worth looking at this loop nest: give up.  */
197  ddr = VEC_index (ddr_p, dependence_relations, 0);
198  if (ddr == NULL || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
199    return trans;
200
201  l1_cache_size = uhwi_to_double_int (L1_CACHE_SIZE * 1024);
202  l2_cache_size = uhwi_to_double_int (L2_CACHE_SIZE * 1024);
203
204  /* LOOP_I is always the outer loop.  */
205  for (loop_j = first_loop->inner;
206       loop_j;
207       loop_j = loop_j->inner)
208    for (loop_i = first_loop;
209	 loop_depth (loop_i) < loop_depth (loop_j);
210	 loop_i = loop_i->inner)
211      {
212	gather_interchange_stats (dependence_relations, datarefs,
213				  loop_i, first_loop,
214				  &dependence_steps_i,
215				  &nb_deps_not_carried_by_i,
216				  &access_strides_i);
217	gather_interchange_stats (dependence_relations, datarefs,
218				  loop_j, first_loop,
219				  &dependence_steps_j,
220				  &nb_deps_not_carried_by_j,
221				  &access_strides_j);
222
223	/* Heuristics for loop interchange profitability:
224
225	   0. Don't transform if the smallest stride is larger than
226	      the L2 cache, or if the largest stride multiplied by the
227	      number of iterations is smaller than the L1 cache.
228
229	   1. (spatial locality) Inner loops should have smallest
230              dependence steps.
231
232	   2. (spatial locality) Inner loops should contain more
233	   dependence relations not carried by the loop.
234
235	   3. (temporal locality) Inner loops should have smallest
236	      array access strides.
237	*/
238
239	cmp = double_int_ucmp (access_strides_i, access_strides_j);
240	small = cmp < 0 ? access_strides_i : access_strides_j;
241	large = cmp < 0 ? access_strides_j : access_strides_i;
242
243	if (double_int_ucmp (small, l2_cache_size) > 0)
244	  continue;
245
246	res = cmp < 0 ?
247	  estimated_loop_iterations (loop_j, false, &nb_iter):
248	  estimated_loop_iterations (loop_i, false, &nb_iter);
249
250	if (res
251	    && double_int_ucmp (double_int_mul (large, nb_iter),
252				l1_cache_size) < 0)
253	  continue;
254
255	if (dependence_steps_i < dependence_steps_j
256	    || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j
257	    || cmp < 0)
258	  {
259	    lambda_matrix_row_exchange (LTM_MATRIX (trans),
260					loop_depth (loop_i) - loop_depth (first_loop),
261					loop_depth (loop_j) - loop_depth (first_loop));
262	    /* Validate the resulting matrix.  When the transformation
263	       is not valid, reverse to the previous transformation.  */
264	    if (!lambda_transform_legal_p (trans, depth, dependence_relations))
265	      lambda_matrix_row_exchange (LTM_MATRIX (trans),
266					  loop_depth (loop_i) - loop_depth (first_loop),
267					  loop_depth (loop_j) - loop_depth (first_loop));
268	  }
269      }
270
271  return trans;
272}
273
274/* Return the number of nested loops in LOOP_NEST, or 0 if the loops
275   are not perfectly nested.  */
276
277unsigned int
278perfect_loop_nest_depth (struct loop *loop_nest)
279{
280  struct loop *temp;
281  unsigned int depth = 1;
282
283  /* If it's not a loop nest, we don't want it.  We also don't handle
284     sibling loops properly, which are loops of the following form:
285
286     | for (i = 0; i < 50; i++)
287     |   {
288     |     for (j = 0; j < 50; j++)
289     |       {
290     |        ...
291     |       }
292     |     for (j = 0; j < 50; j++)
293     |       {
294     |        ...
295     |       }
296     |   }
297  */
298
299  if (!loop_nest->inner || !single_exit (loop_nest))
300    return 0;
301
302  for (temp = loop_nest->inner; temp; temp = temp->inner)
303    {
304      /* If we have a sibling loop or multiple exit edges, jump ship.  */
305      if (temp->next || !single_exit (temp))
306	return 0;
307
308      depth++;
309    }
310
311  return depth;
312}
313
314/* Perform a set of linear transforms on loops.  */
315
316void
317linear_transform_loops (void)
318{
319  bool modified = false;
320  loop_iterator li;
321  VEC(tree,heap) *oldivs = NULL;
322  VEC(tree,heap) *invariants = NULL;
323  VEC(tree,heap) *lambda_parameters = NULL;
324  VEC(gimple,heap) *remove_ivs = VEC_alloc (gimple, heap, 3);
325  struct loop *loop_nest;
326  gimple oldiv_stmt;
327  unsigned i;
328
329  FOR_EACH_LOOP (li, loop_nest, 0)
330    {
331      unsigned int depth = 0;
332      VEC (ddr_p, heap) *dependence_relations;
333      VEC (data_reference_p, heap) *datarefs;
334
335      lambda_loopnest before, after;
336      lambda_trans_matrix trans;
337      struct obstack lambda_obstack;
338      struct loop *loop;
339      VEC(loop_p,heap) *nest;
340
341      depth = perfect_loop_nest_depth (loop_nest);
342      if (depth == 0)
343	continue;
344
345      nest = VEC_alloc (loop_p, heap, 3);
346      for (loop = loop_nest; loop; loop = loop->inner)
347	VEC_safe_push (loop_p, heap, nest, loop);
348
349      gcc_obstack_init (&lambda_obstack);
350      VEC_truncate (tree, oldivs, 0);
351      VEC_truncate (tree, invariants, 0);
352      VEC_truncate (tree, lambda_parameters, 0);
353
354      datarefs = VEC_alloc (data_reference_p, heap, 10);
355      dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10);
356      if (!compute_data_dependences_for_loop (loop_nest, true, &datarefs,
357					      &dependence_relations))
358	goto free_and_continue;
359
360      lambda_collect_parameters (datarefs, &lambda_parameters);
361      if (!lambda_compute_access_matrices (datarefs, lambda_parameters, nest))
362	goto free_and_continue;
363
364      if (dump_file && (dump_flags & TDF_DETAILS))
365	dump_ddrs (dump_file, dependence_relations);
366
367      /* Build the transformation matrix.  */
368      trans = lambda_trans_matrix_new (depth, depth);
369      lambda_matrix_id (LTM_MATRIX (trans), depth);
370      trans = try_interchange_loops (trans, depth, dependence_relations,
371				     datarefs, loop_nest);
372
373      if (lambda_trans_matrix_id_p (trans))
374	{
375	  if (dump_file)
376	   fprintf (dump_file, "Won't transform loop. Optimal transform is the identity transform\n");
377	  goto free_and_continue;
378	}
379
380      /* Check whether the transformation is legal.  */
381      if (!lambda_transform_legal_p (trans, depth, dependence_relations))
382	{
383	  if (dump_file)
384	    fprintf (dump_file, "Can't transform loop, transform is illegal:\n");
385	  goto free_and_continue;
386	}
387
388      before = gcc_loopnest_to_lambda_loopnest (loop_nest, &oldivs,
389                                                &invariants, &lambda_obstack);
390
391      if (!before)
392	goto free_and_continue;
393
394      if (dump_file)
395	{
396	  fprintf (dump_file, "Before:\n");
397	  print_lambda_loopnest (dump_file, before, 'i');
398	}
399
400      after = lambda_loopnest_transform (before, trans, &lambda_obstack);
401
402      if (dump_file)
403	{
404	  fprintf (dump_file, "After:\n");
405	  print_lambda_loopnest (dump_file, after, 'u');
406	}
407
408      lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants,
409				       &remove_ivs,
410                                       after, trans, &lambda_obstack);
411      modified = true;
412
413      if (dump_file)
414	fprintf (dump_file, "Successfully transformed loop.\n");
415
416    free_and_continue:
417      obstack_free (&lambda_obstack, NULL);
418      free_dependence_relations (dependence_relations);
419      free_data_refs (datarefs);
420      VEC_free (loop_p, heap, nest);
421    }
422
423  for (i = 0; VEC_iterate (gimple, remove_ivs, i, oldiv_stmt); i++)
424    remove_iv (oldiv_stmt);
425
426  VEC_free (tree, heap, oldivs);
427  VEC_free (tree, heap, invariants);
428  VEC_free (gimple, heap, remove_ivs);
429  scev_reset ();
430
431  if (modified)
432    rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_full_phi);
433}
434