tree-ssa-loop-prefetch.c revision 1.6
1/* Array prefetching.
2   Copyright (C) 2005-2015 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3.  If not see
18<http://www.gnu.org/licenses/>.  */
19
20#include "config.h"
21#include "system.h"
22#include "coretypes.h"
23#include "tm.h"
24#include "hash-set.h"
25#include "machmode.h"
26#include "vec.h"
27#include "double-int.h"
28#include "input.h"
29#include "alias.h"
30#include "symtab.h"
31#include "wide-int.h"
32#include "inchash.h"
33#include "tree.h"
34#include "fold-const.h"
35#include "stor-layout.h"
36#include "tm_p.h"
37#include "predict.h"
38#include "hard-reg-set.h"
39#include "function.h"
40#include "dominance.h"
41#include "cfg.h"
42#include "basic-block.h"
43#include "tree-pretty-print.h"
44#include "tree-ssa-alias.h"
45#include "internal-fn.h"
46#include "gimple-expr.h"
47#include "is-a.h"
48#include "gimple.h"
49#include "gimplify.h"
50#include "gimple-iterator.h"
51#include "gimplify-me.h"
52#include "gimple-ssa.h"
53#include "tree-ssa-loop-ivopts.h"
54#include "tree-ssa-loop-manip.h"
55#include "tree-ssa-loop-niter.h"
56#include "tree-ssa-loop.h"
57#include "tree-into-ssa.h"
58#include "cfgloop.h"
59#include "tree-pass.h"
60#include "insn-config.h"
61#include "tree-chrec.h"
62#include "tree-scalar-evolution.h"
63#include "diagnostic-core.h"
64#include "params.h"
65#include "langhooks.h"
66#include "tree-inline.h"
67#include "tree-data-ref.h"
68#include "diagnostic-core.h"
69
70
71/* FIXME: Needed for optabs, but this should all be moved to a TBD interface
72   between the GIMPLE and RTL worlds.  */
73#include "hashtab.h"
74#include "rtl.h"
75#include "flags.h"
76#include "statistics.h"
77#include "real.h"
78#include "fixed-value.h"
79#include "expmed.h"
80#include "dojump.h"
81#include "explow.h"
82#include "calls.h"
83#include "emit-rtl.h"
84#include "varasm.h"
85#include "stmt.h"
86#include "expr.h"
87#include "insn-codes.h"
88#include "optabs.h"
89#include "recog.h"
90
91/* This pass inserts prefetch instructions to optimize cache usage during
92   accesses to arrays in loops.  It processes loops sequentially and:
93
94   1) Gathers all memory references in the single loop.
95   2) For each of the references it decides when it is profitable to prefetch
96      it.  To do so, we evaluate the reuse among the accesses, and determine
97      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
98      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
99      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
100      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
101      (assuming cache line size is 64 bytes, char has size 1 byte and there
102      is no hardware sequential prefetch):
103
104      char *a;
105      for (i = 0; i < max; i++)
106	{
107	  a[255] = ...;		(0)
108	  a[i] = ...;		(1)
109	  a[i + 64] = ...;	(2)
110	  a[16*i] = ...;	(3)
111	  a[187*i] = ...;	(4)
112	  a[187*i + 50] = ...;	(5)
113	}
114
115       (0) obviously has PREFETCH_BEFORE 1
116       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
117           location 64 iterations before it, and PREFETCH_MOD 64 (since
118	   it hits the same cache line otherwise).
119       (2) has PREFETCH_MOD 64
120       (3) has PREFETCH_MOD 4
121       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
122           the cache line accessed by (5) is the same with probability only
123	   7/32.
124       (5) has PREFETCH_MOD 1 as well.
125
126      Additionally, we use data dependence analysis to determine for each
127      reference the distance till the first reuse; this information is used
128      to determine the temporality of the issued prefetch instruction.
129
130   3) We determine how much ahead we need to prefetch.  The number of
131      iterations needed is time to fetch / time spent in one iteration of
132      the loop.  We do not know either of these values, so we just make a
133      heuristic guess based on a magic (possibly) target-specific constant
134      and the size of the loop; a worked example follows this comment.
135
136   4) Determine which of the references we prefetch.  We take into account
137      that there is a maximum number of simultaneous prefetches (provided
138      by the machine description).  We issue as many prefetches as possible
139      while still within this bound (starting with those with lowest
140      prefetch_mod, since they are responsible for most of the cache
141      misses).
142
143   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
144      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
145      prefetching nonaccessed memory.
146      TODO -- actually implement peeling.
147
148   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
149      prefetch instructions with guards in cases where 5) was not sufficient
150      to satisfy the constraints?
151
152   A cost model is implemented to determine whether or not prefetching is
153   profitable for a given loop.  The cost model has three heuristics:
154
155   1. Function trip_count_to_ahead_ratio_too_small_p implements a
156      heuristic that determines whether or not the loop has too few
157      iterations (compared to ahead).  Prefetching is not likely to be
158      beneficial if the trip count to ahead ratio is below a certain
159      minimum.
160
161   2. Function mem_ref_count_reasonable_p implements a heuristic that
162      determines whether the given loop has enough CPU ops that can be
163      overlapped with cache missing memory ops.  If not, the loop
164      won't benefit from prefetching.  In the implementation,
165      prefetching is not considered beneficial if the ratio between
166      the instruction count and the mem ref count is below a certain
167      minimum.
168
169   3. Function insn_to_prefetch_ratio_too_small_p implements a
170      heuristic that disables prefetching in a loop if the prefetching
171      cost is above a certain limit.  The relative prefetching cost is
172      estimated by taking the ratio between the prefetch count and the
173      total instruction count (this models the I-cache cost).
174
175   The limits used in these heuristics are defined as parameters with
176   reasonable default values. Machine-specific default values will be
177   added later.
178
179   Some other TODO:
180      -- write and use more general reuse analysis (that could be also used
181	 in other cache aimed loop optimizations)
182      -- make it behave sanely together with the prefetches given by the user
183	 (now we just ignore them; at the very least we should avoid
184	 optimizing loops in which the user put their own prefetches)
185      -- we assume arrays are aligned to the cache line size; this could be
186	 improved.  */
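
/* For illustration (the concrete numbers are only assumptions): if fetching
   a cache line takes about 200 cycles and one iteration of the loop takes
   about 20 cycles, prefetches should be issued roughly 200 / 20 = 10
   iterations ahead.  With TRIP_COUNT_TO_AHEAD_RATIO == 4 (defined below),
   heuristic 1 above then requires an estimated trip count of at least
   4 * 10 = 40 iterations for prefetching to be considered.  */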
187
188/* Magic constants follow.  These should be replaced by machine specific
189   numbers.  */
190
191/* True if write can be prefetched by a read prefetch.  */
192
193#ifndef WRITE_CAN_USE_READ_PREFETCH
194#define WRITE_CAN_USE_READ_PREFETCH 1
195#endif
196
197/* True if read can be prefetched by a write prefetch. */
198
199#ifndef READ_CAN_USE_WRITE_PREFETCH
200#define READ_CAN_USE_WRITE_PREFETCH 0
201#endif
202
203/* The size of the block loaded by a single prefetch.  Usually, this is
204   the same as cache line size (at the moment, we only consider one level
205   of cache hierarchy).  */
206
207#ifndef PREFETCH_BLOCK
208#define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
209#endif
210
211/* Do we have forward hardware sequential prefetching?  */
212
213#ifndef HAVE_FORWARD_PREFETCH
214#define HAVE_FORWARD_PREFETCH 0
215#endif
216
217/* Do we have backward hardware sequential prefetching?  */
218
219#ifndef HAVE_BACKWARD_PREFETCH
220#define HAVE_BACKWARD_PREFETCH 0
221#endif
222
223/* In some cases we are only able to determine that there is a certain
224   probability that the two accesses hit the same cache line.  In this
225   case, we issue the prefetches for both of them if this probability
226   is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand.  */
227
228#ifndef ACCEPTABLE_MISS_RATE
229#define ACCEPTABLE_MISS_RATE 50
230#endif
231
232#ifndef HAVE_prefetch
233#define HAVE_prefetch 0
234#endif
235
236#define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
237#define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
238
239/* We consider a memory access nontemporal if it is not reused sooner than
240   after L2_CACHE_SIZE_BYTES of memory are accessed.  However, we ignore
241   accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
242   so that we use nontemporal prefetches e.g. if a single memory location
243   is accessed several times in a single iteration of the loop.  */
244#define NONTEMPORAL_FRACTION 16
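
/* For example, assuming a 32 kB L1 cache and a 512 kB L2 cache, a reference
   is considered nontemporal if at least 512 kB of data are accessed before
   its first reuse, while reuses within 32 kB / 16 = 2 kB are ignored for
   this purpose.  */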
245
246/* In case we have to emit a memory fence instruction after the loop that
247   uses nontemporal stores, this defines the builtin to use.  */
248
249#ifndef FENCE_FOLLOWING_MOVNT
250#define FENCE_FOLLOWING_MOVNT NULL_TREE
251#endif
252
253/* It is not profitable to prefetch when the trip count is not at
254   least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
255   For example, in a loop with a prefetch ahead distance of 10,
256   supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
257   profitable to prefetch when the trip count is greater or equal to
258   40.  In that case, 30 out of the 40 iterations will benefit from
259   prefetching.  */
260
261#ifndef TRIP_COUNT_TO_AHEAD_RATIO
262#define TRIP_COUNT_TO_AHEAD_RATIO 4
263#endif
264
265/* The group of references among which reuse may occur.  */
266
267struct mem_ref_group
268{
269  tree base;			/* Base of the reference.  */
270  tree step;			/* Step of the reference.  */
271  struct mem_ref *refs;		/* References in the group.  */
272  struct mem_ref_group *next;	/* Next group of references.  */
273};
274
275/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */
276
277#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)
278
279/* Do not generate a prefetch if the unroll factor is significantly less
280   than what is required by the prefetch.  This is to avoid redundant
281   prefetches.  For example, when prefetch_mod is 16 and unroll_factor is
282   2, prefetching requires unrolling the loop 16 times, but
283   the loop is actually unrolled twice.  In this case (ratio = 8),
284   prefetching is not likely to be beneficial.  */
285
286#ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
287#define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
288#endif
289
290/* Some of the prefetch computations have quadratic complexity.  We want to
291   avoid huge compile times and, therefore, want to limit the amount of
292   memory references per loop where we consider prefetching.  */
293
294#ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
295#define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
296#endif
297
298/* The memory reference.  */
299
300struct mem_ref
301{
302  gimple stmt;			/* Statement in which the reference appears.  */
303  tree mem;			/* The reference.  */
304  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
305  struct mem_ref_group *group;	/* The group of references it belongs to.  */
306  unsigned HOST_WIDE_INT prefetch_mod;
307				/* Prefetch only each PREFETCH_MOD-th
308				   iteration.  */
309  unsigned HOST_WIDE_INT prefetch_before;
310				/* Prefetch only first PREFETCH_BEFORE
311				   iterations.  */
312  unsigned reuse_distance;	/* The amount of data accessed before the first
313				   reuse of this value.  */
314  struct mem_ref *next;		/* The next reference in the group.  */
315  unsigned write_p : 1;		/* Is it a write?  */
316  unsigned independent_p : 1;	/* True if the reference is independent of
317				   all other references inside the loop.  */
318  unsigned issue_prefetch_p : 1;	/* Should we really issue the prefetch?  */
319  unsigned storent_p : 1;	/* True if we changed the store to a
320				   nontemporal one.  */
321};
322
323/* Dumps information about a memory reference to FILE.  */
324static void
325dump_mem_details (FILE *file, tree base, tree step,
326	    HOST_WIDE_INT delta, bool write_p)
327{
328  fprintf (file, "(base ");
329  print_generic_expr (file, base, TDF_SLIM);
330  fprintf (file, ", step ");
331  if (cst_and_fits_in_hwi (step))
332    fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
333  else
334    print_generic_expr (file, step, TDF_TREE);
335  fprintf (file, ")\n");
336  fprintf (file, "  delta ");
337  fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
338  fprintf (file, "\n");
339  fprintf (file, "  %s\n", write_p ? "write" : "read");
340  fprintf (file, "\n");
341}
342
343/* Dumps information about reference REF to FILE.  */
344
345static void
346dump_mem_ref (FILE *file, struct mem_ref *ref)
347{
348  fprintf (file, "Reference %p:\n", (void *) ref);
349
350  fprintf (file, "  group %p ", (void *) ref->group);
351
352  dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
353                   ref->write_p);
354}
355
356/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
357   exist.  */
358
359static struct mem_ref_group *
360find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
361{
362  struct mem_ref_group *group;
363
364  for (; *groups; groups = &(*groups)->next)
365    {
366      if (operand_equal_p ((*groups)->step, step, 0)
367	  && operand_equal_p ((*groups)->base, base, 0))
368	return *groups;
369
370      /* If step is an integer constant, keep the list of groups sorted
371         by decreasing step.  */
372        if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
373            && int_cst_value ((*groups)->step) < int_cst_value (step))
374	break;
375    }
376
377  group = XNEW (struct mem_ref_group);
378  group->base = base;
379  group->step = step;
380  group->refs = NULL;
381  group->next = *groups;
382  *groups = group;
383
384  return group;
385}
386
387/* Records a memory reference MEM in GROUP with offset DELTA and write status
388   WRITE_P.  The reference occurs in statement STMT.  */
389
390static void
391record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
392	    HOST_WIDE_INT delta, bool write_p)
393{
394  struct mem_ref **aref;
395
396  /* Do not record the same address twice.  */
397  for (aref = &group->refs; *aref; aref = &(*aref)->next)
398    {
399      /* It may not be possible for a write reference to reuse a read
400	 prefetch, or vice versa.  */
401      if (!WRITE_CAN_USE_READ_PREFETCH
402	  && write_p
403	  && !(*aref)->write_p)
404	continue;
405      if (!READ_CAN_USE_WRITE_PREFETCH
406	  && !write_p
407	  && (*aref)->write_p)
408	continue;
409
410      if ((*aref)->delta == delta)
411	return;
412    }
413
414  (*aref) = XNEW (struct mem_ref);
415  (*aref)->stmt = stmt;
416  (*aref)->mem = mem;
417  (*aref)->delta = delta;
418  (*aref)->write_p = write_p;
419  (*aref)->prefetch_before = PREFETCH_ALL;
420  (*aref)->prefetch_mod = 1;
421  (*aref)->reuse_distance = 0;
422  (*aref)->issue_prefetch_p = false;
423  (*aref)->group = group;
424  (*aref)->next = NULL;
425  (*aref)->independent_p = false;
426  (*aref)->storent_p = false;
427
428  if (dump_file && (dump_flags & TDF_DETAILS))
429    dump_mem_ref (dump_file, *aref);
430}
431
432/* Release memory references in GROUPS.  */
433
434static void
435release_mem_refs (struct mem_ref_group *groups)
436{
437  struct mem_ref_group *next_g;
438  struct mem_ref *ref, *next_r;
439
440  for (; groups; groups = next_g)
441    {
442      next_g = groups->next;
443      for (ref = groups->refs; ref; ref = next_r)
444	{
445	  next_r = ref->next;
446	  free (ref);
447	}
448      free (groups);
449    }
450}
451
452/* A structure used to pass arguments to idx_analyze_ref.  */
453
454struct ar_data
455{
456  struct loop *loop;			/* Loop of the reference.  */
457  gimple stmt;				/* Statement of the reference.  */
458  tree *step;				/* Step of the memory reference.  */
459  HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
460};
461
462/* Analyzes a single INDEX of a memory reference to obtain information
463   described in analyze_ref.  Callback for for_each_index.  */
464
465static bool
466idx_analyze_ref (tree base, tree *index, void *data)
467{
468  struct ar_data *ar_data = (struct ar_data *) data;
469  tree ibase, step, stepsize;
470  HOST_WIDE_INT idelta = 0, imult = 1;
471  affine_iv iv;
472
473  if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
474		  *index, &iv, true))
475    return false;
476  ibase = iv.base;
477  step = iv.step;
478
479  if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
480      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
481    {
482      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
483      ibase = TREE_OPERAND (ibase, 0);
484    }
485  if (cst_and_fits_in_hwi (ibase))
486    {
487      idelta += int_cst_value (ibase);
488      ibase = build_int_cst (TREE_TYPE (ibase), 0);
489    }
490
491  if (TREE_CODE (base) == ARRAY_REF)
492    {
493      stepsize = array_ref_element_size (base);
494      if (!cst_and_fits_in_hwi (stepsize))
495	return false;
496      imult = int_cst_value (stepsize);
497      step = fold_build2 (MULT_EXPR, sizetype,
498			  fold_convert (sizetype, step),
499			  fold_convert (sizetype, stepsize));
500      idelta *= imult;
501    }
502
503  if (*ar_data->step == NULL_TREE)
504    *ar_data->step = step;
505  else
506    *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
507				  fold_convert (sizetype, *ar_data->step),
508				  fold_convert (sizetype, step));
509  *ar_data->delta += idelta;
510  *index = ibase;
511
512  return true;
513}
514
515/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
516   STEP are integer constants and iter is number of iterations of LOOP.  The
517   reference occurs in statement STMT.  Strips nonaddressable component
518   references from REF_P.  */
519
520static bool
521analyze_ref (struct loop *loop, tree *ref_p, tree *base,
522	     tree *step, HOST_WIDE_INT *delta,
523	     gimple stmt)
524{
525  struct ar_data ar_data;
526  tree off;
527  HOST_WIDE_INT bit_offset;
528  tree ref = *ref_p;
529
530  *step = NULL_TREE;
531  *delta = 0;
532
533  /* First strip off the component references.  Ignore bitfields.
534     Also strip off the real and imaginary parts of a complex, so that
535     they can have the same base.  */
536  if (TREE_CODE (ref) == REALPART_EXPR
537      || TREE_CODE (ref) == IMAGPART_EXPR
538      || (TREE_CODE (ref) == COMPONENT_REF
539          && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
540    {
541      if (TREE_CODE (ref) == IMAGPART_EXPR)
542        *delta += int_size_in_bytes (TREE_TYPE (ref));
543      ref = TREE_OPERAND (ref, 0);
544    }
545
546  *ref_p = ref;
547
548  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
549    {
550      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
551      bit_offset = TREE_INT_CST_LOW (off);
552      gcc_assert (bit_offset % BITS_PER_UNIT == 0);
553
554      *delta += bit_offset / BITS_PER_UNIT;
555    }
556
557  *base = unshare_expr (ref);
558  ar_data.loop = loop;
559  ar_data.stmt = stmt;
560  ar_data.step = step;
561  ar_data.delta = delta;
562  return for_each_index (base, idx_analyze_ref, &ar_data);
563}
564
565/* Record a memory reference REF to the list REFS.  The reference occurs in
566   LOOP in statement STMT and it is write if WRITE_P.  Returns true if the
567   reference was recorded, false otherwise.  */
568
569static bool
570gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
571			      tree ref, bool write_p, gimple stmt)
572{
573  tree base, step;
574  HOST_WIDE_INT delta;
575  struct mem_ref_group *agrp;
576
577  if (get_base_address (ref) == NULL)
578    return false;
579
580  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
581    return false;
582  /* If analyze_ref fails the default is a NULL_TREE.  We can stop here.  */
583  if (step == NULL_TREE)
584    return false;
585
586  /* Stop if the address of BASE could not be taken.  */
587  if (may_be_nonaddressable_p (base))
588    return false;
589
590  /* Limit non-constant step prefetching only to the innermost loops and
591     only when the step is loop invariant in the entire loop nest. */
592  if (!cst_and_fits_in_hwi (step))
593    {
594      if (loop->inner != NULL)
595        {
596          if (dump_file && (dump_flags & TDF_DETAILS))
597            {
598              fprintf (dump_file, "Memory expression %p\n",(void *) ref );
599              print_generic_expr (dump_file, ref, TDF_TREE);
600              fprintf (dump_file,":");
601              dump_mem_details (dump_file, base, step, delta, write_p);
602              fprintf (dump_file,
603                       "Ignoring %p, non-constant step prefetching is "
604                       "limited to innermost loops\n",
605                       (void *) ref);
606            }
607            return false;
608         }
609      else
610        {
611          if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
612          {
613            if (dump_file && (dump_flags & TDF_DETAILS))
614              {
615                fprintf (dump_file, "Memory expression %p\n",(void *) ref );
616                print_generic_expr (dump_file, ref, TDF_TREE);
617                fprintf (dump_file,":");
618                dump_mem_details (dump_file, base, step, delta, write_p);
619                fprintf (dump_file,
620                         "Not prefetching, ignoring %p due to "
621                         "loop variant step\n",
622                         (void *) ref);
623              }
624              return false;
625            }
626        }
627    }
628
629  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
630     are integer constants.  */
631  agrp = find_or_create_group (refs, base, step);
632  record_ref (agrp, stmt, ref, delta, write_p);
633
634  return true;
635}
636
637/* Record the suitable memory references in LOOP.  NO_OTHER_REFS is set to
638   true if there are no other memory references inside the loop.  */
639
640static struct mem_ref_group *
641gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
642{
643  basic_block *body = get_loop_body_in_dom_order (loop);
644  basic_block bb;
645  unsigned i;
646  gimple_stmt_iterator bsi;
647  gimple stmt;
648  tree lhs, rhs;
649  struct mem_ref_group *refs = NULL;
650
651  *no_other_refs = true;
652  *ref_count = 0;
653
654  /* Scan the loop body in order, so that the earlier references precede the
655     later ones.  */
656  for (i = 0; i < loop->num_nodes; i++)
657    {
658      bb = body[i];
659      if (bb->loop_father != loop)
660	continue;
661
662      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
663	{
664	  stmt = gsi_stmt (bsi);
665
666	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
667	    {
668	      if (gimple_vuse (stmt)
669		  || (is_gimple_call (stmt)
670		      && !(gimple_call_flags (stmt) & ECF_CONST)))
671		*no_other_refs = false;
672	      continue;
673	    }
674
675	  lhs = gimple_assign_lhs (stmt);
676	  rhs = gimple_assign_rhs1 (stmt);
677
678	  if (REFERENCE_CLASS_P (rhs))
679	    {
680	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
681							    rhs, false, stmt);
682	    *ref_count += 1;
683	    }
684	  if (REFERENCE_CLASS_P (lhs))
685	    {
686	    *no_other_refs &= gather_memory_references_ref (loop, &refs,
687							    lhs, true, stmt);
688	    *ref_count += 1;
689	    }
690	}
691    }
692  free (body);
693
694  return refs;
695}
696
697/* Prune the prefetch candidate REF using the self-reuse.  */
698
699static void
700prune_ref_by_self_reuse (struct mem_ref *ref)
701{
702  HOST_WIDE_INT step;
703  bool backward;
704
705  /* If the step size is non-constant, we cannot calculate prefetch_mod.  */
706  if (!cst_and_fits_in_hwi (ref->group->step))
707    return;
708
709  step = int_cst_value (ref->group->step);
710
711  backward = step < 0;
712
713  if (step == 0)
714    {
715      /* Prefetch references to invariant address just once.  */
716      ref->prefetch_before = 1;
717      return;
718    }
719
720  if (backward)
721    step = -step;
722
723  if (step > PREFETCH_BLOCK)
724    return;
725
726  if ((backward && HAVE_BACKWARD_PREFETCH)
727      || (!backward && HAVE_FORWARD_PREFETCH))
728    {
729      ref->prefetch_before = 1;
730      return;
731    }
732
733  ref->prefetch_mod = PREFETCH_BLOCK / step;
734}
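
/* For instance, for the access a[16 * i] from the example at the top of this
   file (step 16 bytes, PREFETCH_BLOCK assumed to be 64 bytes),
   prune_ref_by_self_reuse computes PREFETCH_MOD = 64 / 16 = 4: four
   consecutive iterations touch the same cache line, so prefetching in every
   fourth iteration suffices.  */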
735
736/* Divides X by BY, rounding down.  */
737
738static HOST_WIDE_INT
739ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
740{
741  gcc_assert (by > 0);
742
743  if (x >= 0)
744    return x / by;
745  else
746    return (x + by - 1) / by;
747}
748
749/* Given a CACHE_LINE_SIZE and two inductive memory references
750   with a common STEP greater than CACHE_LINE_SIZE and an address
751   difference DELTA, compute the probability that they will fall
752   in different cache lines.  Return true if the computed miss rate
753   is not greater than the ACCEPTABLE_MISS_RATE.  DISTINCT_ITERS is the
754   number of distinct iterations after which the pattern repeats itself.
755   ALIGN_UNIT is the unit of alignment in bytes.  */
756
757static bool
758is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
759		   HOST_WIDE_INT step, HOST_WIDE_INT delta,
760		   unsigned HOST_WIDE_INT distinct_iters,
761		   int align_unit)
762{
763  unsigned align, iter;
764  int total_positions, miss_positions, max_allowed_miss_positions;
765  int address1, address2, cache_line1, cache_line2;
766
767  /* It always misses if delta is greater than or equal to the cache
768     line size.  */
769  if (delta >= (HOST_WIDE_INT) cache_line_size)
770    return false;
771
772  miss_positions = 0;
773  total_positions = (cache_line_size / align_unit) * distinct_iters;
774  max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
775
776  /* Iterate through all possible alignments of the first
777     memory reference within its cache line.  */
778  for (align = 0; align < cache_line_size; align += align_unit)
779
780    /* Iterate through all distinct iterations.  */
781    for (iter = 0; iter < distinct_iters; iter++)
782      {
783	address1 = align + step * iter;
784	address2 = address1 + delta;
785	cache_line1 = address1 / cache_line_size;
786	cache_line2 = address2 / cache_line_size;
787	if (cache_line1 != cache_line2)
788	  {
789	    miss_positions += 1;
790            if (miss_positions > max_allowed_miss_positions)
791	      return false;
792          }
793      }
794  return true;
795}
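
/* As an illustration (with assumed values): for a 64-byte cache line,
   STEP = 128, DELTA = 32, DISTINCT_ITERS = 1 and ALIGN_UNIT = 4, eight of
   the sixteen possible alignments place the two addresses in different
   cache lines, i.e. the miss rate is 500 per thousand, far above
   ACCEPTABLE_MISS_RATE, so the function returns false and the caller does
   not assume reuse between the two references.  */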
796
797/* Prune the prefetch candidate REF using the reuse with BY.
798   If BY_IS_BEFORE is true, BY is before REF in the loop.  */
799
800static void
801prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
802			  bool by_is_before)
803{
804  HOST_WIDE_INT step;
805  bool backward;
806  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
807  HOST_WIDE_INT delta = delta_b - delta_r;
808  HOST_WIDE_INT hit_from;
809  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
810  HOST_WIDE_INT reduced_step;
811  unsigned HOST_WIDE_INT reduced_prefetch_block;
812  tree ref_type;
813  int align_unit;
814
815  /* If the step is non-constant, we cannot calculate prefetch_before.  */
816  if (!cst_and_fits_in_hwi (ref->group->step)) {
817    return;
818  }
819
820  step = int_cst_value (ref->group->step);
821
822  backward = step < 0;
823
824
825  if (delta == 0)
826    {
827      /* If the references have the same address, only prefetch the
828	 former.  */
829      if (by_is_before)
830	ref->prefetch_before = 0;
831
832      return;
833    }
834
835  if (!step)
836    {
837      /* If the reference addresses are invariant and fall into the
838	 same cache line, prefetch just the first one.  */
839      if (!by_is_before)
840	return;
841
842      if (ddown (ref->delta, PREFETCH_BLOCK)
843	  != ddown (by->delta, PREFETCH_BLOCK))
844	return;
845
846      ref->prefetch_before = 0;
847      return;
848    }
849
850  /* Only prune the reference that is behind in the array.  */
851  if (backward)
852    {
853      if (delta > 0)
854	return;
855
856      /* Transform the data so that we may assume that the accesses
857	 are forward.  */
858      delta = - delta;
859      step = -step;
860      delta_r = PREFETCH_BLOCK - 1 - delta_r;
861      delta_b = PREFETCH_BLOCK - 1 - delta_b;
862    }
863  else
864    {
865      if (delta < 0)
866	return;
867    }
868
869  /* Check whether the two references are likely to hit the same cache
870     line, and how far apart the iterations in which it occurs are from
871     each other.  */
872
873  if (step <= PREFETCH_BLOCK)
874    {
875      /* The accesses are sure to meet.  Let us check when.  */
876      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
877      prefetch_before = (hit_from - delta_r + step - 1) / step;
878
879      /* Do not reduce prefetch_before if we meet beyond cache size.  */
880      if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
881        prefetch_before = PREFETCH_ALL;
882      if (prefetch_before < ref->prefetch_before)
883	ref->prefetch_before = prefetch_before;
884
885      return;
886    }
887
888  /* A more complicated case with step > prefetch_block.  First reduce
889     the ratio between the step and the cache line size to its simplest
890     terms.  The resulting denominator will then represent the number of
891     distinct iterations after which each address will go back to its
892     initial location within the cache line.  This computation assumes
893     that PREFETCH_BLOCK is a power of two.  */
894  prefetch_block = PREFETCH_BLOCK;
895  reduced_prefetch_block = prefetch_block;
896  reduced_step = step;
897  while ((reduced_step & 1) == 0
898	 && reduced_prefetch_block > 1)
899    {
900      reduced_step >>= 1;
901      reduced_prefetch_block >>= 1;
902    }
903
904  prefetch_before = delta / step;
905  delta %= step;
906  ref_type = TREE_TYPE (ref->mem);
907  align_unit = TYPE_ALIGN (ref_type) / 8;
908  if (is_miss_rate_acceptable (prefetch_block, step, delta,
909			       reduced_prefetch_block, align_unit))
910    {
911      /* Do not reduce prefetch_before if we meet beyond cache size.  */
912      if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
913        prefetch_before = PREFETCH_ALL;
914      if (prefetch_before < ref->prefetch_before)
915	ref->prefetch_before = prefetch_before;
916
917      return;
918    }
919
920  /* Try also the following iteration.  */
921  prefetch_before++;
922  delta = step - delta;
923  if (is_miss_rate_acceptable (prefetch_block, step, delta,
924			       reduced_prefetch_block, align_unit))
925    {
926      if (prefetch_before < ref->prefetch_before)
927	ref->prefetch_before = prefetch_before;
928
929      return;
930    }
931
932  /* The ref probably does not reuse by.  */
933  return;
934}
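
/* E.g. for the references a[i] and a[i + 64] from the example at the top of
   this file (step 1 byte, PREFETCH_BLOCK assumed to be 64 bytes), we get
   DELTA = 64 and the accesses are sure to meet: HIT_FROM = 64 and
   PREFETCH_BEFORE = (64 - 0 + 1 - 1) / 1 = 64, matching the value quoted in
   the overview comment.  */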
935
936/* Prune the prefetch candidate REF using the reuses with other references
937   in REFS.  */
938
939static void
940prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
941{
942  struct mem_ref *prune_by;
943  bool before = true;
944
945  prune_ref_by_self_reuse (ref);
946
947  for (prune_by = refs; prune_by; prune_by = prune_by->next)
948    {
949      if (prune_by == ref)
950	{
951	  before = false;
952	  continue;
953	}
954
955      if (!WRITE_CAN_USE_READ_PREFETCH
956	  && ref->write_p
957	  && !prune_by->write_p)
958	continue;
959      if (!READ_CAN_USE_WRITE_PREFETCH
960	  && !ref->write_p
961	  && prune_by->write_p)
962	continue;
963
964      prune_ref_by_group_reuse (ref, prune_by, before);
965    }
966}
967
968/* Prune the prefetch candidates in GROUP using the reuse analysis.  */
969
970static void
971prune_group_by_reuse (struct mem_ref_group *group)
972{
973  struct mem_ref *ref_pruned;
974
975  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
976    {
977      prune_ref_by_reuse (ref_pruned, group->refs);
978
979      if (dump_file && (dump_flags & TDF_DETAILS))
980	{
981	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
982
983	  if (ref_pruned->prefetch_before == PREFETCH_ALL
984	      && ref_pruned->prefetch_mod == 1)
985	    fprintf (dump_file, " no restrictions");
986	  else if (ref_pruned->prefetch_before == 0)
987	    fprintf (dump_file, " do not prefetch");
988	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
989	    fprintf (dump_file, " prefetch once");
990	  else
991	    {
992	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
993		{
994		  fprintf (dump_file, " prefetch before ");
995		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
996			   ref_pruned->prefetch_before);
997		}
998	      if (ref_pruned->prefetch_mod != 1)
999		{
1000		  fprintf (dump_file, " prefetch mod ");
1001		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
1002			   ref_pruned->prefetch_mod);
1003		}
1004	    }
1005	  fprintf (dump_file, "\n");
1006	}
1007    }
1008}
1009
1010/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */
1011
1012static void
1013prune_by_reuse (struct mem_ref_group *groups)
1014{
1015  for (; groups; groups = groups->next)
1016    prune_group_by_reuse (groups);
1017}
1018
1019/* Returns true if we should issue prefetch for REF.  */
1020
1021static bool
1022should_issue_prefetch_p (struct mem_ref *ref)
1023{
1024  /* For now do not issue prefetches for only first few of the
1025     iterations.  */
1026  if (ref->prefetch_before != PREFETCH_ALL)
1027    {
1028      if (dump_file && (dump_flags & TDF_DETAILS))
1029        fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
1030		 (void *) ref);
1031      return false;
1032    }
1033
1034  /* Do not prefetch nontemporal stores.  */
1035  if (ref->storent_p)
1036    {
1037      if (dump_file && (dump_flags & TDF_DETAILS))
1038        fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1039      return false;
1040    }
1041
1042  return true;
1043}
1044
1045/* Decide which of the prefetch candidates in GROUPS to prefetch.
1046   AHEAD is the number of iterations to prefetch ahead (which corresponds
1047   to the number of simultaneous instances of one prefetch running at a
1048   time).  UNROLL_FACTOR is the factor by which the loop is going to be
1049   unrolled.  Returns true if there is anything to prefetch.  */
1050
1051static bool
1052schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1053		     unsigned ahead)
1054{
1055  unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1056  unsigned slots_per_prefetch;
1057  struct mem_ref *ref;
1058  bool any = false;
1059
1060  /* At most SIMULTANEOUS_PREFETCHES should be running at the same time.  */
1061  remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1062
1063  /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1064     AHEAD / UNROLL_FACTOR iterations of the unrolled loop.  In each iteration,
1065     it will need a prefetch slot.  */
1066  slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1067  if (dump_file && (dump_flags & TDF_DETAILS))
1068    fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1069	     slots_per_prefetch);
1070
1071  /* For now we just take memory references one by one and issue
1072     prefetches for as many as possible.  The groups are sorted
1073     starting with the largest step, since the references with
1074     large step are more likely to cause many cache misses.  */
1075
1076  for (; groups; groups = groups->next)
1077    for (ref = groups->refs; ref; ref = ref->next)
1078      {
1079	if (!should_issue_prefetch_p (ref))
1080	  continue;
1081
1082        /* The loop is far from being sufficiently unrolled for this
1083           prefetch.  Do not generate the prefetch to avoid many redundant
1084           prefetches.  */
1085        if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1086          continue;
1087
1088	/* If we need to prefetch the reference each PREFETCH_MOD iterations,
1089	   and we unroll the loop UNROLL_FACTOR times, we need to insert
1090	   ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1091	   iteration.  */
1092	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1093			/ ref->prefetch_mod);
1094	prefetch_slots = n_prefetches * slots_per_prefetch;
1095
1096	/* If more than half of the prefetches would be lost anyway, do not
1097	   issue the prefetch.  */
1098	if (2 * remaining_prefetch_slots < prefetch_slots)
1099	  continue;
1100
1101	ref->issue_prefetch_p = true;
1102
1103	if (remaining_prefetch_slots <= prefetch_slots)
1104	  return true;
1105	remaining_prefetch_slots -= prefetch_slots;
1106	any = true;
1107      }
1108
1109  return any;
1110}
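
/* A small example with assumed numbers: with AHEAD = 10 and
   UNROLL_FACTOR = 4, each prefetch occupies (10 + 2) / 4 = 3 slots.
   Assuming SIMULTANEOUS_PREFETCHES = 6, a reference with prefetch_mod = 4
   needs one prefetch per unrolled body (3 slots) and is scheduled, leaving
   3 slots; a later reference with prefetch_mod = 1 would need 4 prefetches
   (12 slots), so more than half of them would be lost and it is skipped.  */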
1111
1112/* Return TRUE if no prefetch is going to be generated in the given
1113   GROUPS.  */
1114
1115static bool
1116nothing_to_prefetch_p (struct mem_ref_group *groups)
1117{
1118  struct mem_ref *ref;
1119
1120  for (; groups; groups = groups->next)
1121    for (ref = groups->refs; ref; ref = ref->next)
1122      if (should_issue_prefetch_p (ref))
1123	return false;
1124
1125  return true;
1126}
1127
1128/* Estimate the number of prefetches in the given GROUPS.
1129   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1130
1131static int
1132estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1133{
1134  struct mem_ref *ref;
1135  unsigned n_prefetches;
1136  int prefetch_count = 0;
1137
1138  for (; groups; groups = groups->next)
1139    for (ref = groups->refs; ref; ref = ref->next)
1140      if (should_issue_prefetch_p (ref))
1141	{
1142	  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1143			  / ref->prefetch_mod);
1144	  prefetch_count += n_prefetches;
1145	}
1146
1147  return prefetch_count;
1148}
1149
1150/* Issue prefetches for the reference REF into the loop as decided before.
1151   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
1152   is the factor by which LOOP was unrolled.  */
1153
1154static void
1155issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1156{
1157  HOST_WIDE_INT delta;
1158  tree addr, addr_base, write_p, local, forward;
1159  gcall *prefetch;
1160  gimple_stmt_iterator bsi;
1161  unsigned n_prefetches, ap;
1162  bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1163
1164  if (dump_file && (dump_flags & TDF_DETAILS))
1165    fprintf (dump_file, "Issued%s prefetch for %p.\n",
1166	     nontemporal ? " nontemporal" : "",
1167	     (void *) ref);
1168
1169  bsi = gsi_for_stmt (ref->stmt);
1170
1171  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1172		  / ref->prefetch_mod);
1173  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1174  addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1175					true, NULL, true, GSI_SAME_STMT);
1176  write_p = ref->write_p ? integer_one_node : integer_zero_node;
1177  local = nontemporal ? integer_zero_node : integer_three_node;
1178
1179  for (ap = 0; ap < n_prefetches; ap++)
1180    {
1181      if (cst_and_fits_in_hwi (ref->group->step))
1182        {
1183          /* Determine the address to prefetch.  */
1184          delta = (ahead + ap * ref->prefetch_mod) *
1185		   int_cst_value (ref->group->step);
1186          addr = fold_build_pointer_plus_hwi (addr_base, delta);
1187          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1188                                           true, GSI_SAME_STMT);
1189        }
1190      else
1191        {
1192          /* The step size is non-constant but loop-invariant.  We use the
1193             heuristic of simply prefetching AHEAD iterations ahead.  */
1194          forward = fold_build2 (MULT_EXPR, sizetype,
1195                                 fold_convert (sizetype, ref->group->step),
1196                                 fold_convert (sizetype, size_int (ahead)));
1197          addr = fold_build_pointer_plus (addr_base, forward);
1198          addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1199					   NULL, true, GSI_SAME_STMT);
1200      }
1201      /* Create the prefetch instruction.  */
1202      prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1203				    3, addr, write_p, local);
1204      gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1205    }
1206}
1207
1208/* Issue prefetches for the references in GROUPS into the loop as decided
1209   before.  AHEAD is the number of iterations to prefetch ahead.
1210   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */
1211
1212static void
1213issue_prefetches (struct mem_ref_group *groups,
1214		  unsigned unroll_factor, unsigned ahead)
1215{
1216  struct mem_ref *ref;
1217
1218  for (; groups; groups = groups->next)
1219    for (ref = groups->refs; ref; ref = ref->next)
1220      if (ref->issue_prefetch_p)
1221	issue_prefetch_ref (ref, unroll_factor, ahead);
1222}
1223
1224/* Returns true if REF is a memory write for which a nontemporal store insn
1225   can be used.  */
1226
1227static bool
1228nontemporal_store_p (struct mem_ref *ref)
1229{
1230  machine_mode mode;
1231  enum insn_code code;
1232
1233  /* REF must be a write that is not reused.  We require it to be independent
1234     of all other memory references in the loop, as the nontemporal stores may
1235     be reordered with respect to other memory references.  */
1236  if (!ref->write_p
1237      || !ref->independent_p
1238      || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1239    return false;
1240
1241  /* Check that we have the storent instruction for the mode.  */
1242  mode = TYPE_MODE (TREE_TYPE (ref->mem));
1243  if (mode == BLKmode)
1244    return false;
1245
1246  code = optab_handler (storent_optab, mode);
1247  return code != CODE_FOR_nothing;
1248}
1249
1250/* If REF is a nontemporal store, we mark the corresponding modify statement
1251   and return true.  Otherwise, we return false.  */
1252
1253static bool
1254mark_nontemporal_store (struct mem_ref *ref)
1255{
1256  if (!nontemporal_store_p (ref))
1257    return false;
1258
1259  if (dump_file && (dump_flags & TDF_DETAILS))
1260    fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1261	     (void *) ref);
1262
1263  gimple_assign_set_nontemporal_move (ref->stmt, true);
1264  ref->storent_p = true;
1265
1266  return true;
1267}
1268
1269/* Issue a memory fence instruction after LOOP.  */
1270
1271static void
1272emit_mfence_after_loop (struct loop *loop)
1273{
1274  vec<edge> exits = get_loop_exit_edges (loop);
1275  edge exit;
1276  gcall *call;
1277  gimple_stmt_iterator bsi;
1278  unsigned i;
1279
1280  FOR_EACH_VEC_ELT (exits, i, exit)
1281    {
1282      call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1283
1284      if (!single_pred_p (exit->dest)
1285	  /* If possible, we prefer not to insert the fence on other paths
1286	     in cfg.  */
1287	  && !(exit->flags & EDGE_ABNORMAL))
1288	split_loop_exit_edge (exit);
1289      bsi = gsi_after_labels (exit->dest);
1290
1291      gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1292    }
1293
1294  exits.release ();
1295  update_ssa (TODO_update_ssa_only_virtuals);
1296}
1297
1298/* Returns true if we can use storent in loop, false otherwise.  */
1299
1300static bool
1301may_use_storent_in_loop_p (struct loop *loop)
1302{
1303  bool ret = true;
1304
1305  if (loop->inner != NULL)
1306    return false;
1307
1308  /* If we must issue a mfence insn after using storent, check that there
1309     is a suitable place for it at each of the loop exits.  */
1310  if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1311    {
1312      vec<edge> exits = get_loop_exit_edges (loop);
1313      unsigned i;
1314      edge exit;
1315
1316      FOR_EACH_VEC_ELT (exits, i, exit)
1317	if ((exit->flags & EDGE_ABNORMAL)
1318	    && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun))
1319	  ret = false;
1320
1321      exits.release ();
1322    }
1323
1324  return ret;
1325}
1326
1327/* Marks nontemporal stores in LOOP.  GROUPS contains the description of memory
1328   references in the loop.  */
1329
1330static void
1331mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1332{
1333  struct mem_ref *ref;
1334  bool any = false;
1335
1336  if (!may_use_storent_in_loop_p (loop))
1337    return;
1338
1339  for (; groups; groups = groups->next)
1340    for (ref = groups->refs; ref; ref = ref->next)
1341      any |= mark_nontemporal_store (ref);
1342
1343  if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1344    emit_mfence_after_loop (loop);
1345}
1346
1347/* Determines whether we can profitably unroll LOOP FACTOR times, and if
1348   this is the case, fill in DESC with the description of the number of
1349   iterations.  */
1350
1351static bool
1352should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1353		      unsigned factor)
1354{
1355  if (!can_unroll_loop_p (loop, factor, desc))
1356    return false;
1357
1358  /* We only consider loops without control flow for unrolling.  This is not
1359     a hard restriction -- tree_unroll_loop works with arbitrary loops
1360     as well; but the unrolling/prefetching is usually more profitable for
1361     loops consisting of a single basic block, and we want to limit the
1362     code growth.  */
1363  if (loop->num_nodes > 2)
1364    return false;
1365
1366  return true;
1367}
1368
1369/* Determine the factor by which to unroll LOOP, from the information
1370   contained in the list of memory references REFS.  The description of the
1371   number of iterations of LOOP is stored to DESC.  NINSNS is the number of
1372   insns of the LOOP.  EST_NITER is the estimated number of iterations of
1373   the loop, or -1 if no estimate is available.  */
1374
1375static unsigned
1376determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1377			 unsigned ninsns, struct tree_niter_desc *desc,
1378			 HOST_WIDE_INT est_niter)
1379{
1380  unsigned upper_bound;
1381  unsigned nfactor, factor, mod_constraint;
1382  struct mem_ref_group *agp;
1383  struct mem_ref *ref;
1384
1385  /* First check whether the loop is not too large to unroll.  We ignore
1386     PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1387     from unrolling them enough to make exactly one cache line covered by each
1388     iteration.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1389     us from unrolling the loops too many times in cases where we only expect
1390     gains from better scheduling and decreasing loop overhead, which is not
1391     the case here.  */
1392  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1393
1394  /* If we unrolled the loop more times than it iterates, the unrolled version
1395     of the loop would never be entered.  */
1396  if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1397    upper_bound = est_niter;
1398
1399  if (upper_bound <= 1)
1400    return 1;
1401
1402  /* Choose the factor so that we may prefetch each cache line just once,
1403     but bound the unrolling by UPPER_BOUND.  */
1404  factor = 1;
1405  for (agp = refs; agp; agp = agp->next)
1406    for (ref = agp->refs; ref; ref = ref->next)
1407      if (should_issue_prefetch_p (ref))
1408	{
1409	  mod_constraint = ref->prefetch_mod;
1410	  nfactor = least_common_multiple (mod_constraint, factor);
1411	  if (nfactor <= upper_bound)
1412	    factor = nfactor;
1413	}
1414
1415  if (!should_unroll_loop_p (loop, desc, factor))
1416    return 1;
1417
1418  return factor;
1419}
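
/* For example (numbers assumed for illustration): with two prefetched
   references whose prefetch_mod values are 4 and 6, the factor grows from 1
   to lcm (4, 1) = 4 and then to lcm (6, 4) = 12, provided 12 does not exceed
   UPPER_BOUND; with PARAM_MAX_UNROLLED_INSNS = 200, NINSNS = 12 and a large
   or unknown trip count, UPPER_BOUND = 16, so the chosen unroll factor is 12
   (subject to should_unroll_loop_p).  */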
1420
1421/* Returns the total volume of the memory references REFS, taking into account
1422   reuses in the innermost loop and cache line size.  TODO -- we should also
1423   take into account reuses across the iterations of the loops in the loop
1424   nest.  */
1425
1426static unsigned
1427volume_of_references (struct mem_ref_group *refs)
1428{
1429  unsigned volume = 0;
1430  struct mem_ref_group *gr;
1431  struct mem_ref *ref;
1432
1433  for (gr = refs; gr; gr = gr->next)
1434    for (ref = gr->refs; ref; ref = ref->next)
1435      {
1436	/* Almost always reuses another value?  */
1437	if (ref->prefetch_before != PREFETCH_ALL)
1438	  continue;
1439
1440	/* If several iterations access the same cache line, use the size of
1441	   the line divided by this number.  Otherwise, a cache line is
1442	   accessed in each iteration.  TODO -- in the latter case, we should
1443	   take the size of the reference into account, rounding it up on cache
1444	   line size multiple.  */
1445	volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1446      }
1447  return volume;
1448}
1449
1450/* Returns the volume of memory references accessed across VEC iterations of
1451   loops, whose sizes are described in the LOOP_SIZES array.  N is the number
1452   of the loops in the nest (length of VEC and LOOP_SIZES vectors).  */
1453
1454static unsigned
1455volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1456{
1457  unsigned i;
1458
1459  for (i = 0; i < n; i++)
1460    if (vec[i] != 0)
1461      break;
1462
1463  if (i == n)
1464    return 0;
1465
1466  gcc_assert (vec[i] > 0);
1467
1468  /* We ignore the parts of the distance vector in subloops, since usually
1469     the numbers of iterations are much smaller.  */
1470  return loop_sizes[i] * vec[i];
1471}
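
/* For instance, with a two-level nest whose per-iteration data volumes are
   LOOP_SIZES = { 65536, 512 } (outer, inner; numbers assumed for
   illustration), a distance vector (0, 4) yields 512 * 4 = 2048 bytes
   accessed between the two uses, while (1, 0) yields 65536 bytes.  */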
1472
1473/* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1474   at the position corresponding to the loop of the step.  N is the depth
1475   of the considered loop nest, and LOOP is its innermost loop.  */
1476
1477static void
1478add_subscript_strides (tree access_fn, unsigned stride,
1479		       HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1480{
1481  struct loop *aloop;
1482  tree step;
1483  HOST_WIDE_INT astep;
1484  unsigned min_depth = loop_depth (loop) - n;
1485
1486  while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1487    {
1488      aloop = get_chrec_loop (access_fn);
1489      step = CHREC_RIGHT (access_fn);
1490      access_fn = CHREC_LEFT (access_fn);
1491
1492      if ((unsigned) loop_depth (aloop) <= min_depth)
1493	continue;
1494
1495      if (tree_fits_shwi_p (step))
1496	astep = tree_to_shwi (step);
1497      else
1498	astep = L1_CACHE_LINE_SIZE;
1499
1500      strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1501
1502    }
1503}
1504
1505/* Returns the volume of memory references accessed between two consecutive
1506   self-reuses of the reference DR.  We consider the subscripts of DR in N
1507   loops, and LOOP_SIZES contains the volumes of accesses in each of the
1508   loops.  LOOP is the innermost loop of the current loop nest.  */
1509
1510static unsigned
1511self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1512		     struct loop *loop)
1513{
1514  tree stride, access_fn;
1515  HOST_WIDE_INT *strides, astride;
1516  vec<tree> access_fns;
1517  tree ref = DR_REF (dr);
1518  unsigned i, ret = ~0u;
1519
1520  /* In the following example:
1521
1522     for (i = 0; i < N; i++)
1523       for (j = 0; j < N; j++)
1524         use (a[j][i]);
1525     the same cache line is accessed each N steps (except if the change from
1526     i to i + 1 crosses the boundary of the cache line).  Thus, for self-reuse,
1527     we cannot rely purely on the results of the data dependence analysis.
1528
1529     Instead, we compute the stride of the reference in each loop, and consider
1530     the innermost loop in which the stride is less than the cache size.  */
1531
1532  strides = XCNEWVEC (HOST_WIDE_INT, n);
1533  access_fns = DR_ACCESS_FNS (dr);
1534
1535  FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1536    {
1537      /* Keep track of the reference corresponding to the subscript, so that we
1538	 know its stride.  */
1539      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1540	ref = TREE_OPERAND (ref, 0);
1541
1542      if (TREE_CODE (ref) == ARRAY_REF)
1543	{
1544	  stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1545	  if (tree_fits_uhwi_p (stride))
1546	    astride = tree_to_uhwi (stride);
1547	  else
1548	    astride = L1_CACHE_LINE_SIZE;
1549
1550	  ref = TREE_OPERAND (ref, 0);
1551	}
1552      else
1553	astride = 1;
1554
1555      add_subscript_strides (access_fn, astride, strides, n, loop);
1556    }
1557
1558  for (i = n; i-- > 0; )
1559    {
1560      unsigned HOST_WIDE_INT s;
1561
1562      s = strides[i] < 0 ?  -strides[i] : strides[i];
1563
1564      if (s < (unsigned) L1_CACHE_LINE_SIZE
1565	  && (loop_sizes[i]
1566	      > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1567	{
1568	  ret = loop_sizes[i];
1569	  break;
1570	}
1571    }
1572
1573  free (strides);
1574  return ret;
1575}
1576
1577/* Determines the distance till the first reuse of each reference in REFS
1578   in the loop nest of LOOP.  NO_OTHER_REFS is true if there are no other
1579   memory references in the loop.  Return false if the analysis fails.  */
1580
1581static bool
1582determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1583			   bool no_other_refs)
1584{
1585  struct loop *nest, *aloop;
1586  vec<data_reference_p> datarefs = vNULL;
1587  vec<ddr_p> dependences = vNULL;
1588  struct mem_ref_group *gr;
1589  struct mem_ref *ref, *refb;
1590  vec<loop_p> vloops = vNULL;
1591  unsigned *loop_data_size;
1592  unsigned i, j, n;
1593  unsigned volume, dist, adist;
1594  HOST_WIDE_INT vol;
1595  data_reference_p dr;
1596  ddr_p dep;
1597
1598  if (loop->inner)
1599    return true;
1600
1601  /* Find the outermost loop of the loop nest of LOOP (we require that
1602     there are no sibling loops inside the nest).  */
1603  nest = loop;
1604  while (1)
1605    {
1606      aloop = loop_outer (nest);
1607
1608      if (aloop == current_loops->tree_root
1609	  || aloop->inner->next)
1610	break;
1611
1612      nest = aloop;
1613    }
1614
1615  /* For each loop, determine the amount of data accessed in each iteration.
1616     We use this to estimate whether the reference is evicted from the
1617     cache before its reuse.  */
1618  find_loop_nest (nest, &vloops);
1619  n = vloops.length ();
1620  loop_data_size = XNEWVEC (unsigned, n);
1621  volume = volume_of_references (refs);
1622  i = n;
1623  while (i-- != 0)
1624    {
1625      loop_data_size[i] = volume;
1626      /* Bound the volume by the L2 cache size, since above this bound,
1627	 all dependence distances are equivalent.  */
1628      if (volume > L2_CACHE_SIZE_BYTES)
1629	continue;
1630
1631      aloop = vloops[i];
1632      vol = estimated_stmt_executions_int (aloop);
1633      if (vol == -1)
1634	vol = expected_loop_iterations (aloop);
1635      volume *= vol;
1636    }
1637
1638  /* Prepare the references in the form suitable for data dependence
1639     analysis.  We ignore unanalyzable data references (the results
1640     are used just as a heuristic to estimate the temporality of the
1641     references, hence we do not need to worry about correctness).  */
1642  for (gr = refs; gr; gr = gr->next)
1643    for (ref = gr->refs; ref; ref = ref->next)
1644      {
1645	dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1646			      ref->mem, ref->stmt, !ref->write_p);
1647
1648	if (dr)
1649	  {
1650	    ref->reuse_distance = volume;
1651	    dr->aux = ref;
1652	    datarefs.safe_push (dr);
1653	  }
1654	else
1655	  no_other_refs = false;
1656      }
1657
1658  FOR_EACH_VEC_ELT (datarefs, i, dr)
1659    {
1660      dist = self_reuse_distance (dr, loop_data_size, n, loop);
1661      ref = (struct mem_ref *) dr->aux;
1662      if (ref->reuse_distance > dist)
1663	ref->reuse_distance = dist;
1664
1665      if (no_other_refs)
1666	ref->independent_p = true;
1667    }
1668
1669  if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1670    return false;
1671
1672  FOR_EACH_VEC_ELT (dependences, i, dep)
1673    {
1674      if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1675	continue;
1676
1677      ref = (struct mem_ref *) DDR_A (dep)->aux;
1678      refb = (struct mem_ref *) DDR_B (dep)->aux;
1679
1680      if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1681	  || DDR_NUM_DIST_VECTS (dep) == 0)
1682	{
1683	  /* If the dependence cannot be analyzed, assume that there might be
1684	     a reuse.  */
1685	  dist = 0;
1686
1687	  ref->independent_p = false;
1688	  refb->independent_p = false;
1689	}
1690      else
1691	{
1692	  /* The distance vectors are normalized to be always lexicographically
1693	     positive, hence we cannot tell just from them whether DDR_A comes
1694	     before DDR_B or vice versa.  However, it is not important,
1695	     anyway -- if DDR_A is close to DDR_B, then it is either reused in
1696	     DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1697	     in cache (and marking it as nontemporal would not affect
1698	     anything).  */
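	  /* Illustrative example: a distance vector of (0, 2) in a two-level
	     nest means the value is reused two iterations of the innermost
	     loop later, so the amount of data accessed in between is roughly
	     twice the per-iteration volume of that loop.  */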
1699
1700	  dist = volume;
1701	  for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1702	    {
1703	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1704					     loop_data_size, n);
1705
1706	      /* If this is a dependence in the innermost loop (i.e., the
1707		 distances in all superloops are zero) and it is not
1708		 the trivial self-dependence with distance zero, record that
1709		 the references are not completely independent.  */
1710	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1711		  && (ref != refb
1712		      || DDR_DIST_VECT (dep, j)[n-1] != 0))
1713		{
1714		  ref->independent_p = false;
1715		  refb->independent_p = false;
1716		}
1717
1718	      /* Ignore accesses closer than
1719		 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1720		 so that we use nontemporal prefetches, e.g. if a single memory
1721		 location is accessed several times in a single iteration of
1722		 the loop.  */
1723	      if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1724		continue;
1725
1726	      if (adist < dist)
1727		dist = adist;
1728	    }
1729	}
1730
1731      if (ref->reuse_distance > dist)
1732	ref->reuse_distance = dist;
1733      if (refb->reuse_distance > dist)
1734	refb->reuse_distance = dist;
1735    }
1736
1737  free_dependence_relations (dependences);
1738  free_data_refs (datarefs);
1739  free (loop_data_size);
1740
1741  if (dump_file && (dump_flags & TDF_DETAILS))
1742    {
1743      fprintf (dump_file, "Reuse distances:\n");
1744      for (gr = refs; gr; gr = gr->next)
1745	for (ref = gr->refs; ref; ref = ref->next)
1746	  fprintf (dump_file, " ref %p distance %u\n",
1747		   (void *) ref, ref->reuse_distance);
1748    }
1749
1750  return true;
1751}
1752
1753/* Determine whether or not the trip count to ahead ratio is too small based
1754   on profitability considerations.
1755   AHEAD: the iteration ahead distance,
1756   EST_NITER: the estimated trip count.  */
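/* For instance, with an (illustrative) TRIP_COUNT_TO_AHEAD_RATIO of 4, a loop
   with AHEAD == 8 that is estimated to iterate fewer than 32 times is not
   prefetched: the prefetches issued in the final AHEAD iterations fetch data
   the loop never uses, and too few iterations remain for the rest to pay
   off.  */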
1757
1758static bool
1759trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1760{
1761  /* Assume trip count to ahead ratio is big enough if the trip count could not
1762     be estimated at compile time.  */
1763  if (est_niter < 0)
1764    return false;
1765
1766  if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1767    {
1768      if (dump_file && (dump_flags & TDF_DETAILS))
1769	fprintf (dump_file,
1770		 "Not prefetching -- loop estimated to roll only %d times\n",
1771		 (int) est_niter);
1772      return true;
1773    }
1774
1775  return false;
1776}
1777
1778/* Determine whether or not the number of memory references in the loop is
1779   reasonable based on profitability and compilation-time considerations.
1780   NINSNS: estimated number of instructions in the loop,
1781   MEM_REF_COUNT: total number of memory references in the loop.  */
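/* As an illustrative example, a loop with NINSNS == 40 and MEM_REF_COUNT == 5
   has an instruction to memory reference ratio of 8, which passes for any
   PREFETCH_MIN_INSN_TO_MEM_RATIO of at most 8, whereas the same 40
   instructions with 20 memory references (ratio 2) are rejected whenever that
   minimum exceeds 2.  */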
1782
1783static bool
1784mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1785{
1786  int insn_to_mem_ratio;
1787
1788  if (mem_ref_count == 0)
1789    return false;
1790
1791  /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1792     (compute_all_dependences) are quadratic in the number of memory
1793     references, so their cost can be substantial.  To avoid excessive
1794     compilation time, we give up on prefetching if mem_ref_count is too large.  */
1795  if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1796    return false;
1797
1798  /* Prefetching improves performance by overlapping cache-missing
1799     memory accesses with CPU operations.  If the loop does not have
1800     enough CPU operations to overlap with memory operations, prefetching
1801     won't give a significant benefit.  One approximate way of checking
1802     this is to require the ratio of instructions to memory references to
1803     be above a certain limit.  This approximation works well in practice.
1804     TODO: Implement a more precise computation by estimating the time
1805     for each CPU or memory op in the loop. Time estimates for memory ops
1806     should account for cache misses.  */
1807  insn_to_mem_ratio = ninsns / mem_ref_count;
1808
1809  if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1810    {
1811      if (dump_file && (dump_flags & TDF_DETAILS))
1812        fprintf (dump_file,
1813		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1814		 insn_to_mem_ratio);
1815      return false;
1816    }
1817
1818  return true;
1819}
1820
1821/* Determine whether or not the instruction to prefetch ratio in the loop is
1822   too small based on profitability considerations.
1823   NINSNS: estimated number of instructions in the loop,
1824   PREFETCH_COUNT: an estimate of the number of prefetches,
1825   UNROLL_FACTOR:  the factor to unroll the loop if prefetching.  */
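/* Illustrative example: with NINSNS == 10, UNROLL_FACTOR == 4 and
   PREFETCH_COUNT == 8, the estimated ratio is (4 * 10) / 8 == 5, so the loop
   is rejected for any MIN_INSN_TO_PREFETCH_RATIO above 5.  */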
1826
1827static bool
1828insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1829                                     unsigned unroll_factor)
1830{
1831  int insn_to_prefetch_ratio;
1832
1833  /* Prefetching most likely causes performance degradation when the instruction
1834     to prefetch ratio is too small.  Too many prefetch instructions in a loop
1835     may reduce the I-cache performance.
1836     (unroll_factor * ninsns) is used to estimate the number of instructions in
1837     the unrolled loop.  This implementation is a bit simplistic -- the number
1838     of issued prefetch instructions is also affected by unrolling.  So,
1839     prefetch_mod and the unroll factor should be taken into account when
1840     determining prefetch_count.  Also, the number of insns of the unrolled
1841     loop will usually be significantly smaller than the number of insns of the
1842     original loop * unroll_factor (at least the induction variable increments
1843     and the exit branches will get eliminated), so it might be better to use
1844     tree_estimate_loop_size + estimated_unrolled_size.  */
1845  insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1846  if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1847    {
1848      if (dump_file && (dump_flags & TDF_DETAILS))
1849        fprintf (dump_file,
1850		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1851		 insn_to_prefetch_ratio);
1852      return true;
1853    }
1854
1855  return false;
1856}
1857
1858
1859/* Issue prefetch instructions for array references in LOOP.  Returns
1860   true if the LOOP was unrolled.  */
1861
1862static bool
1863loop_prefetch_arrays (struct loop *loop)
1864{
1865  struct mem_ref_group *refs;
1866  unsigned ahead, ninsns, time, unroll_factor;
1867  HOST_WIDE_INT est_niter;
1868  struct tree_niter_desc desc;
1869  bool unrolled = false, no_other_refs;
1870  unsigned prefetch_count;
1871  unsigned mem_ref_count;
1872
1873  if (optimize_loop_nest_for_size_p (loop))
1874    {
1875      if (dump_file && (dump_flags & TDF_DETAILS))
1876	fprintf (dump_file, "  ignored (cold area)\n");
1877      return false;
1878    }
1879
1880  /* FIXME: the time should be weighted by the probabilities of the blocks in
1881     the loop body.  */
1882  time = tree_num_loop_insns (loop, &eni_time_weights);
1883  if (time == 0)
1884    return false;
1885
1886  ahead = (PREFETCH_LATENCY + time - 1) / time;
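  /* AHEAD is the prefetch latency expressed in loop iterations, rounded up.
     E.g. (illustrative numbers) with PREFETCH_LATENCY == 200 and a loop body
     costing TIME == 7, ahead == (200 + 6) / 7 == 29 iterations.  */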
1887  est_niter = estimated_stmt_executions_int (loop);
1888  if (est_niter == -1)
1889    est_niter = max_stmt_executions_int (loop);
1890
1891  /* Prefetching is not likely to be profitable if the trip count to ahead
1892     ratio is too small.  */
1893  if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1894    return false;
1895
1896  ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1897
1898  /* Step 1: gather the memory references.  */
1899  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1900
1901  /* Give up prefetching if the number of memory references in the
1902     loop is not reasonable based on profitability and compilation time
1903     considerations.  */
1904  if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1905    goto fail;
1906
1907  /* Step 2: estimate the reuse effects.  */
1908  prune_by_reuse (refs);
1909
1910  if (nothing_to_prefetch_p (refs))
1911    goto fail;
1912
1913  if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1914    goto fail;
1915
1916  /* Step 3: determine unroll factor.  */
1917  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1918					   est_niter);
1919
1920  /* Estimate prefetch count for the unrolled loop.  */
1921  prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1922  if (prefetch_count == 0)
1923    goto fail;
1924
1925  if (dump_file && (dump_flags & TDF_DETAILS))
1926    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1927	     HOST_WIDE_INT_PRINT_DEC "\n"
1928	     "insn count %d, mem ref count %d, prefetch count %d\n",
1929	     ahead, unroll_factor, est_niter,
1930	     ninsns, mem_ref_count, prefetch_count);
1931
1932  /* Prefetching is not likely to be profitable if the instruction to prefetch
1933     ratio is too small.  */
1934  if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1935					  unroll_factor))
1936    goto fail;
1937
1938  mark_nontemporal_stores (loop, refs);
1939
1940  /* Step 4: what to prefetch?  */
1941  if (!schedule_prefetches (refs, unroll_factor, ahead))
1942    goto fail;
1943
1944  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
1945     iterations so that we do not issue superfluous prefetches.  */
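  /* Illustrative example: a reference with PREFETCH_MOD == 4 only needs a
     prefetch every fourth iteration, so unrolling by 4 lets us emit a single
     unconditional prefetch per unrolled body instead of issuing one in every
     original iteration.  */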
1946  if (unroll_factor != 1)
1947    {
1948      tree_unroll_loop (loop, unroll_factor,
1949			single_dom_exit (loop), &desc);
1950      unrolled = true;
1951    }
1952
1953  /* Step 6: issue the prefetches.  */
1954  issue_prefetches (refs, unroll_factor, ahead);
1955
1956fail:
1957  release_mem_refs (refs);
1958  return unrolled;
1959}
1960
1961/* Issue prefetch instructions for array references in loops.  */
1962
1963unsigned int
1964tree_ssa_prefetch_arrays (void)
1965{
1966  struct loop *loop;
1967  bool unrolled = false;
1968  int todo_flags = 0;
1969
1970  if (!HAVE_prefetch
1971      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1972	 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1973	 of the processor costs and i486 does not have prefetch, but
1974	 -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1975      || PREFETCH_BLOCK == 0)
1976    return 0;
1977
1978  if (dump_file && (dump_flags & TDF_DETAILS))
1979    {
1980      fprintf (dump_file, "Prefetching parameters:\n");
1981      fprintf (dump_file, "    simultaneous prefetches: %d\n",
1982	       SIMULTANEOUS_PREFETCHES);
1983      fprintf (dump_file, "    prefetch latency: %d\n", PREFETCH_LATENCY);
1984      fprintf (dump_file, "    prefetch block size: %d\n", PREFETCH_BLOCK);
1985      fprintf (dump_file, "    L1 cache size: %d lines, %d kB\n",
1986	       L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1987      fprintf (dump_file, "    L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1988      fprintf (dump_file, "    L2 cache size: %d kB\n", L2_CACHE_SIZE);
1989      fprintf (dump_file, "    min insn-to-prefetch ratio: %d\n",
1990	       MIN_INSN_TO_PREFETCH_RATIO);
1991      fprintf (dump_file, "    min insn-to-mem ratio: %d\n",
1992	       PREFETCH_MIN_INSN_TO_MEM_RATIO);
1993      fprintf (dump_file, "\n");
1994    }
1995
1996  initialize_original_copy_tables ();
1997
1998  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1999    {
2000      tree type = build_function_type_list (void_type_node,
2001					    const_ptr_type_node, NULL_TREE);
2002      tree decl = add_builtin_function ("__builtin_prefetch", type,
2003					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
2004					NULL, NULL_TREE);
2005      DECL_IS_NOVOPS (decl) = true;
2006      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2007    }
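  /* The calls later emitted by issue_prefetches are roughly of the form
     __builtin_prefetch (addr, write_p, locality); only the address argument
     is mandatory for this builtin, the read/write and locality hints being
     optional.  */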
2008
2009  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
2010    {
2011      if (dump_file && (dump_flags & TDF_DETAILS))
2012	fprintf (dump_file, "Processing loop %d:\n", loop->num);
2013
2014      unrolled |= loop_prefetch_arrays (loop);
2015
2016      if (dump_file && (dump_flags & TDF_DETAILS))
2017	fprintf (dump_file, "\n\n");
2018    }
2019
2020  if (unrolled)
2021    {
2022      scev_reset ();
2023      todo_flags |= TODO_cleanup_cfg;
2024    }
2025
2026  free_original_copy_tables ();
2027  return todo_flags;
2028}
2029
2030/* Prefetching.  */
2031
2032namespace {
2033
2034const pass_data pass_data_loop_prefetch =
2035{
2036  GIMPLE_PASS, /* type */
2037  "aprefetch", /* name */
2038  OPTGROUP_LOOP, /* optinfo_flags */
2039  TV_TREE_PREFETCH, /* tv_id */
2040  ( PROP_cfg | PROP_ssa ), /* properties_required */
2041  0, /* properties_provided */
2042  0, /* properties_destroyed */
2043  0, /* todo_flags_start */
2044  0, /* todo_flags_finish */
2045};
2046
2047class pass_loop_prefetch : public gimple_opt_pass
2048{
2049public:
2050  pass_loop_prefetch (gcc::context *ctxt)
2051    : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2052  {}
2053
2054  /* opt_pass methods: */
2055  virtual bool gate (function *) { return flag_prefetch_loop_arrays > 0; }
2056  virtual unsigned int execute (function *);
2057
2058}; // class pass_loop_prefetch
2059
2060unsigned int
2061pass_loop_prefetch::execute (function *fun)
2062{
2063  if (number_of_loops (fun) <= 1)
2064    return 0;
2065
2066  if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0)
2067    {
2068      static bool warned = false;
2069
2070      if (!warned)
2071	{
2072	  warning (OPT_Wdisabled_optimization,
2073		   "%<l1-cache-size%> parameter is not a power of two %d",
2074		   PREFETCH_BLOCK);
2075	  warned = true;
2076	}
2077      return 0;
2078    }
2079
2080  return tree_ssa_prefetch_arrays ();
2081}
2082
2083} // anon namespace
2084
2085gimple_opt_pass *
2086make_pass_loop_prefetch (gcc::context *ctxt)
2087{
2088  return new pass_loop_prefetch (ctxt);
2089}
2090
2091
2092