/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do that, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

       (0) obviously has PREFETCH_BEFORE 1
       (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	   location 64 iterations before it, and PREFETCH_MOD 64 (since
	   it hits the same cache line otherwise).
       (2) has PREFETCH_MOD 64
       (3) has PREFETCH_MOD 4
       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	   the cache line accessed by (4) is the same as the one accessed
	   by (5) with probability only 7/32.
       (5) has PREFETCH_MOD 1 as well.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still within this bound (starting with those with the lowest
      prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODO:
      -- write and use more general reuse analysis (that could also be used
	 in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif
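
/* For instance, with the default value of 50, two accesses are only
   considered to share a cache line (so that only one of them needs a
   prefetch) if the probability that they hit the same line is at
   least 95%.  */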

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif

/* A group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  bool write_p;			/* Is it a write?  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
  struct mem_ref *next;		/* The next reference in the group.  */
};
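
/* Roughly, in the example from the head comment, a[i] and a[i + 64] end up
   in a single group (same base, step 1) with deltas 0 and 64, while
   a[187*i] and a[187*i + 50] form another group with step 187 and deltas
   0 and 50.  */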

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It need not be possible for a write reference to reuse a read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;			/* Loop of the reference.  */
  tree stmt;				/* Statement of the reference.  */
  HOST_WIDE_INT *step;			/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;			/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain the information
   described in analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
	return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where
   DELTA and STEP are integer constants and iter is the number of iterations
   of LOOP.  The reference occurs in statement STMT.  Strips nonaddressable
   component references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is a write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
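  /* E.g., for the reference a[187 * i + 50] from the head comment this
     decomposition gives STEP 187 and DELTA 50.  */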
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that references that come earlier in
     the body precede the later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != MODIFY_EXPR)
	    continue;

	  lhs = TREE_OPERAND (stmt, 0);
	  rhs = TREE_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

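  /* E.g., with the default PREFETCH_BLOCK of 32 bytes and a forward step
     of 4 bytes, a prefetch is only useful once in every 8 iterations.  */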
  ref->prefetch_mod = PREFETCH_BLOCK / step;
}

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

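  /* E.g., ddown (7, 4) is 1 and ddown (-5, 4) is -2; i.e., we round
     towards minus infinity.  */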
  if (x >= 0)
    return x / (HOST_WIDE_INT) by;
  else
    return -((-x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by);
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former one.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and if so, how far apart the iterations in which this happens
     are.  */
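
  /* E.g., with PREFETCH_BLOCK 32, step 4, REF at delta 0 and BY at delta 40,
     hit_from below is 32 and prefetch_before is (32 + 3) / 4 = 8: after 8
     iterations REF starts touching cache lines that BY has already accessed,
     so prefetching REF only pays off in the first 8 iterations.  */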

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that the cache line size
     and the step are coprime (here we use the fact that PREFETCH_BLOCK is
     a power of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

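  /* E.g., with PREFETCH_BLOCK 32, step 40 and delta 120, the reduction above
     yields step 5, prefetch_block 4 and delta 15, so prefetch_before becomes
     3 -- indeed, three iterations later REF touches exactly the address at
     which BY started.  */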
  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now do not issue prefetches for references that would only be
     useful in the first few iterations (i.e., whose PREFETCH_BEFORE is
     not PREFETCH_ALL).  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned max_prefetches, n_prefetches;
  struct mem_ref *ref;
  bool any = false;

  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
    max_prefetches = SIMULTANEOUS_PREFETCHES;
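  /* E.g., with the default SIMULTANEOUS_PREFETCHES of 3, an unroll factor
     of 4 and an ahead distance of 10, the budget is (3 * 4) / 10 = 1
     prefetch for the whole unrolled body.  */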

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);

  if (!max_prefetches)
    return false;

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	ref->issue_prefetch_p = true;

	/* If prefetch_mod is less than unroll_factor, we need to insert
	   several prefetches for the reference.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
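	/* E.g., an unroll factor of 4 with prefetch_mod 1 costs four
	   prefetches, while prefetch_mod 4 or larger costs just one.  */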
	if (max_prefetches <= n_prefetches)
	  return true;

	max_prefetches -= n_prefetches;
	any = true;
      }

  return any;
}

/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base),
					true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}

/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC with the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of
   the number of iterations of LOOP is stored to DESC.  AHEAD is the number
   of iterations ahead that we need to prefetch.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ahead, unsigned ninsns,
			 struct tree_niter_desc *desc)
{
  unsigned upper_bound, size_factor, constraint_factor;
  unsigned factor, max_mod_constraint, ahead_factor;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* First check whether the loop is not too large to unroll.  */
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (size_factor <= 1)
    return 1;

  if (size_factor < upper_bound)
    upper_bound = size_factor;

  max_mod_constraint = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref)
	  && ref->prefetch_mod > max_mod_constraint)
	max_mod_constraint = ref->prefetch_mod;

  /* Set constraint_factor as large as needed to be able to satisfy the
     largest modulo constraint.  */
  constraint_factor = max_mod_constraint;

  /* If ahead is too large in comparison with the number of available
     prefetches, unroll the loop as much as needed to be able to prefetch
     at least partially some of the references in the loop.  */
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
		  / SIMULTANEOUS_PREFETCHES);
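  /* E.g., an ahead distance of 20 with the default SIMULTANEOUS_PREFETCHES
     of 3 gives an ahead_factor of 7.  */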

  /* Unroll as much as useful, but bound the code size growth.  */
  if (constraint_factor < ahead_factor)
    factor = ahead_factor;
  else
    factor = constraint_factor;
  if (factor > upper_bound)
    factor = upper_bound;

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}

/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the structure containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should not use the size of the loop, but rather the average
     number of instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
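  /* E.g., with the default PREFETCH_LATENCY of 200 and a loop body of 20
     insns, we aim to prefetch about 10 iterations ahead.  */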
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
					   &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls fewer times than the unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in LOOPS.  */

unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486
	 -march=pentium4.  -mtune=i486 causes us to have PREFETCH_BLOCK 0,
	 since this is part of the processor costs and i486 does not have
	 prefetch, while -march=pentium4 causes HAVE_prefetch to be true.
	 Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
			BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
			NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the cache line size is a power of two, so verify that
     here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];
      if (!loop)
	continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "Processing loop %d:\n", loop->num);

      unrolled |= loop_prefetch_arrays (loops, loop);

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}