loop-unroll.c revision 146895
1/* Loop unrolling and peeling.
2   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free
8Software Foundation; either version 2, or (at your option) any later
9version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING.  If not, write to the Free
18Software Foundation, 59 Temple Place - Suite 330, Boston, MA
1902111-1307, USA.  */
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "tm.h"
25#include "rtl.h"
26#include "hard-reg-set.h"
27#include "basic-block.h"
28#include "cfgloop.h"
29#include "cfglayout.h"
30#include "params.h"
31#include "output.h"
32#include "expr.h"
33/* We need to use the macro exact_log2. */
34#include "toplev.h"
35
36/* This pass performs loop unrolling and peeling.  We only perform these
37   optimizations on innermost loops (with single exception) because
38   the impact on performance is greatest here, and we want to avoid
39   unnecessary code size growth.  The gain is caused by greater sequentiality
40   of code, better code to optimize for further passes and in some cases
41   by fewer testings of exit conditions.  The main problem is code growth,
42   that impacts performance negatively due to effect of caches.
43
44   What we do:
45
46   -- complete peeling of once-rolling loops; this is the above mentioned
47      exception, as this causes loop to be cancelled completely and
48      does not cause code growth
49   -- complete peeling of loops that roll (small) constant times.
50   -- simple peeling of first iterations of loops that do not roll much
51      (according to profile feedback)
52   -- unrolling of loops that roll constant times; this is almost always
53      win, as we get rid of exit condition tests.
54   -- unrolling of loops that roll number of times that we can compute
55      in runtime; we also get rid of exit condition tests here, but there
56      is the extra expense for calculating the number of iterations
57   -- simple unrolling of remaining loops; this is performed only if we
58      are asked to, as the gain is questionable in this case and often
59      it may even slow down the code
60   For more detailed descriptions of each of those, see comments at
61   appropriate function below.
62
63   There is a lot of parameters (defined and described in params.def) that
64   control how much we unroll/peel.
65
66   ??? A great problem is that we don't have a good way how to determine
67   how many times we should unroll the loop; the experiments I have made
68   showed that this choice may affect performance in order of several %.
69   */
70
71static void decide_unrolling_and_peeling (struct loops *, int);
72static void peel_loops_completely (struct loops *, int);
73static void decide_peel_simple (struct loop *, int);
74static void decide_peel_once_rolling (struct loop *, int);
75static void decide_peel_completely (struct loop *, int);
76static void decide_unroll_stupid (struct loop *, int);
77static void decide_unroll_constant_iterations (struct loop *, int);
78static void decide_unroll_runtime_iterations (struct loop *, int);
79static void peel_loop_simple (struct loops *, struct loop *);
80static void peel_loop_completely (struct loops *, struct loop *);
81static void unroll_loop_stupid (struct loops *, struct loop *);
82static void unroll_loop_constant_iterations (struct loops *, struct loop *);
83static void unroll_loop_runtime_iterations (struct loops *, struct loop *);
84static void expand_bct (edge, int);
85static bool discard_increment (struct loop *);
86
87/* Unroll and/or peel (depending on FLAGS) LOOPS.  */
88void
89unroll_and_peel_loops (struct loops *loops, int flags)
90{
91  struct loop *loop, *next;
92  int check;
93
94  /* First perform complete loop peeling (it is almost surely a win,
95     and affects parameters for further decision a lot).  */
96  peel_loops_completely (loops, flags);
97
98  /* Now decide rest of unrolling and peeling.  */
99  decide_unrolling_and_peeling (loops, flags);
100
101  loop = loops->tree_root;
102  while (loop->inner)
103    loop = loop->inner;
104
105  /* Scan the loops, inner ones first.  */
106  while (loop != loops->tree_root)
107    {
108      if (loop->next)
109	{
110	  next = loop->next;
111	  while (next->inner)
112	    next = next->inner;
113	}
114      else
115	next = loop->outer;
116
117      check = 1;
118      /* And perform the appropriate transformations.  */
119      switch (loop->lpt_decision.decision)
120	{
121	case LPT_PEEL_COMPLETELY:
122	  /* Already done.  */
123	  abort ();
124	case LPT_PEEL_SIMPLE:
125	  peel_loop_simple (loops, loop);
126	  break;
127	case LPT_UNROLL_CONSTANT:
128	  unroll_loop_constant_iterations (loops, loop);
129	  break;
130	case LPT_UNROLL_RUNTIME:
131	  unroll_loop_runtime_iterations (loops, loop);
132	  break;
133	case LPT_UNROLL_STUPID:
134	  unroll_loop_stupid (loops, loop);
135	  break;
136	case LPT_NONE:
137	  check = 0;
138	  break;
139	default:
140	  abort ();
141	}
142      if (check)
143	{
144#ifdef ENABLE_CHECKING
145	  verify_dominators (CDI_DOMINATORS);
146	  verify_loop_structure (loops);
147#endif
148	}
149      loop = next;
150    }
151}
152
153/* Check whether to peel LOOPS (depending on FLAGS) completely and do so.  */
154static void
155peel_loops_completely (struct loops *loops, int flags)
156{
157  struct loop *loop, *next;
158
159  loop = loops->tree_root;
160  while (loop->inner)
161    loop = loop->inner;
162
163  while (loop != loops->tree_root)
164    {
165      if (loop->next)
166	{
167	  next = loop->next;
168	  while (next->inner)
169	    next = next->inner;
170	}
171      else
172	next = loop->outer;
173
174      loop->lpt_decision.decision = LPT_NONE;
175      loop->has_desc = 0;
176
177      if (rtl_dump_file)
178	fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n",
179		 loop->num);
180
181      loop->ninsns = num_loop_insns (loop);
182
183      decide_peel_once_rolling (loop, flags);
184      if (loop->lpt_decision.decision == LPT_NONE)
185	decide_peel_completely (loop, flags);
186
187      if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
188	{
189	  peel_loop_completely (loops, loop);
190#ifdef ENABLE_CHECKING
191	  verify_dominators (CDI_DOMINATORS);
192	  verify_loop_structure (loops);
193#endif
194	}
195      loop = next;
196    }
197}
198
199/* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much.  */
200static void
201decide_unrolling_and_peeling (struct loops *loops, int flags)
202{
203  struct loop *loop = loops->tree_root, *next;
204
205  while (loop->inner)
206    loop = loop->inner;
207
208  /* Scan the loops, inner ones first.  */
209  while (loop != loops->tree_root)
210    {
211      if (loop->next)
212	{
213	  next = loop->next;
214	  while (next->inner)
215	    next = next->inner;
216	}
217      else
218	next = loop->outer;
219
220      loop->lpt_decision.decision = LPT_NONE;
221
222      if (rtl_dump_file)
223	fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num);
224
225      /* Do not peel cold areas.  */
226      if (!maybe_hot_bb_p (loop->header))
227	{
228	  if (rtl_dump_file)
229	    fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
230	  loop = next;
231	  continue;
232	}
233
234      /* Can the loop be manipulated?  */
235      if (!can_duplicate_loop_p (loop))
236	{
237	  if (rtl_dump_file)
238	    fprintf (rtl_dump_file,
239		     ";; Not considering loop, cannot duplicate\n");
240	  loop = next;
241	  continue;
242	}
243
244      /* Skip non-innermost loops.  */
245      if (loop->inner)
246	{
247	  if (rtl_dump_file)
248	    fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
249	  loop = next;
250	  continue;
251	}
252
253      loop->ninsns = num_loop_insns (loop);
254      loop->av_ninsns = average_num_loop_insns (loop);
255
256      /* Try transformations one by one in decreasing order of
257	 priority.  */
258
259      decide_unroll_constant_iterations (loop, flags);
260      if (loop->lpt_decision.decision == LPT_NONE)
261	decide_unroll_runtime_iterations (loop, flags);
262      if (loop->lpt_decision.decision == LPT_NONE)
263	decide_unroll_stupid (loop, flags);
264      if (loop->lpt_decision.decision == LPT_NONE)
265	decide_peel_simple (loop, flags);
266
267      loop = next;
268    }
269}
270
271/* Decide whether the LOOP is once rolling and suitable for complete
272   peeling.  */
273static void
274decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
275{
276  if (rtl_dump_file)
277    fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n");
278
279  /* Is the loop small enough?  */
280  if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
281    {
282      if (rtl_dump_file)
283	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
284      return;
285    }
286
287  /* Check for simple loops.  */
288  loop->simple = simple_loop_p (loop, &loop->desc);
289  loop->has_desc = 1;
290
291  /* Check number of iterations.  */
292  if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0)
293    {
294      if (rtl_dump_file)
295	fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n");
296      return;
297    }
298
299  /* Success.  */
300  if (rtl_dump_file)
301    fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n");
302  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
303}
304
305/* Decide whether the LOOP is suitable for complete peeling.  */
306static void
307decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
308{
309  unsigned npeel;
310
311  if (rtl_dump_file)
312    fprintf (rtl_dump_file, ";; Considering peeling completely\n");
313
314  /* Skip non-innermost loops.  */
315  if (loop->inner)
316    {
317      if (rtl_dump_file)
318	fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n");
319      return;
320    }
321
322  /* Do not peel cold areas.  */
323  if (!maybe_hot_bb_p (loop->header))
324    {
325      if (rtl_dump_file)
326	fprintf (rtl_dump_file, ";; Not considering loop, cold area\n");
327      return;
328    }
329
330  /* Can the loop be manipulated?  */
331  if (!can_duplicate_loop_p (loop))
332    {
333      if (rtl_dump_file)
334	fprintf (rtl_dump_file,
335		 ";; Not considering loop, cannot duplicate\n");
336      return;
337    }
338
339  /* npeel = number of iterations to peel.  */
340  npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
341  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
342    npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
343
344  /* Is the loop small enough?  */
345  if (!npeel)
346    {
347      if (rtl_dump_file)
348	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
349      return;
350    }
351
352  /* Check for simple loops.  */
353  if (!loop->has_desc)
354    {
355      loop->simple = simple_loop_p (loop, &loop->desc);
356      loop->has_desc = 1;
357    }
358
359  /* Check number of iterations.  */
360  if (!loop->simple || !loop->desc.const_iter)
361    {
362      if (rtl_dump_file)
363	fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
364      return;
365    }
366
367  if (loop->desc.niter > npeel - 1)
368    {
369      if (rtl_dump_file)
370	{
371	  fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much (");
372	  fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter);
373	  fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
374	}
375      return;
376    }
377
378  /* Success.  */
379  if (rtl_dump_file)
380    fprintf (rtl_dump_file, ";; Decided to peel loop completely\n");
381  loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
382}
383
384/* Peel all iterations of LOOP, remove exit edges and cancel the loop
385   completely.  The transformation done:
386
387   for (i = 0; i < 4; i++)
388     body;
389
390   ==>
391
392   i = 0;
393   body; i++;
394   body; i++;
395   body; i++;
396   body; i++;
397   */
398static void
399peel_loop_completely (struct loops *loops, struct loop *loop)
400{
401  sbitmap wont_exit;
402  unsigned HOST_WIDE_INT npeel;
403  unsigned n_remove_edges, i;
404  edge *remove_edges;
405  struct loop_desc *desc = &loop->desc;
406  bool discard_inc = false;
407  bool is_bct;
408
409  if ((is_bct = is_bct_cond (BB_END (loop->desc.out_edge->src))))
410    discard_inc = discard_increment (loop);
411
412  npeel = desc->niter;
413
414  if (npeel)
415    {
416      wont_exit = sbitmap_alloc (npeel + 1);
417      sbitmap_ones (wont_exit);
418      RESET_BIT (wont_exit, 0);
419      if (desc->may_be_zero)
420	RESET_BIT (wont_exit, 1);
421
422      remove_edges = xcalloc (npeel, sizeof (edge));
423      n_remove_edges = 0;
424
425      if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
426		loops, npeel,
427		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
428		DLTHE_FLAG_UPDATE_FREQ))
429	abort ();
430
431      free (wont_exit);
432
433      /* Expand the branch and count.  */
434      if (is_bct)
435	for (i = 0; i < n_remove_edges; i++)
436	  expand_bct (remove_edges[i], discard_inc);
437
438      /* Remove the exit edges.  */
439      for (i = 0; i < n_remove_edges; i++)
440	remove_path (loops, remove_edges[i]);
441      free (remove_edges);
442    }
443
444  /* Expand the branch and count.  */
445  if (is_bct)
446    expand_bct (desc->in_edge, discard_inc);
447
448  /* Now remove the unreachable part of the last iteration and cancel
449     the loop.  */
450  remove_path (loops, desc->in_edge);
451
452  if (rtl_dump_file)
453    fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
454}
455
456/* Decide whether to unroll LOOP iterating constant number of times and how much.  */
457static void
458decide_unroll_constant_iterations (struct loop *loop, int flags)
459{
460  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i;
461
462  if (!(flags & UAP_UNROLL))
463    {
464      /* We were not asked to, just return back silently.  */
465      return;
466    }
467
468  if (rtl_dump_file)
469    fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n");
470
471  /* nunroll = total number of copies of the original loop body in
472     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
473  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
474  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
475  if (nunroll > nunroll_by_av)
476    nunroll = nunroll_by_av;
477  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
478    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
479
480  /* Skip big loops.  */
481  if (nunroll <= 1)
482    {
483      if (rtl_dump_file)
484	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
485      return;
486    }
487
488  /* Check for simple loops.  */
489  if (!loop->has_desc)
490    {
491      loop->simple = simple_loop_p (loop, &loop->desc);
492      loop->has_desc = 1;
493    }
494
495  /* Check number of iterations.  */
496  if (!loop->simple || !loop->desc.const_iter)
497    {
498      if (rtl_dump_file)
499	fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n");
500      return;
501    }
502
503  /* Check whether the loop rolls enough to consider.  */
504  if (loop->desc.niter < 2 * nunroll)
505    {
506      if (rtl_dump_file)
507	fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
508      return;
509    }
510
511  /* Success; now compute number of iterations to unroll.  We alter
512     nunroll so that as few as possible copies of loop body are
513     necessary, while still not decreasing the number of unrollings
514     too much (at most by 1).  */
515  best_copies = 2 * nunroll + 10;
516
517  i = 2 * nunroll + 2;
518  if ((unsigned) i - 1 >= loop->desc.niter)
519    i = loop->desc.niter - 2;
520
521  for (; i >= nunroll - 1; i--)
522    {
523      unsigned exit_mod = loop->desc.niter % (i + 1);
524
525      if (loop->desc.postincr)
526	n_copies = exit_mod + i + 1;
527      else if (exit_mod != (unsigned) i || loop->desc.may_be_zero)
528	n_copies = exit_mod + i + 2;
529      else
530	n_copies = i + 1;
531
532      if (n_copies < best_copies)
533	{
534	  best_copies = n_copies;
535	  best_unroll = i;
536	}
537    }
538
539  if (rtl_dump_file)
540    fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n",
541	     best_unroll + 1, best_copies, nunroll);
542
543  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
544  loop->lpt_decision.times = best_unroll;
545}
546
547/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1
548   times.  The transformation does this:
549
550   for (i = 0; i < 102; i++)
551     body;
552
553   ==>
554
555   i = 0;
556   body; i++;
557   body; i++;
558   while (i < 102)
559     {
560       body; i++;
561       body; i++;
562       body; i++;
563       body; i++;
564     }
565  */
566static void
567unroll_loop_constant_iterations (struct loops *loops, struct loop *loop)
568{
569  unsigned HOST_WIDE_INT niter;
570  unsigned exit_mod;
571  sbitmap wont_exit;
572  unsigned n_remove_edges, i;
573  edge *remove_edges;
574  unsigned max_unroll = loop->lpt_decision.times;
575  struct loop_desc *desc = &loop->desc;
576  bool discard_inc = false;
577  bool is_bct;
578
579  niter = desc->niter;
580
581  if (niter <= (unsigned) max_unroll + 1)
582    abort ();  /* Should not get here (such loop should be peeled instead).  */
583
584  exit_mod = niter % (max_unroll + 1);
585
586  wont_exit = sbitmap_alloc (max_unroll + 1);
587  sbitmap_ones (wont_exit);
588
589  remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge));
590  n_remove_edges = 0;
591
592  /* For a loop ending with a branch and count for which the increment
593     of the count register will be discarded, adjust the initialization of
594     the count register.  */
595  if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
596      && (discard_inc = discard_increment (loop)))
597    {
598      rtx ini_var;
599
600      rtx init_code;
601      int n_peel, new_bct_value;
602
603      /* Get expression for number of iterations.  */
604      start_sequence ();
605
606      n_peel = (niter+1) % (max_unroll+1);
607      new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ;
608      ini_var = GEN_INT (new_bct_value);
609
610      emit_move_insn (desc->var, ini_var);
611      init_code = get_insns ();
612      end_sequence ();
613
614      loop_split_edge_with (loop_preheader_edge (loop), init_code);
615    }
616
617  if (desc->postincr)
618    {
619      /* Counter is incremented after the exit test; leave exit test
620	 in the first copy, so that the loops that start with test
621	 of exit condition have continuous body after unrolling.  */
622
623      if (rtl_dump_file)
624	fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n");
625
626      /* Peel exit_mod iterations.  */
627      RESET_BIT (wont_exit, 0);
628      if (desc->may_be_zero)
629	RESET_BIT (wont_exit, 1);
630
631      if (exit_mod
632	  && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
633		loops, exit_mod,
634		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
635		DLTHE_FLAG_UPDATE_FREQ))
636	abort ();
637
638      SET_BIT (wont_exit, 1);
639    }
640  else
641    {
642      /* Leave exit test in last copy, for the same reason as above if
643	 the loop tests the condition at the end of loop body.  */
644
645      if (rtl_dump_file)
646	fprintf (rtl_dump_file, ";; Condition on end of loop.\n");
647
648      /* We know that niter >= max_unroll + 2; so we do not need to care of
649	 case when we would exit before reaching the loop.  So just peel
650	 exit_mod + 1 iterations.
651	 */
652      if (exit_mod != (unsigned) max_unroll || desc->may_be_zero)
653	{
654	  RESET_BIT (wont_exit, 0);
655	  if (desc->may_be_zero)
656	    RESET_BIT (wont_exit, 1);
657
658	  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
659		loops, exit_mod + 1,
660		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
661		DLTHE_FLAG_UPDATE_FREQ))
662	    abort ();
663
664	  SET_BIT (wont_exit, 0);
665	  SET_BIT (wont_exit, 1);
666	}
667
668      RESET_BIT (wont_exit, max_unroll);
669    }
670
671  /* Now unroll the loop.  */
672  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
673		loops, max_unroll,
674		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
675		DLTHE_FLAG_UPDATE_FREQ))
676    abort ();
677
678  free (wont_exit);
679
680  /* Expand the branch and count.  */
681  if (is_bct)
682    for (i = 0; i < n_remove_edges; i++)
683      expand_bct (remove_edges[i], discard_inc);
684
685  /* Remove the edges.  */
686  for (i = 0; i < n_remove_edges; i++)
687    remove_path (loops, remove_edges[i]);
688  free (remove_edges);
689
690  if (rtl_dump_file)
691    fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop));
692}
693
694/* Decide whether to unroll LOOP iterating runtime computable number of times
695   and how much.  */
696static void
697decide_unroll_runtime_iterations (struct loop *loop, int flags)
698{
699  unsigned nunroll, nunroll_by_av, i;
700
701  if (!(flags & UAP_UNROLL))
702    {
703      /* We were not asked to, just return back silently.  */
704      return;
705    }
706
707  if (rtl_dump_file)
708    fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n");
709
710  /* nunroll = total number of copies of the original loop body in
711     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
712  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
713  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
714  if (nunroll > nunroll_by_av)
715    nunroll = nunroll_by_av;
716  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
717    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
718
719  /* Skip big loops.  */
720  if (nunroll <= 1)
721    {
722      if (rtl_dump_file)
723	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
724      return;
725    }
726
727  /* Check for simple loops.  */
728  if (!loop->has_desc)
729    {
730      loop->simple = simple_loop_p (loop, &loop->desc);
731      loop->has_desc = 1;
732    }
733
734  /* Check simpleness.  */
735  if (!loop->simple)
736    {
737      if (rtl_dump_file)
738	fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n");
739      return;
740    }
741
742  if (loop->desc.const_iter)
743    {
744      if (rtl_dump_file)
745	fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
746      return;
747    }
748
749  /* If we have profile feedback, check whether the loop rolls.  */
750  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
751    {
752      if (rtl_dump_file)
753	fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
754      return;
755    }
756
757  /* Success; now force nunroll to be power of 2, as we are unable to
758     cope with overflows in computation of number of iterations.  */
759  for (i = 1; 2 * i <= nunroll; i *= 2);
760
761  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
762  loop->lpt_decision.times = i - 1;
763}
764
765/* Unroll LOOP for that we are able to count number of iterations in runtime
766   LOOP->LPT_DECISION.TIMES + 1 times.  The transformation does this (with some
767   extra care for case n < 0):
768
769   for (i = 0; i < n; i++)
770     body;
771
772   ==>
773
774   i = 0;
775   mod = n % 4;
776
777   switch (mod)
778     {
779       case 3:
780         body; i++;
781       case 2:
782         body; i++;
783       case 1:
784         body; i++;
785       case 0: ;
786     }
787
788   while (i < n)
789     {
790       body; i++;
791       body; i++;
792       body; i++;
793       body; i++;
794     }
795   */
796static void
797unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop)
798{
799  rtx niter, init_code, branch_code, jump, label;
800  unsigned i, j, p;
801  basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch;
802  unsigned n_dom_bbs;
803  sbitmap wont_exit;
804  int may_exit_copy;
805  unsigned n_peel, n_remove_edges;
806  edge *remove_edges, e;
807  bool extra_zero_check, last_may_exit;
808  unsigned max_unroll = loop->lpt_decision.times;
809  struct loop_desc *desc = &loop->desc;
810  bool discard_inc = false;
811  bool is_bct;
812
813  /* Remember blocks whose dominators will have to be updated.  */
814  dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block));
815  n_dom_bbs = 0;
816
817  body = get_loop_body (loop);
818  for (i = 0; i < loop->num_nodes; i++)
819    {
820      unsigned nldom;
821      basic_block *ldom;
822
823      nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom);
824      for (j = 0; j < nldom; j++)
825	if (!flow_bb_inside_loop_p (loop, ldom[j]))
826	  dom_bbs[n_dom_bbs++] = ldom[j];
827
828      free (ldom);
829    }
830  free (body);
831
832  if (desc->postincr)
833    {
834      /* Leave exit in first copy (for explanation why see comment in
835	 unroll_loop_constant_iterations).  */
836      may_exit_copy = 0;
837      n_peel = max_unroll - 1;
838      extra_zero_check = true;
839      last_may_exit = false;
840    }
841  else
842    {
843      /* Leave exit in last copy (for explanation why see comment in
844	 unroll_loop_constant_iterations).  */
845      may_exit_copy = max_unroll;
846      n_peel = max_unroll;
847      extra_zero_check = false;
848      last_may_exit = true;
849    }
850
851  /* Get expression for number of iterations.  */
852  start_sequence ();
853  niter = count_loop_iterations (desc, NULL, NULL);
854  if (!niter)
855    abort ();
856  niter = force_operand (niter, NULL);
857
858  /* Count modulo by ANDing it with max_unroll; we use the fact that
859     the number of unrollings is a power of two, and thus this is correct
860     even if there is overflow in the computation.  */
861  niter = expand_simple_binop (GET_MODE (desc->var), AND,
862			       niter,
863			       GEN_INT (max_unroll),
864			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
865
866  /* For a loop ending with a branch and count for which the increment
867     of the count register will be discarded, adjust the initialization of
868     the count register.  */
869  if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src)))
870      && (discard_inc = discard_increment (loop)))
871    {
872      rtx count, count2, count_unroll_mod;
873      int count_unroll;
874
875      /* start_sequence (); */
876
877      count = count_loop_iterations (desc, NULL, NULL);
878
879      count_unroll = loop->lpt_decision.times+1;
880
881
882
883      count_unroll_mod =  GEN_INT (exact_log2 (count_unroll));
884      count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT,
885				  count, count_unroll_mod,
886				  0, 0, OPTAB_LIB_WIDEN);
887
888      count2 = expand_simple_binop (GET_MODE (desc->var), PLUS,
889				     count, GEN_INT (2),
890				     0, 0, OPTAB_LIB_WIDEN);
891
892      emit_move_insn (desc->var, count2);
893    }
894
895  init_code = get_insns ();
896  end_sequence ();
897
898  /* Precondition the loop.  */
899  loop_split_edge_with (loop_preheader_edge (loop), init_code);
900
901  remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge));
902  n_remove_edges = 0;
903
904  wont_exit = sbitmap_alloc (max_unroll + 2);
905
906  /* Peel the first copy of loop body (almost always we must leave exit test
907     here; the only exception is when we have extra zero check and the number
908     of iterations is reliable (i.e. comes out of NE condition).  Also record
909     the place of (possible) extra zero check.  */
910  sbitmap_zero (wont_exit);
911  if (extra_zero_check && desc->cond == NE)
912    SET_BIT (wont_exit, 1);
913  ezc_swtch = loop_preheader_edge (loop)->src;
914  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
915		loops, 1,
916		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
917		DLTHE_FLAG_UPDATE_FREQ))
918    abort ();
919
920  /* Record the place where switch will be built for preconditioning.  */
921  swtch = loop_split_edge_with (loop_preheader_edge (loop),
922				NULL_RTX);
923
924  for (i = 0; i < n_peel; i++)
925    {
926      /* Peel the copy.  */
927      sbitmap_zero (wont_exit);
928      if (i != n_peel - 1 || !last_may_exit)
929	SET_BIT (wont_exit, 1);
930      if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
931		loops, 1,
932		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
933		DLTHE_FLAG_UPDATE_FREQ))
934	abort ();
935
936      /* Create item for switch.  */
937      j = n_peel - i - (extra_zero_check ? 0 : 1);
938      p = REG_BR_PROB_BASE / (i + 2);
939
940      /* If modulo is zero do not jumo to the header of the unrolled loops.
941         Jump instead to the last branch and count that precedes it.  */
942      if (is_bct && discard_inc && (j == 0))
943	{
944	  basic_block lastbb = loop_preheader_edge(loop)->src;
945	  rtx split_after;
946
947          /* Skip dummy basic blocks generated during the unrolling.  */
948	  while (!is_bct_cond (BB_END (lastbb)))
949	    lastbb = lastbb->pred->src;
950
951	  split_after = PREV_INSN (BB_END (lastbb));
952
953	  preheader = split_loop_bb (lastbb , split_after)->dest;
954	}
955      else
956	preheader = loop_split_edge_with (loop_preheader_edge (loop),
957					  NULL_RTX);
958      label = block_label (preheader);
959      start_sequence ();
960      do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0,
961			       GET_MODE (desc->var), NULL_RTX, NULL_RTX,
962			       label);
963      jump = get_last_insn ();
964      JUMP_LABEL (jump) = label;
965      REG_NOTES (jump)
966	      = gen_rtx_EXPR_LIST (REG_BR_PROB,
967				   GEN_INT (p), REG_NOTES (jump));
968
969      LABEL_NUSES (label)++;
970      branch_code = get_insns ();
971      end_sequence ();
972
973      swtch = loop_split_edge_with (swtch->pred, branch_code);
974      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
975      swtch->succ->probability = REG_BR_PROB_BASE - p;
976      e = make_edge (swtch, preheader,
977		     swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
978      e->probability = p;
979    }
980
981  if (extra_zero_check)
982    {
983      /* Add branch for zero iterations.  */
984      p = REG_BR_PROB_BASE / (max_unroll + 1);
985      swtch = ezc_swtch;
986      preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
987      label = block_label (preheader);
988      start_sequence ();
989      do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0,
990			       GET_MODE (desc->var), NULL_RTX, NULL_RTX,
991			       label);
992      jump = get_last_insn ();
993      JUMP_LABEL (jump) = label;
994      REG_NOTES (jump)
995	      = gen_rtx_EXPR_LIST (REG_BR_PROB,
996				   GEN_INT (p), REG_NOTES (jump));
997
998      LABEL_NUSES (label)++;
999      branch_code = get_insns ();
1000      end_sequence ();
1001
1002      swtch = loop_split_edge_with (swtch->succ, branch_code);
1003      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1004      swtch->succ->probability = REG_BR_PROB_BASE - p;
1005      e = make_edge (swtch, preheader,
1006		     swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP);
1007      e->probability = p;
1008    }
1009
1010  /* Recount dominators for outer blocks.  */
1011  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs);
1012
1013  /* And unroll loop.  */
1014
1015  sbitmap_ones (wont_exit);
1016  RESET_BIT (wont_exit, may_exit_copy);
1017
1018  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1019		loops, max_unroll,
1020		wont_exit, desc->out_edge, remove_edges, &n_remove_edges,
1021		DLTHE_FLAG_UPDATE_FREQ))
1022    abort ();
1023
1024  free (wont_exit);
1025
1026  /* Expand the branch and count.  */
1027  if (is_bct)
1028    for (i = 0; i < n_remove_edges; i++)
1029      expand_bct (remove_edges[i], discard_inc);
1030
1031  /* Remove the edges.  */
1032  for (i = 0; i < n_remove_edges; i++)
1033    remove_path (loops, remove_edges[i]);
1034  free (remove_edges);
1035
1036  if (rtl_dump_file)
1037    fprintf (rtl_dump_file,
1038	     ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n",
1039	     max_unroll, num_loop_insns (loop));
1040}
1041
1042/* Decide whether to simply peel LOOP and how much.  */
1043static void
1044decide_peel_simple (struct loop *loop, int flags)
1045{
1046  unsigned npeel;
1047
1048  if (!(flags & UAP_PEEL))
1049    {
1050      /* We were not asked to, just return back silently.  */
1051      return;
1052    }
1053
1054  if (rtl_dump_file)
1055    fprintf (rtl_dump_file, ";; Considering simply peeling loop\n");
1056
1057  /* npeel = number of iterations to peel.  */
1058  npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1059  if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1060    npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1061
1062  /* Skip big loops.  */
1063  if (!npeel)
1064    {
1065      if (rtl_dump_file)
1066	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1067      return;
1068    }
1069
1070  /* Check for simple loops.  */
1071  if (!loop->has_desc)
1072    {
1073      loop->simple = simple_loop_p (loop, &loop->desc);
1074      loop->has_desc = 1;
1075    }
1076
1077  /* Check number of iterations.  */
1078  if (loop->simple && loop->desc.const_iter)
1079    {
1080      if (rtl_dump_file)
1081	fprintf (rtl_dump_file, ";; Loop iterates constant times\n");
1082      return;
1083    }
1084
1085  /* Do not simply peel loops with branches inside -- it increases number
1086     of mispredicts.  */
1087  if (loop->desc.n_branches > 1)
1088    {
1089      if (rtl_dump_file)
1090	fprintf (rtl_dump_file, ";; Not peeling, contains branches\n");
1091      return;
1092    }
1093
1094  if (loop->header->count)
1095    {
1096      unsigned niter = expected_loop_iterations (loop);
1097      if (niter + 1 > npeel)
1098	{
1099	  if (rtl_dump_file)
1100	    {
1101	      fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much (");
1102	      fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1));
1103	      fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel);
1104	    }
1105	  return;
1106	}
1107      npeel = niter + 1;
1108    }
1109  else
1110    {
1111      /* For now we have no good heuristics to decide whether loop peeling
1112         will be effective, so disable it.  */
1113      if (rtl_dump_file)
1114	fprintf (rtl_dump_file,
1115		 ";; Not peeling loop, no evidence it will be profitable\n");
1116      return;
1117    }
1118
1119  /* Success.  */
1120  loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1121  loop->lpt_decision.times = npeel;
1122}
1123
1124/* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1125   while (cond)
1126     body;
1127
1128   ==>
1129
1130   if (!cond) goto end;
1131   body;
1132   if (!cond) goto end;
1133   body;
1134   while (cond)
1135     body;
1136   end: ;
1137   */
1138static void
1139peel_loop_simple (struct loops *loops, struct loop *loop)
1140{
1141  sbitmap wont_exit;
1142  unsigned npeel = loop->lpt_decision.times;
1143
1144  wont_exit = sbitmap_alloc (npeel + 1);
1145  sbitmap_zero (wont_exit);
1146
1147  if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1148		loops, npeel, wont_exit, NULL, NULL, NULL,
1149		DLTHE_FLAG_UPDATE_FREQ))
1150    abort ();
1151
1152  free (wont_exit);
1153
1154  if (rtl_dump_file)
1155    fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel);
1156}
1157
1158/* Decide whether to unroll LOOP stupidly and how much.  */
1159static void
1160decide_unroll_stupid (struct loop *loop, int flags)
1161{
1162  unsigned nunroll, nunroll_by_av, i;
1163
1164  if (!(flags & UAP_UNROLL_ALL))
1165    {
1166      /* We were not asked to, just return back silently.  */
1167      return;
1168    }
1169
1170  if (rtl_dump_file)
1171    fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n");
1172
1173  /* nunroll = total number of copies of the original loop body in
1174     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1175  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1176  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1177  if (nunroll > nunroll_by_av)
1178    nunroll = nunroll_by_av;
1179  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1180    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1181
1182  /* Skip big loops.  */
1183  if (nunroll <= 1)
1184    {
1185      if (rtl_dump_file)
1186	fprintf (rtl_dump_file, ";; Not considering loop, is too big\n");
1187      return;
1188    }
1189
1190  /* Check for simple loops.  */
1191  if (!loop->has_desc)
1192    {
1193      loop->simple = simple_loop_p (loop, &loop->desc);
1194      loop->has_desc = 1;
1195    }
1196
1197  /* Check simpleness.  */
1198  if (loop->simple)
1199    {
1200      if (rtl_dump_file)
1201	fprintf (rtl_dump_file, ";; The loop is simple\n");
1202      return;
1203    }
1204
1205  /* Do not unroll loops with branches inside -- it increases number
1206     of mispredicts.  */
1207  if (loop->desc.n_branches > 1)
1208    {
1209      if (rtl_dump_file)
1210	fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n");
1211      return;
1212    }
1213
1214  /* If we have profile feedback, check whether the loop rolls.  */
1215  if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll)
1216    {
1217      if (rtl_dump_file)
1218	fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n");
1219      return;
1220    }
1221
1222  /* Success.  Now force nunroll to be power of 2, as it seems that this
1223     improves results (partially because of better alignments, partially
1224     because of some dark magic).  */
1225  for (i = 1; 2 * i <= nunroll; i *= 2);
1226
1227  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1228  loop->lpt_decision.times = i - 1;
1229}
1230
1231/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation:
1232   while (cond)
1233     body;
1234
1235   ==>
1236
1237   while (cond)
1238     {
1239       body;
1240       if (!cond) break;
1241       body;
1242       if (!cond) break;
1243       body;
1244       if (!cond) break;
1245       body;
1246     }
1247   */
1248static void
1249unroll_loop_stupid (struct loops *loops, struct loop *loop)
1250{
1251  sbitmap wont_exit;
1252  unsigned nunroll = loop->lpt_decision.times;
1253
1254  wont_exit = sbitmap_alloc (nunroll + 1);
1255  sbitmap_zero (wont_exit);
1256
1257  if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1258		loops, nunroll, wont_exit, NULL, NULL, NULL,
1259		DLTHE_FLAG_UPDATE_FREQ))
1260    abort ();
1261
1262  free (wont_exit);
1263
1264  if (rtl_dump_file)
1265    fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n",
1266	     nunroll, num_loop_insns (loop));
1267}
1268
1269/* Expand a bct instruction in a branch and an increment.
1270   If flag_inc is set, the induction variable does not need to be
1271   incremented.  */
1272
1273static void
1274expand_bct (edge e, int flag_inc)
1275{
1276  rtx bct_insn = BB_END (e->src);
1277  rtx cmp;
1278  rtx inc;
1279  rtx seq;
1280
1281  rtx tgt;
1282  rtx condition;
1283  rtx labelref;
1284  rtx reg;
1285  rtx pattern = PATTERN (bct_insn);
1286
1287  if (!is_bct_cond (bct_insn))
1288    return;
1289
1290  inc = get_var_set_from_bct (bct_insn);
1291  cmp = XVECEXP (pattern, 0, 0);
1292  reg = SET_DEST (inc);
1293
1294  start_sequence ();
1295  if (!flag_inc)
1296    {
1297      tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0));
1298      if (tgt != XEXP (inc, 0))
1299	emit_move_insn (XEXP (inc, 0), tgt);
1300    }
1301
1302  condition = XEXP (SET_SRC (cmp), 0);
1303  labelref = XEXP (SET_SRC (cmp), 1);
1304
1305  do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1),
1306			   GET_CODE (condition), 0,
1307			   GET_MODE (reg), NULL_RTX, NULL_RTX,
1308			   XEXP (labelref, 0));
1309  seq = get_insns ();
1310  end_sequence ();
1311  emit_insn_after (seq, bct_insn);
1312
1313  delete_insn (bct_insn);
1314
1315  return;
1316}
1317
1318/* Check that the increment of the count register can be discarded.  */
1319bool
1320discard_increment (struct loop *loop)
1321{
1322  struct loop_desc *desc = &loop->desc;
1323  rtx inc, set_src, reg;
1324  rtx bct_insn;
1325  unsigned int i;
1326  basic_block *body;
1327
1328  bct_insn = BB_END (desc->out_edge->src);
1329  if (!is_bct_cond (bct_insn))
1330    abort();
1331
1332  inc = get_var_set_from_bct (bct_insn);
1333
1334  /* Check that inc is of the form reg = reg - 1.  */
1335  reg = SET_DEST (inc);
1336  set_src = SET_SRC (inc);
1337
1338  if (GET_CODE (set_src) != PLUS)
1339    return false;
1340
1341  if (!rtx_equal_p (XEXP (set_src, 0), reg))
1342    return false;
1343
1344  if (!CONSTANT_P (XEXP (set_src, 1)))
1345     return false;
1346
1347  if (INTVAL (XEXP (set_src, 1)) != -1)
1348     return false;
1349
1350  /* We need to check that the register has no other uses beside the branch and
1351     count.  */
1352  body = get_loop_body (loop);
1353  for(i=0; i < loop->num_nodes; i++)
1354    {
1355      if (reg_mentioned_p (desc->var, BB_HEAD (body[i])))
1356	  return false;
1357
1358      if (body[i] != desc->out_edge->src)
1359	if (reg_mentioned_p (desc->var, BB_END (body[i])))
1360	  return false;
1361
1362      if (reg_used_between_p (desc->var, BB_HEAD (body[i]), BB_END (body[i])))
1363	  return false;
1364    }
1365
1366  /* Check that the branch and count ends the latch.  */
1367  if (desc->out_edge->src != loop->latch)
1368    {
1369      rtx insn;
1370
1371      /* Latch is a dummy block generated by loop-init.  */
1372      if (BRANCH_EDGE(desc->out_edge->src)->dest != loop->latch)
1373	  return false;
1374
1375      for (insn = BB_HEAD (loop->latch); insn != NEXT_INSN (BB_END (loop->latch));
1376	   insn = NEXT_INSN (insn))
1377        if (INSN_P (insn)) return false;
1378    }
1379
1380  return true;
1381}
1382
1383