1/* Perform doloop optimizations
2   Copyright (C) 2004, 2005, 2006 Free Software Foundation, Inc.
3   Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free
9Software Foundation; either version 2, or (at your option) any later
10version.
11
12GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17You should have received a copy of the GNU General Public License
18along with GCC; see the file COPYING.  If not, write to the Free
19Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
2002110-1301, USA.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "tm.h"
26#include "rtl.h"
27#include "flags.h"
28#include "expr.h"
29#include "hard-reg-set.h"
30#include "basic-block.h"
31#include "toplev.h"
32#include "tm_p.h"
33#include "cfgloop.h"
34#include "output.h"
35#include "params.h"
36#include "target.h"
37
38/* This module is used to modify loops with a determinable number of
39   iterations to use special low-overhead looping instructions.
40
41   It first validates whether the loop is well behaved and has a
42   determinable number of iterations (either at compile or run-time).
43   It then modifies the loop to use a low-overhead looping pattern as
44   follows:
45
46   1. A pseudo register is allocated as the loop iteration counter.
47
48   2. The number of loop iterations is calculated and is stored
49      in the loop counter.
50
51   3. At the end of the loop, the jump insn is replaced by the
52      doloop_end pattern.  The compare must remain because it might be
53      used elsewhere.  If the loop-variable or condition register are
54      used elsewhere, they will be eliminated by flow.
55
56   4. An optional doloop_begin pattern is inserted at the top of the
57      loop.
58
59   TODO The optimization should only performed when either the biv used for exit
60   condition is unused at all except for the exit test, or if we do not have to
61   change its value, since otherwise we have to add a new induction variable,
62   which usually will not pay up (unless the cost of the doloop pattern is
63   somehow extremely lower than the cost of compare & jump, or unless the bct
64   register cannot be used for anything else but doloop -- ??? detect these
65   cases).  */
66
67#ifdef HAVE_doloop_end
68
69/* Return the loop termination condition for PATTERN or zero
70   if it is not a decrement and branch jump insn.  */
71
72rtx
73doloop_condition_get (rtx pattern)
74{
75  rtx cmp;
76  rtx inc;
77  rtx reg;
78  rtx inc_src;
79  rtx condition;
80
81  /* The canonical doloop pattern we expect is:
82
83     (parallel [(set (pc) (if_then_else (condition)
84                                        (label_ref (label))
85                                        (pc)))
86                (set (reg) (plus (reg) (const_int -1)))
87                (additional clobbers and uses)])
88
89     Some targets (IA-64) wrap the set of the loop counter in
90     an if_then_else too.
91
92     In summary, the branch must be the first entry of the
93     parallel (also required by jump.c), and the second
94     entry of the parallel must be a set of the loop counter
95     register.  */
96
97  if (GET_CODE (pattern) != PARALLEL)
98    return 0;
99
100  cmp = XVECEXP (pattern, 0, 0);
101  inc = XVECEXP (pattern, 0, 1);
102
103  /* Check for (set (reg) (something)).  */
104  if (GET_CODE (inc) != SET)
105    return 0;
106  reg = SET_DEST (inc);
107  if (! REG_P (reg))
108    return 0;
109
110  /* Check if something = (plus (reg) (const_int -1)).
111     On IA-64, this decrement is wrapped in an if_then_else.  */
112  inc_src = SET_SRC (inc);
113  if (GET_CODE (inc_src) == IF_THEN_ELSE)
114    inc_src = XEXP (inc_src, 1);
115  if (GET_CODE (inc_src) != PLUS
116      || XEXP (inc_src, 0) != reg
117      || XEXP (inc_src, 1) != constm1_rtx)
118    return 0;
119
120  /* Check for (set (pc) (if_then_else (condition)
121                                       (label_ref (label))
122                                       (pc))).  */
123  if (GET_CODE (cmp) != SET
124      || SET_DEST (cmp) != pc_rtx
125      || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
126      || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
127      || XEXP (SET_SRC (cmp), 2) != pc_rtx)
128    return 0;
129
130  /* Extract loop termination condition.  */
131  condition = XEXP (SET_SRC (cmp), 0);
132
133  /* We expect a GE or NE comparison with 0 or 1.  */
134  if ((GET_CODE (condition) != GE
135       && GET_CODE (condition) != NE)
136      || (XEXP (condition, 1) != const0_rtx
137          && XEXP (condition, 1) != const1_rtx))
138    return 0;
139
140  if ((XEXP (condition, 0) == reg)
141      || (GET_CODE (XEXP (condition, 0)) == PLUS
142		   && XEXP (XEXP (condition, 0), 0) == reg))
143    return condition;
144
145  /* ??? If a machine uses a funny comparison, we could return a
146     canonicalized form here.  */
147
148  return 0;
149}
150
151/* Return nonzero if the loop specified by LOOP is suitable for
152   the use of special low-overhead looping instructions.  DESC
153   describes the number of iterations of the loop.  */
154
155static bool
156doloop_valid_p (struct loop *loop, struct niter_desc *desc)
157{
158  basic_block *body = get_loop_body (loop), bb;
159  rtx insn;
160  unsigned i;
161  bool result = true;
162
163  /* Check for loops that may not terminate under special conditions.  */
164  if (!desc->simple_p
165      || desc->assumptions
166      || desc->infinite)
167    {
168      /* There are some cases that would require a special attention.
169	 For example if the comparison is LEU and the comparison value
170	 is UINT_MAX then the loop will not terminate.  Similarly, if the
171	 comparison code is GEU and the comparison value is 0, the
172	 loop will not terminate.
173
174	 If the absolute increment is not 1, the loop can be infinite
175	 even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
176
177	 ??? We could compute these conditions at run-time and have a
178	 additional jump around the loop to ensure an infinite loop.
179	 However, it is very unlikely that this is the intended
180	 behavior of the loop and checking for these rare boundary
181	 conditions would pessimize all other code.
182
183	 If the loop is executed only a few times an extra check to
184	 restart the loop could use up most of the benefits of using a
185	 count register loop.  Note however, that normally, this
186	 restart branch would never execute, so it could be predicted
187	 well by the CPU.  We should generate the pessimistic code by
188	 default, and have an option, e.g. -funsafe-loops that would
189	 enable count-register loops in this case.  */
190      if (dump_file)
191	fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
192      result = false;
193      goto cleanup;
194    }
195
196  for (i = 0; i < loop->num_nodes; i++)
197    {
198      bb = body[i];
199
200      for (insn = BB_HEAD (bb);
201	   insn != NEXT_INSN (BB_END (bb));
202	   insn = NEXT_INSN (insn))
203	{
204	  /* Different targets have different necessities for low-overhead
205	     looping.  Call the back end for each instruction within the loop
206	     to let it decide whether the insn prohibits a low-overhead loop.
207	     It will then return the cause for it to emit to the dump file.  */
208	  const char * invalid = targetm.invalid_within_doloop (insn);
209	  if (invalid)
210	    {
211	      if (dump_file)
212		fprintf (dump_file, "Doloop: %s\n", invalid);
213	      result = false;
214	      goto cleanup;
215	    }
216	}
217    }
218  result = true;
219
220cleanup:
221  free (body);
222
223  return result;
224}
225
226/* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
227   edge.  If the condition is always false, do not do anything.  If it is always
228   true, redirect E to DEST and return false.  In all other cases, true is
229   returned.  */
230
231static bool
232add_test (rtx cond, edge *e, basic_block dest)
233{
234  rtx seq, jump, label;
235  enum machine_mode mode;
236  rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
237  enum rtx_code code = GET_CODE (cond);
238  basic_block bb;
239
240  mode = GET_MODE (XEXP (cond, 0));
241  if (mode == VOIDmode)
242    mode = GET_MODE (XEXP (cond, 1));
243
244  start_sequence ();
245  op0 = force_operand (op0, NULL_RTX);
246  op1 = force_operand (op1, NULL_RTX);
247  label = block_label (dest);
248  do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, NULL_RTX, label);
249
250  jump = get_last_insn ();
251  if (!JUMP_P (jump))
252    {
253      /* The condition is always false and the jump was optimized out.  */
254      end_sequence ();
255      return true;
256    }
257
258  seq = get_insns ();
259  end_sequence ();
260  bb = loop_split_edge_with (*e, seq);
261  *e = single_succ_edge (bb);
262
263  if (any_uncondjump_p (jump))
264    {
265      /* The condition is always true.  */
266      delete_insn (jump);
267      redirect_edge_and_branch_force (*e, dest);
268      return false;
269    }
270
271  JUMP_LABEL (jump) = label;
272
273  /* The jump is supposed to handle an unlikely special case.  */
274  REG_NOTES (jump)
275	  = gen_rtx_EXPR_LIST (REG_BR_PROB,
276			       const0_rtx, REG_NOTES (jump));
277  LABEL_NUSES (label)++;
278
279  make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
280  return true;
281}
282
283/* Modify the loop to use the low-overhead looping insn where LOOP
284   describes the loop, DESC describes the number of iterations of the
285   loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
286   end of the loop.  CONDITION is the condition separated from the
287   DOLOOP_SEQ.  COUNT is the number of iterations of the LOOP.  */
288
289static void
290doloop_modify (struct loop *loop, struct niter_desc *desc,
291	       rtx doloop_seq, rtx condition, rtx count)
292{
293  rtx counter_reg;
294  rtx tmp, noloop = NULL_RTX;
295  rtx sequence;
296  rtx jump_insn;
297  rtx jump_label;
298  int nonneg = 0;
299  bool increment_count;
300  basic_block loop_end = desc->out_edge->src;
301  enum machine_mode mode;
302
303  jump_insn = BB_END (loop_end);
304
305  if (dump_file)
306    {
307      fprintf (dump_file, "Doloop: Inserting doloop pattern (");
308      if (desc->const_iter)
309	fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
310      else
311	fputs ("runtime", dump_file);
312      fputs (" iterations).\n", dump_file);
313    }
314
315  /* Discard original jump to continue loop.  The original compare
316     result may still be live, so it cannot be discarded explicitly.  */
317  delete_insn (jump_insn);
318
319  counter_reg = XEXP (condition, 0);
320  if (GET_CODE (counter_reg) == PLUS)
321    counter_reg = XEXP (counter_reg, 0);
322  mode = GET_MODE (counter_reg);
323
324  increment_count = false;
325  switch (GET_CODE (condition))
326    {
327    case NE:
328      /* Currently only NE tests against zero and one are supported.  */
329      noloop = XEXP (condition, 1);
330      if (noloop != const0_rtx)
331	{
332	  gcc_assert (noloop == const1_rtx);
333	  increment_count = true;
334	}
335      break;
336
337    case GE:
338      /* Currently only GE tests against zero are supported.  */
339      gcc_assert (XEXP (condition, 1) == const0_rtx);
340
341      noloop = constm1_rtx;
342
343      /* The iteration count does not need incrementing for a GE test.  */
344      increment_count = false;
345
346      /* Determine if the iteration counter will be non-negative.
347	 Note that the maximum value loaded is iterations_max - 1.  */
348      if (desc->niter_max
349	  <= ((unsigned HOST_WIDEST_INT) 1
350	      << (GET_MODE_BITSIZE (mode) - 1)))
351	nonneg = 1;
352      break;
353
354      /* Abort if an invalid doloop pattern has been generated.  */
355    default:
356      gcc_unreachable ();
357    }
358
359  if (increment_count)
360    count = simplify_gen_binary (PLUS, mode, count, const1_rtx);
361
362  /* Insert initialization of the count register into the loop header.  */
363  start_sequence ();
364  tmp = force_operand (count, counter_reg);
365  convert_move (counter_reg, tmp, 1);
366  sequence = get_insns ();
367  end_sequence ();
368  emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
369
370  if (desc->noloop_assumptions)
371    {
372      rtx ass = copy_rtx (desc->noloop_assumptions);
373      basic_block preheader = loop_preheader_edge (loop)->src;
374      basic_block set_zero
375	      = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
376      basic_block new_preheader
377	      = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX);
378      edge te;
379
380      /* Expand the condition testing the assumptions and if it does not pass,
381	 reset the count register to 0.  */
382      redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
383      set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
384
385      set_zero->count = 0;
386      set_zero->frequency = 0;
387
388      te = single_succ_edge (preheader);
389      for (; ass; ass = XEXP (ass, 1))
390	if (!add_test (XEXP (ass, 0), &te, set_zero))
391	  break;
392
393      if (ass)
394	{
395	  /* We reached a condition that is always true.  This is very hard to
396	     reproduce (such a loop does not roll, and thus it would most
397	     likely get optimized out by some of the preceding optimizations).
398	     In fact, I do not have any testcase for it.  However, it would
399	     also be very hard to show that it is impossible, so we must
400	     handle this case.  */
401	  set_zero->count = preheader->count;
402	  set_zero->frequency = preheader->frequency;
403	}
404
405      if (EDGE_COUNT (set_zero->preds) == 0)
406	{
407	  /* All the conditions were simplified to false, remove the
408	     unreachable set_zero block.  */
409	  remove_bb_from_loops (set_zero);
410	  delete_basic_block (set_zero);
411	}
412      else
413	{
414	  /* Reset the counter to zero in the set_zero block.  */
415	  start_sequence ();
416	  convert_move (counter_reg, noloop, 0);
417	  sequence = get_insns ();
418	  end_sequence ();
419	  emit_insn_after (sequence, BB_END (set_zero));
420
421	  set_immediate_dominator (CDI_DOMINATORS, set_zero,
422				   recount_dominator (CDI_DOMINATORS,
423						      set_zero));
424	}
425
426      set_immediate_dominator (CDI_DOMINATORS, new_preheader,
427			       recount_dominator (CDI_DOMINATORS,
428						  new_preheader));
429    }
430
431  /* Some targets (eg, C4x) need to initialize special looping
432     registers.  */
433#ifdef HAVE_doloop_begin
434  {
435    rtx init;
436    unsigned level = get_loop_level (loop) + 1;
437    init = gen_doloop_begin (counter_reg,
438			     desc->const_iter ? desc->niter_expr : const0_rtx,
439			     GEN_INT (desc->niter_max),
440			     GEN_INT (level));
441    if (init)
442      {
443	start_sequence ();
444	emit_insn (init);
445	sequence = get_insns ();
446	end_sequence ();
447	emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
448      }
449  }
450#endif
451
452  /* Insert the new low-overhead looping insn.  */
453  emit_jump_insn_after (doloop_seq, BB_END (loop_end));
454  jump_insn = BB_END (loop_end);
455  jump_label = block_label (desc->in_edge->dest);
456  JUMP_LABEL (jump_insn) = jump_label;
457  LABEL_NUSES (jump_label)++;
458
459  /* Ensure the right fallthru edge is marked, for case we have reversed
460     the condition.  */
461  desc->in_edge->flags &= ~EDGE_FALLTHRU;
462  desc->out_edge->flags |= EDGE_FALLTHRU;
463
464  /* Add a REG_NONNEG note if the actual or estimated maximum number
465     of iterations is non-negative.  */
466  if (nonneg)
467    {
468      REG_NOTES (jump_insn)
469	= gen_rtx_EXPR_LIST (REG_NONNEG, NULL_RTX, REG_NOTES (jump_insn));
470    }
471}
472
473/* Process loop described by LOOP validating that the loop is suitable for
474   conversion to use a low overhead looping instruction, replacing the jump
475   insn where suitable.  Returns true if the loop was successfully
476   modified.  */
477
478static bool
479doloop_optimize (struct loop *loop)
480{
481  enum machine_mode mode;
482  rtx doloop_seq, doloop_pat, doloop_reg;
483  rtx iterations, count;
484  rtx iterations_max;
485  rtx start_label;
486  rtx condition;
487  unsigned level, est_niter;
488  int max_cost;
489  struct niter_desc *desc;
490  unsigned word_mode_size;
491  unsigned HOST_WIDE_INT word_mode_max;
492
493  if (dump_file)
494    fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
495
496  iv_analysis_loop_init (loop);
497
498  /* Find the simple exit of a LOOP.  */
499  desc = get_simple_loop_desc (loop);
500
501  /* Check that loop is a candidate for a low-overhead looping insn.  */
502  if (!doloop_valid_p (loop, desc))
503    {
504      if (dump_file)
505	fprintf (dump_file,
506		 "Doloop: The loop is not suitable.\n");
507      return false;
508    }
509  mode = desc->mode;
510
511  est_niter = 3;
512  if (desc->const_iter)
513    est_niter = desc->niter;
514  /* If the estimate on number of iterations is reliable (comes from profile
515     feedback), use it.  Do not use it normally, since the expected number
516     of iterations of an unrolled loop is 2.  */
517  if (loop->header->count)
518    est_niter = expected_loop_iterations (loop);
519
520  if (est_niter < 3)
521    {
522      if (dump_file)
523	fprintf (dump_file,
524		 "Doloop: Too few iterations (%u) to be profitable.\n",
525		 est_niter);
526      return false;
527    }
528
529  max_cost
530    = COSTS_N_INSNS (PARAM_VALUE (PARAM_MAX_ITERATIONS_COMPUTATION_COST));
531  if (rtx_cost (desc->niter_expr, SET) > max_cost)
532    {
533      if (dump_file)
534	fprintf (dump_file,
535		 "Doloop: number of iterations too costly to compute.\n");
536      return false;
537    }
538
539  count = copy_rtx (desc->niter_expr);
540  iterations = desc->const_iter ? desc->niter_expr : const0_rtx;
541  iterations_max = GEN_INT (desc->niter_max);
542  level = get_loop_level (loop) + 1;
543
544  /* Generate looping insn.  If the pattern FAILs then give up trying
545     to modify the loop since there is some aspect the back-end does
546     not like.  */
547  start_label = block_label (desc->in_edge->dest);
548  doloop_reg = gen_reg_rtx (mode);
549  doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max,
550			       GEN_INT (level), start_label);
551
552  word_mode_size = GET_MODE_BITSIZE (word_mode);
553  word_mode_max
554	  = ((unsigned HOST_WIDE_INT) 1 << (word_mode_size - 1) << 1) - 1;
555  if (! doloop_seq
556      && mode != word_mode
557      /* Before trying mode different from the one in that # of iterations is
558	 computed, we must be sure that the number of iterations fits into
559	 the new mode.  */
560      && (word_mode_size >= GET_MODE_BITSIZE (mode)
561	  || desc->niter_max <= word_mode_max))
562    {
563      if (word_mode_size > GET_MODE_BITSIZE (mode))
564	{
565	  count = simplify_gen_unary (ZERO_EXTEND, word_mode,
566				      count, mode);
567	  iterations = simplify_gen_unary (ZERO_EXTEND, word_mode,
568					   iterations, mode);
569	  iterations_max = simplify_gen_unary (ZERO_EXTEND, word_mode,
570					       iterations_max, mode);
571	}
572      else
573	{
574	  count = lowpart_subreg (word_mode, count, mode);
575	  iterations = lowpart_subreg (word_mode, iterations, mode);
576	  iterations_max = lowpart_subreg (word_mode, iterations_max, mode);
577	}
578      PUT_MODE (doloop_reg, word_mode);
579      doloop_seq = gen_doloop_end (doloop_reg, iterations, iterations_max,
580				   GEN_INT (level), start_label);
581    }
582  if (! doloop_seq)
583    {
584      if (dump_file)
585	fprintf (dump_file,
586		 "Doloop: Target unwilling to use doloop pattern!\n");
587      return false;
588    }
589
590  /* If multiple instructions were created, the last must be the
591     jump instruction.  Also, a raw define_insn may yield a plain
592     pattern.  */
593  doloop_pat = doloop_seq;
594  if (INSN_P (doloop_pat))
595    {
596      while (NEXT_INSN (doloop_pat) != NULL_RTX)
597	doloop_pat = NEXT_INSN (doloop_pat);
598      if (JUMP_P (doloop_pat))
599	doloop_pat = PATTERN (doloop_pat);
600      else
601	doloop_pat = NULL_RTX;
602    }
603
604  if (! doloop_pat
605      || ! (condition = doloop_condition_get (doloop_pat)))
606    {
607      if (dump_file)
608	fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
609      return false;
610    }
611
612  doloop_modify (loop, desc, doloop_seq, condition, count);
613  return true;
614}
615
616/* This is the main entry point.  Process all LOOPS using doloop_optimize.  */
617
618void
619doloop_optimize_loops (struct loops *loops)
620{
621  unsigned i;
622  struct loop *loop;
623
624  for (i = 1; i < loops->num; i++)
625    {
626      loop = loops->parray[i];
627      if (!loop)
628	continue;
629
630      doloop_optimize (loop);
631    }
632
633  iv_analysis_done ();
634
635#ifdef ENABLE_CHECKING
636  verify_dominators (CDI_DOMINATORS);
637  verify_loop_structure (loops);
638#endif
639}
640#endif /* HAVE_doloop_end */
641
642