1/* Global, SSA-based optimizations using mathematical identities.
2   Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010
3   Free Software Foundation, Inc.
4
5This file is part of GCC.
6
7GCC is free software; you can redistribute it and/or modify it
8under the terms of the GNU General Public License as published by the
9Free Software Foundation; either version 3, or (at your option) any
10later version.
11
12GCC is distributed in the hope that it will be useful, but WITHOUT
13ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
15for more details.
16
17You should have received a copy of the GNU General Public License
18along with GCC; see the file COPYING3.  If not see
19<http://www.gnu.org/licenses/>.  */
20
21/* Currently, the only mini-pass in this file tries to CSE reciprocal
22   operations.  These are common in sequences such as this one:
23
24	modulus = sqrt(x*x + y*y + z*z);
25	x = x / modulus;
26	y = y / modulus;
27	z = z / modulus;
28
29   that can be optimized to
30
31	modulus = sqrt(x*x + y*y + z*z);
32        rmodulus = 1.0 / modulus;
33	x = x * rmodulus;
34	y = y * rmodulus;
35	z = z * rmodulus;
36
37   We do this for loop invariant divisors, and with this pass whenever
38   we notice that a division has the same divisor multiple times.
39
40   Of course, like in PRE, we don't insert a division if a dominator
41   already has one.  However, this cannot be done as an extension of
42   PRE for several reasons.
43
44   First of all, with some experiments it was found out that the
45   transformation is not always useful if there are only two divisions
46   hy the same divisor.  This is probably because modern processors
47   can pipeline the divisions; on older, in-order processors it should
48   still be effective to optimize two divisions by the same number.
49   We make this a param, and it shall be called N in the remainder of
50   this comment.
51
52   Second, if trapping math is active, we have less freedom on where
53   to insert divisions: we can only do so in basic blocks that already
54   contain one.  (If divisions don't trap, instead, we can insert
55   divisions elsewhere, which will be in blocks that are common dominators
56   of those that have the division).
57
58   We really don't want to compute the reciprocal unless a division will
59   be found.  To do this, we won't insert the division in a basic block
60   that has less than N divisions *post-dominating* it.
61
62   The algorithm constructs a subset of the dominator tree, holding the
63   blocks containing the divisions and the common dominators to them,
64   and walk it twice.  The first walk is in post-order, and it annotates
65   each block with the number of divisions that post-dominate it: this
66   gives information on where divisions can be inserted profitably.
67   The second walk is in pre-order, and it inserts divisions as explained
68   above, and replaces divisions by multiplications.
69
70   In the best case, the cost of the pass is O(n_statements).  In the
71   worst-case, the cost is due to creating the dominator tree subset,
72   with a cost of O(n_basic_blocks ^ 2); however this can only happen
73   for n_statements / n_basic_blocks statements.  So, the amortized cost
74   of creating the dominator tree subset is O(n_basic_blocks) and the
75   worst-case cost of the pass is O(n_statements * n_basic_blocks).
76
77   More practically, the cost will be small because there are few
78   divisions, and they tend to be in the same basic block, so insert_bb
79   is called very few times.
80
81   If we did this using domwalk.c, an efficient implementation would have
82   to work on all the variables in a single pass, because we could not
83   work on just a subset of the dominator tree, as we do now, and the
84   cost would also be something like O(n_statements * n_basic_blocks).
85   The data structures would be more complex in order to work on all the
86   variables in a single pass.  */
87
88#include "config.h"
89#include "system.h"
90#include "coretypes.h"
91#include "tm.h"
92#include "flags.h"
93#include "tree.h"
94#include "tree-flow.h"
95#include "real.h"
96#include "timevar.h"
97#include "tree-pass.h"
98#include "alloc-pool.h"
99#include "basic-block.h"
100#include "target.h"
101#include "diagnostic.h"
102#include "rtl.h"
103#include "expr.h"
104#include "optabs.h"
105
106/* This structure represents one basic block that either computes a
107   division, or is a common dominator for basic block that compute a
108   division.  */
109struct occurrence {
110  /* The basic block represented by this structure.  */
111  basic_block bb;
112
113  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
114     inserted in BB.  */
115  tree recip_def;
116
117  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
118     was inserted in BB.  */
119  gimple recip_def_stmt;
120
121  /* Pointer to a list of "struct occurrence"s for blocks dominated
122     by BB.  */
123  struct occurrence *children;
124
125  /* Pointer to the next "struct occurrence"s in the list of blocks
126     sharing a common dominator.  */
127  struct occurrence *next;
128
129  /* The number of divisions that are in BB before compute_merit.  The
130     number of divisions that are in BB or post-dominate it after
131     compute_merit.  */
132  int num_divisions;
133
134  /* True if the basic block has a division, false if it is a common
135     dominator for basic blocks that do.  If it is false and trapping
136     math is active, BB is not a candidate for inserting a reciprocal.  */
137  bool bb_has_division;
138};
139
140
141/* The instance of "struct occurrence" representing the highest
142   interesting block in the dominator tree.  */
143static struct occurrence *occ_head;
144
145/* Allocation pool for getting instances of "struct occurrence".  */
146static alloc_pool occ_pool;
147
148
149
150/* Allocate and return a new struct occurrence for basic block BB, and
151   whose children list is headed by CHILDREN.  */
152static struct occurrence *
153occ_new (basic_block bb, struct occurrence *children)
154{
155  struct occurrence *occ;
156
157  bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
158  memset (occ, 0, sizeof (struct occurrence));
159
160  occ->bb = bb;
161  occ->children = children;
162  return occ;
163}
164
165
166/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
167   list of "struct occurrence"s, one per basic block, having IDOM as
168   their common dominator.
169
170   We try to insert NEW_OCC as deep as possible in the tree, and we also
171   insert any other block that is a common dominator for BB and one
172   block already in the tree.  */
173
174static void
175insert_bb (struct occurrence *new_occ, basic_block idom,
176	   struct occurrence **p_head)
177{
178  struct occurrence *occ, **p_occ;
179
180  for (p_occ = p_head; (occ = *p_occ) != NULL; )
181    {
182      basic_block bb = new_occ->bb, occ_bb = occ->bb;
183      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
184      if (dom == bb)
185	{
186	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
187	     from its list.  */
188	  *p_occ = occ->next;
189	  occ->next = new_occ->children;
190	  new_occ->children = occ;
191
192	  /* Try the next block (it may as well be dominated by BB).  */
193	}
194
195      else if (dom == occ_bb)
196	{
197	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
198	  insert_bb (new_occ, dom, &occ->children);
199	  return;
200	}
201
202      else if (dom != idom)
203	{
204	  gcc_assert (!dom->aux);
205
206	  /* There is a dominator between IDOM and BB, add it and make
207	     two children out of NEW_OCC and OCC.  First, remove OCC from
208	     its list.	*/
209	  *p_occ = occ->next;
210	  new_occ->next = occ;
211	  occ->next = NULL;
212
213	  /* None of the previous blocks has DOM as a dominator: if we tail
214	     recursed, we would reexamine them uselessly. Just switch BB with
215	     DOM, and go on looking for blocks dominated by DOM.  */
216          new_occ = occ_new (dom, new_occ);
217	}
218
219      else
220	{
221	  /* Nothing special, go on with the next element.  */
222	  p_occ = &occ->next;
223	}
224    }
225
226  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
227  new_occ->next = *p_head;
228  *p_head = new_occ;
229}
230
231/* Register that we found a division in BB.  */
232
233static inline void
234register_division_in (basic_block bb)
235{
236  struct occurrence *occ;
237
238  occ = (struct occurrence *) bb->aux;
239  if (!occ)
240    {
241      occ = occ_new (bb, NULL);
242      insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
243    }
244
245  occ->bb_has_division = true;
246  occ->num_divisions++;
247}
248
249
250/* Compute the number of divisions that postdominate each block in OCC and
251   its children.  */
252
253static void
254compute_merit (struct occurrence *occ)
255{
256  struct occurrence *occ_child;
257  basic_block dom = occ->bb;
258
259  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
260    {
261      basic_block bb;
262      if (occ_child->children)
263        compute_merit (occ_child);
264
265      if (flag_exceptions)
266	bb = single_noncomplex_succ (dom);
267      else
268	bb = dom;
269
270      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
271        occ->num_divisions += occ_child->num_divisions;
272    }
273}
274
275
276/* Return whether USE_STMT is a floating-point division by DEF.  */
277static inline bool
278is_division_by (gimple use_stmt, tree def)
279{
280  return is_gimple_assign (use_stmt)
281	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
282	 && gimple_assign_rhs2 (use_stmt) == def
283	 /* Do not recognize x / x as valid division, as we are getting
284	    confused later by replacing all immediate uses x in such
285	    a stmt.  */
286	 && gimple_assign_rhs1 (use_stmt) != def;
287}
288
289/* Walk the subset of the dominator tree rooted at OCC, setting the
290   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
291   the given basic block.  The field may be left NULL, of course,
292   if it is not possible or profitable to do the optimization.
293
294   DEF_BSI is an iterator pointing at the statement defining DEF.
295   If RECIP_DEF is set, a dominator already has a computation that can
296   be used.  */
297
298static void
299insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
300		    tree def, tree recip_def, int threshold)
301{
302  tree type;
303  gimple new_stmt;
304  gimple_stmt_iterator gsi;
305  struct occurrence *occ_child;
306
307  if (!recip_def
308      && (occ->bb_has_division || !flag_trapping_math)
309      && occ->num_divisions >= threshold)
310    {
311      /* Make a variable with the replacement and substitute it.  */
312      type = TREE_TYPE (def);
313      recip_def = make_rename_temp (type, "reciptmp");
314      new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
315					       build_one_cst (type), def);
316
317      if (occ->bb_has_division)
318        {
319          /* Case 1: insert before an existing division.  */
320          gsi = gsi_after_labels (occ->bb);
321          while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
322	    gsi_next (&gsi);
323
324          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
325        }
326      else if (def_gsi && occ->bb == def_gsi->bb)
327        {
328          /* Case 2: insert right after the definition.  Note that this will
329	     never happen if the definition statement can throw, because in
330	     that case the sole successor of the statement's basic block will
331	     dominate all the uses as well.  */
332          gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
333        }
334      else
335        {
336          /* Case 3: insert in a basic block not containing defs/uses.  */
337          gsi = gsi_after_labels (occ->bb);
338          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
339        }
340
341      occ->recip_def_stmt = new_stmt;
342    }
343
344  occ->recip_def = recip_def;
345  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
346    insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
347}
348
349
350/* Replace the division at USE_P with a multiplication by the reciprocal, if
351   possible.  */
352
353static inline void
354replace_reciprocal (use_operand_p use_p)
355{
356  gimple use_stmt = USE_STMT (use_p);
357  basic_block bb = gimple_bb (use_stmt);
358  struct occurrence *occ = (struct occurrence *) bb->aux;
359
360  if (optimize_bb_for_speed_p (bb)
361      && occ->recip_def && use_stmt != occ->recip_def_stmt)
362    {
363      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
364      SET_USE (use_p, occ->recip_def);
365      fold_stmt_inplace (use_stmt);
366      update_stmt (use_stmt);
367    }
368}
369
370
371/* Free OCC and return one more "struct occurrence" to be freed.  */
372
373static struct occurrence *
374free_bb (struct occurrence *occ)
375{
376  struct occurrence *child, *next;
377
378  /* First get the two pointers hanging off OCC.  */
379  next = occ->next;
380  child = occ->children;
381  occ->bb->aux = NULL;
382  pool_free (occ_pool, occ);
383
384  /* Now ensure that we don't recurse unless it is necessary.  */
385  if (!child)
386    return next;
387  else
388    {
389      while (next)
390	next = free_bb (next);
391
392      return child;
393    }
394}
395
396
397/* Look for floating-point divisions among DEF's uses, and try to
398   replace them by multiplications with the reciprocal.  Add
399   as many statements computing the reciprocal as needed.
400
401   DEF must be a GIMPLE register of a floating-point type.  */
402
403static void
404execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
405{
406  use_operand_p use_p;
407  imm_use_iterator use_iter;
408  struct occurrence *occ;
409  int count = 0, threshold;
410
411  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
412
413  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
414    {
415      gimple use_stmt = USE_STMT (use_p);
416      if (is_division_by (use_stmt, def))
417	{
418	  register_division_in (gimple_bb (use_stmt));
419	  count++;
420	}
421    }
422
423  /* Do the expensive part only if we can hope to optimize something.  */
424  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
425  if (count >= threshold)
426    {
427      gimple use_stmt;
428      for (occ = occ_head; occ; occ = occ->next)
429	{
430	  compute_merit (occ);
431	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
432	}
433
434      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
435	{
436	  if (is_division_by (use_stmt, def))
437	    {
438	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
439		replace_reciprocal (use_p);
440	    }
441	}
442    }
443
444  for (occ = occ_head; occ; )
445    occ = free_bb (occ);
446
447  occ_head = NULL;
448}
449
450static bool
451gate_cse_reciprocals (void)
452{
453  return optimize && flag_reciprocal_math;
454}
455
456/* Go through all the floating-point SSA_NAMEs, and call
457   execute_cse_reciprocals_1 on each of them.  */
458static unsigned int
459execute_cse_reciprocals (void)
460{
461  basic_block bb;
462  tree arg;
463
464  occ_pool = create_alloc_pool ("dominators for recip",
465				sizeof (struct occurrence),
466				n_basic_blocks / 3 + 1);
467
468  calculate_dominance_info (CDI_DOMINATORS);
469  calculate_dominance_info (CDI_POST_DOMINATORS);
470
471#ifdef ENABLE_CHECKING
472  FOR_EACH_BB (bb)
473    gcc_assert (!bb->aux);
474#endif
475
476  for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = TREE_CHAIN (arg))
477    if (gimple_default_def (cfun, arg)
478	&& FLOAT_TYPE_P (TREE_TYPE (arg))
479	&& is_gimple_reg (arg))
480      execute_cse_reciprocals_1 (NULL, gimple_default_def (cfun, arg));
481
482  FOR_EACH_BB (bb)
483    {
484      gimple_stmt_iterator gsi;
485      gimple phi;
486      tree def;
487
488      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
489	{
490	  phi = gsi_stmt (gsi);
491	  def = PHI_RESULT (phi);
492	  if (FLOAT_TYPE_P (TREE_TYPE (def))
493	      && is_gimple_reg (def))
494	    execute_cse_reciprocals_1 (NULL, def);
495	}
496
497      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498        {
499	  gimple stmt = gsi_stmt (gsi);
500
501	  if (gimple_has_lhs (stmt)
502	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
503	      && FLOAT_TYPE_P (TREE_TYPE (def))
504	      && TREE_CODE (def) == SSA_NAME)
505	    execute_cse_reciprocals_1 (&gsi, def);
506	}
507
508      if (optimize_bb_for_size_p (bb))
509        continue;
510
511      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
512      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
513        {
514	  gimple stmt = gsi_stmt (gsi);
515	  tree fndecl;
516
517	  if (is_gimple_assign (stmt)
518	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
519	    {
520	      tree arg1 = gimple_assign_rhs2 (stmt);
521	      gimple stmt1;
522
523	      if (TREE_CODE (arg1) != SSA_NAME)
524		continue;
525
526	      stmt1 = SSA_NAME_DEF_STMT (arg1);
527
528	      if (is_gimple_call (stmt1)
529		  && gimple_call_lhs (stmt1)
530		  && (fndecl = gimple_call_fndecl (stmt1))
531		  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
532		      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
533		{
534		  enum built_in_function code;
535		  bool md_code, fail;
536		  imm_use_iterator ui;
537		  use_operand_p use_p;
538
539		  code = DECL_FUNCTION_CODE (fndecl);
540		  md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
541
542		  fndecl = targetm.builtin_reciprocal (code, md_code, false);
543		  if (!fndecl)
544		    continue;
545
546		  /* Check that all uses of the SSA name are divisions,
547		     otherwise replacing the defining statement will do
548		     the wrong thing.  */
549		  fail = false;
550		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
551		    {
552		      gimple stmt2 = USE_STMT (use_p);
553		      if (is_gimple_debug (stmt2))
554			continue;
555		      if (!is_gimple_assign (stmt2)
556			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
557			  || gimple_assign_rhs1 (stmt2) == arg1
558			  || gimple_assign_rhs2 (stmt2) != arg1)
559			{
560			  fail = true;
561			  break;
562			}
563		    }
564		  if (fail)
565		    continue;
566
567		  gimple_replace_lhs (stmt1, arg1);
568		  gimple_call_set_fndecl (stmt1, fndecl);
569		  update_stmt (stmt1);
570
571		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
572		    {
573		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
574		      fold_stmt_inplace (stmt);
575		      update_stmt (stmt);
576		    }
577		}
578	    }
579	}
580    }
581
582  free_dominance_info (CDI_DOMINATORS);
583  free_dominance_info (CDI_POST_DOMINATORS);
584  free_alloc_pool (occ_pool);
585  return 0;
586}
587
588struct gimple_opt_pass pass_cse_reciprocals =
589{
590 {
591  GIMPLE_PASS,
592  "recip",				/* name */
593  gate_cse_reciprocals,			/* gate */
594  execute_cse_reciprocals,		/* execute */
595  NULL,					/* sub */
596  NULL,					/* next */
597  0,					/* static_pass_number */
598  TV_NONE,				/* tv_id */
599  PROP_ssa,				/* properties_required */
600  0,					/* properties_provided */
601  0,					/* properties_destroyed */
602  0,					/* todo_flags_start */
603  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
604    | TODO_verify_stmts                /* todo_flags_finish */
605 }
606};
607
608/* Records an occurrence at statement USE_STMT in the vector of trees
609   STMTS if it is dominated by *TOP_BB or dominates it or this basic block
610   is not yet initialized.  Returns true if the occurrence was pushed on
611   the vector.  Adjusts *TOP_BB to be the basic block dominating all
612   statements in the vector.  */
613
614static bool
615maybe_record_sincos (VEC(gimple, heap) **stmts,
616		     basic_block *top_bb, gimple use_stmt)
617{
618  basic_block use_bb = gimple_bb (use_stmt);
619  if (*top_bb
620      && (*top_bb == use_bb
621	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
622    VEC_safe_push (gimple, heap, *stmts, use_stmt);
623  else if (!*top_bb
624	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
625    {
626      VEC_safe_push (gimple, heap, *stmts, use_stmt);
627      *top_bb = use_bb;
628    }
629  else
630    return false;
631
632  return true;
633}
634
635/* Look for sin, cos and cexpi calls with the same argument NAME and
636   create a single call to cexpi CSEing the result in this case.
637   We first walk over all immediate uses of the argument collecting
638   statements that we can CSE in a vector and in a second pass replace
639   the statement rhs with a REALPART or IMAGPART expression on the
640   result of the cexpi call we insert before the use statement that
641   dominates all other candidates.  */
642
643static bool
644execute_cse_sincos_1 (tree name)
645{
646  gimple_stmt_iterator gsi;
647  imm_use_iterator use_iter;
648  tree fndecl, res, type;
649  gimple def_stmt, use_stmt, stmt;
650  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
651  VEC(gimple, heap) *stmts = NULL;
652  basic_block top_bb = NULL;
653  int i;
654  bool cfg_changed = false;
655
656  type = TREE_TYPE (name);
657  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
658    {
659      if (gimple_code (use_stmt) != GIMPLE_CALL
660	  || !gimple_call_lhs (use_stmt)
661	  || !(fndecl = gimple_call_fndecl (use_stmt))
662	  || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
663	continue;
664
665      switch (DECL_FUNCTION_CODE (fndecl))
666	{
667	CASE_FLT_FN (BUILT_IN_COS):
668	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
669	  break;
670
671	CASE_FLT_FN (BUILT_IN_SIN):
672	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
673	  break;
674
675	CASE_FLT_FN (BUILT_IN_CEXPI):
676	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
677	  break;
678
679	default:;
680	}
681    }
682
683  if (seen_cos + seen_sin + seen_cexpi <= 1)
684    {
685      VEC_free(gimple, heap, stmts);
686      return false;
687    }
688
689  /* Simply insert cexpi at the beginning of top_bb but not earlier than
690     the name def statement.  */
691  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
692  if (!fndecl)
693    return false;
694  res = create_tmp_var (TREE_TYPE (TREE_TYPE (fndecl)), "sincostmp");
695  if (TREE_CODE (TREE_TYPE (TREE_TYPE (fndecl))) == COMPLEX_TYPE
696      || TREE_CODE (TREE_TYPE (TREE_TYPE (fndecl))) == VECTOR_TYPE)
697    DECL_GIMPLE_REG_P (res) = 1;
698  stmt = gimple_build_call (fndecl, 1, name);
699  res = make_ssa_name (res, stmt);
700  gimple_call_set_lhs (stmt, res);
701
702  def_stmt = SSA_NAME_DEF_STMT (name);
703  if (!SSA_NAME_IS_DEFAULT_DEF (name)
704      && gimple_code (def_stmt) != GIMPLE_PHI
705      && gimple_bb (def_stmt) == top_bb)
706    {
707      gsi = gsi_for_stmt (def_stmt);
708      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
709    }
710  else
711    {
712      gsi = gsi_after_labels (top_bb);
713      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
714    }
715  update_stmt (stmt);
716
717  /* And adjust the recorded old call sites.  */
718  for (i = 0; VEC_iterate(gimple, stmts, i, use_stmt); ++i)
719    {
720      tree rhs = NULL;
721      fndecl = gimple_call_fndecl (use_stmt);
722
723      switch (DECL_FUNCTION_CODE (fndecl))
724	{
725	CASE_FLT_FN (BUILT_IN_COS):
726	  rhs = fold_build1 (REALPART_EXPR, type, res);
727	  break;
728
729	CASE_FLT_FN (BUILT_IN_SIN):
730	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
731	  break;
732
733	CASE_FLT_FN (BUILT_IN_CEXPI):
734	  rhs = res;
735	  break;
736
737	default:;
738	  gcc_unreachable ();
739	}
740
741	/* Replace call with a copy.  */
742	stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
743
744	gsi = gsi_for_stmt (use_stmt);
745	gsi_replace (&gsi, stmt, true);
746	if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
747	  cfg_changed = true;
748    }
749
750  VEC_free(gimple, heap, stmts);
751
752  return cfg_changed;
753}
754
755/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
756   on the SSA_NAME argument of each of them.  */
757
758static unsigned int
759execute_cse_sincos (void)
760{
761  basic_block bb;
762  bool cfg_changed = false;
763
764  calculate_dominance_info (CDI_DOMINATORS);
765
766  FOR_EACH_BB (bb)
767    {
768      gimple_stmt_iterator gsi;
769
770      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
771        {
772	  gimple stmt = gsi_stmt (gsi);
773	  tree fndecl;
774
775	  if (is_gimple_call (stmt)
776	      && gimple_call_lhs (stmt)
777	      && (fndecl = gimple_call_fndecl (stmt))
778	      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
779	    {
780	      tree arg;
781
782	      switch (DECL_FUNCTION_CODE (fndecl))
783		{
784		CASE_FLT_FN (BUILT_IN_COS):
785		CASE_FLT_FN (BUILT_IN_SIN):
786		CASE_FLT_FN (BUILT_IN_CEXPI):
787		  arg = gimple_call_arg (stmt, 0);
788		  if (TREE_CODE (arg) == SSA_NAME)
789		    cfg_changed |= execute_cse_sincos_1 (arg);
790		  break;
791
792		default:;
793		}
794	    }
795	}
796    }
797
798  free_dominance_info (CDI_DOMINATORS);
799  return cfg_changed ? TODO_cleanup_cfg : 0;
800}
801
802static bool
803gate_cse_sincos (void)
804{
805  /* Make sure we have either sincos or cexp.  */
806  return (TARGET_HAS_SINCOS
807	  || TARGET_C99_FUNCTIONS)
808	 && optimize;
809}
810
811struct gimple_opt_pass pass_cse_sincos =
812{
813 {
814  GIMPLE_PASS,
815  "sincos",				/* name */
816  gate_cse_sincos,			/* gate */
817  execute_cse_sincos,			/* execute */
818  NULL,					/* sub */
819  NULL,					/* next */
820  0,					/* static_pass_number */
821  TV_NONE,				/* tv_id */
822  PROP_ssa,				/* properties_required */
823  0,					/* properties_provided */
824  0,					/* properties_destroyed */
825  0,					/* todo_flags_start */
826  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
827    | TODO_verify_stmts                 /* todo_flags_finish */
828 }
829};
830
831/* A symbolic number is used to detect byte permutation and selection
832   patterns.  Therefore the field N contains an artificial number
833   consisting of byte size markers:
834
835   0    - byte has the value 0
836   1..size - byte contains the content of the byte
837   number indexed with that value minus one  */
838
839struct symbolic_number {
840  unsigned HOST_WIDEST_INT n;
841  int size;
842};
843
844/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
845   number N.  Return false if the requested operation is not permitted
846   on a symbolic number.  */
847
848static inline bool
849do_shift_rotate (enum tree_code code,
850		 struct symbolic_number *n,
851		 int count)
852{
853  if (count % 8 != 0)
854    return false;
855
856  /* Zero out the extra bits of N in order to avoid them being shifted
857     into the significant bits.  */
858  if (n->size < (int)sizeof (HOST_WIDEST_INT))
859    n->n &= ((unsigned HOST_WIDEST_INT)1 << (n->size * BITS_PER_UNIT)) - 1;
860
861  switch (code)
862    {
863    case LSHIFT_EXPR:
864      n->n <<= count;
865      break;
866    case RSHIFT_EXPR:
867      n->n >>= count;
868      break;
869    case LROTATE_EXPR:
870      n->n = (n->n << count) | (n->n >> ((n->size * BITS_PER_UNIT) - count));
871      break;
872    case RROTATE_EXPR:
873      n->n = (n->n >> count) | (n->n << ((n->size * BITS_PER_UNIT) - count));
874      break;
875    default:
876      return false;
877    }
878  return true;
879}
880
881/* Perform sanity checking for the symbolic number N and the gimple
882   statement STMT.  */
883
884static inline bool
885verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
886{
887  tree lhs_type;
888
889  lhs_type = gimple_expr_type (stmt);
890
891  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
892    return false;
893
894  if (TYPE_PRECISION (lhs_type) != n->size * BITS_PER_UNIT)
895    return false;
896
897  return true;
898}
899
900/* find_bswap_1 invokes itself recursively with N and tries to perform
901   the operation given by the rhs of STMT on the result.  If the
902   operation could successfully be executed the function returns the
903   tree expression of the source operand and NULL otherwise.  */
904
905static tree
906find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
907{
908  enum tree_code code;
909  tree rhs1, rhs2 = NULL;
910  gimple rhs1_stmt, rhs2_stmt;
911  tree source_expr1;
912  enum gimple_rhs_class rhs_class;
913
914  if (!limit || !is_gimple_assign (stmt))
915    return NULL_TREE;
916
917  rhs1 = gimple_assign_rhs1 (stmt);
918
919  if (TREE_CODE (rhs1) != SSA_NAME)
920    return NULL_TREE;
921
922  code = gimple_assign_rhs_code (stmt);
923  rhs_class = gimple_assign_rhs_class (stmt);
924  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
925
926  if (rhs_class == GIMPLE_BINARY_RHS)
927    rhs2 = gimple_assign_rhs2 (stmt);
928
929  /* Handle unary rhs and binary rhs with integer constants as second
930     operand.  */
931
932  if (rhs_class == GIMPLE_UNARY_RHS
933      || (rhs_class == GIMPLE_BINARY_RHS
934	  && TREE_CODE (rhs2) == INTEGER_CST))
935    {
936      if (code != BIT_AND_EXPR
937	  && code != LSHIFT_EXPR
938	  && code != RSHIFT_EXPR
939	  && code != LROTATE_EXPR
940	  && code != RROTATE_EXPR
941	  && code != NOP_EXPR
942	  && code != CONVERT_EXPR)
943	return NULL_TREE;
944
945      source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
946
947      /* If find_bswap_1 returned NULL STMT is a leaf node and we have
948	 to initialize the symbolic number.  */
949      if (!source_expr1)
950	{
951	  /* Set up the symbolic number N by setting each byte to a
952	     value between 1 and the byte size of rhs1.  The highest
953	     order byte is set to n->size and the lowest order
954	     byte to 1.  */
955	  n->size = TYPE_PRECISION (TREE_TYPE (rhs1));
956	  if (n->size % BITS_PER_UNIT != 0)
957	    return NULL_TREE;
958	  n->size /= BITS_PER_UNIT;
959	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
960		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
961
962	  if (n->size < (int)sizeof (HOST_WIDEST_INT))
963	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
964		     (n->size * BITS_PER_UNIT)) - 1;
965
966	  source_expr1 = rhs1;
967	}
968
969      switch (code)
970	{
971	case BIT_AND_EXPR:
972	  {
973	    int i;
974	    unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
975	    unsigned HOST_WIDEST_INT tmp = val;
976
977	    /* Only constants masking full bytes are allowed.  */
978	    for (i = 0; i < n->size; i++, tmp >>= BITS_PER_UNIT)
979	      if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
980		return NULL_TREE;
981
982	    n->n &= val;
983	  }
984	  break;
985	case LSHIFT_EXPR:
986	case RSHIFT_EXPR:
987	case LROTATE_EXPR:
988	case RROTATE_EXPR:
989	  if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
990	    return NULL_TREE;
991	  break;
992	CASE_CONVERT:
993	  {
994	    int type_size;
995
996	    type_size = TYPE_PRECISION (gimple_expr_type (stmt));
997	    if (type_size % BITS_PER_UNIT != 0)
998	      return NULL_TREE;
999
1000	    if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1001	      {
1002		/* If STMT casts to a smaller type mask out the bits not
1003		   belonging to the target type.  */
1004		n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1005	      }
1006	    n->size = type_size / BITS_PER_UNIT;
1007	  }
1008	  break;
1009	default:
1010	  return NULL_TREE;
1011	};
1012      return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1013    }
1014
1015  /* Handle binary rhs.  */
1016
1017  if (rhs_class == GIMPLE_BINARY_RHS)
1018    {
1019      struct symbolic_number n1, n2;
1020      tree source_expr2;
1021
1022      if (code != BIT_IOR_EXPR)
1023	return NULL_TREE;
1024
1025      if (TREE_CODE (rhs2) != SSA_NAME)
1026	return NULL_TREE;
1027
1028      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1029
1030      switch (code)
1031	{
1032	case BIT_IOR_EXPR:
1033	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1034
1035	  if (!source_expr1)
1036	    return NULL_TREE;
1037
1038	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1039
1040	  if (source_expr1 != source_expr2
1041	      || n1.size != n2.size)
1042	    return NULL_TREE;
1043
1044	  n->size = n1.size;
1045	  n->n = n1.n | n2.n;
1046
1047	  if (!verify_symbolic_number_p (n, stmt))
1048	    return NULL_TREE;
1049
1050	  break;
1051	default:
1052	  return NULL_TREE;
1053	}
1054      return source_expr1;
1055    }
1056  return NULL_TREE;
1057}
1058
1059/* Check if STMT completes a bswap implementation consisting of ORs,
1060   SHIFTs and ANDs.  Return the source tree expression on which the
1061   byte swap is performed and NULL if no bswap was found.  */
1062
1063static tree
1064find_bswap (gimple stmt)
1065{
1066/* The number which the find_bswap result should match in order to
1067   have a full byte swap.  The number is shifted to the left according
1068   to the size of the symbolic number before using it.  */
1069  unsigned HOST_WIDEST_INT cmp =
1070    sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1071    (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1072
1073  struct symbolic_number n;
1074  tree source_expr;
1075
1076  /* The last parameter determines the depth search limit.  It usually
1077     correlates directly to the number of bytes to be touched.  We
1078     increase that number by one here in order to also cover signed ->
1079     unsigned conversions of the src operand as can be seen in
1080     libgcc.  */
1081  source_expr =  find_bswap_1 (stmt, &n,
1082			       TREE_INT_CST_LOW (
1083				 TYPE_SIZE_UNIT (gimple_expr_type (stmt))) + 1);
1084
1085  if (!source_expr)
1086    return NULL_TREE;
1087
1088  /* Zero out the extra bits of N and CMP.  */
1089  if (n.size < (int)sizeof (HOST_WIDEST_INT))
1090    {
1091      unsigned HOST_WIDEST_INT mask =
1092	((unsigned HOST_WIDEST_INT)1 << (n.size * BITS_PER_UNIT)) - 1;
1093
1094      n.n &= mask;
1095      cmp >>= (sizeof (HOST_WIDEST_INT) - n.size) * BITS_PER_UNIT;
1096    }
1097
1098  /* A complete byte swap should make the symbolic number to start
1099     with the largest digit in the highest order byte.  */
1100  if (cmp != n.n)
1101    return NULL_TREE;
1102
1103  return source_expr;
1104}
1105
1106/* Find manual byte swap implementations and turn them into a bswap
1107   builtin invokation.  */
1108
1109static unsigned int
1110execute_optimize_bswap (void)
1111{
1112  basic_block bb;
1113  bool bswap32_p, bswap64_p;
1114  bool changed = false;
1115  tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1116
1117  if (BITS_PER_UNIT != 8)
1118    return 0;
1119
1120  if (sizeof (HOST_WIDEST_INT) < 8)
1121    return 0;
1122
1123  bswap32_p = (built_in_decls[BUILT_IN_BSWAP32]
1124	       && optab_handler (bswap_optab, SImode)->insn_code !=
1125	       CODE_FOR_nothing);
1126  bswap64_p = (built_in_decls[BUILT_IN_BSWAP64]
1127	       && (optab_handler (bswap_optab, DImode)->insn_code !=
1128		   CODE_FOR_nothing
1129		   || (bswap32_p && word_mode == SImode)));
1130
1131  if (!bswap32_p && !bswap64_p)
1132    return 0;
1133
1134  /* Determine the argument type of the builtins.  The code later on
1135     assumes that the return and argument type are the same.  */
1136  if (bswap32_p)
1137    {
1138      tree fndecl = built_in_decls[BUILT_IN_BSWAP32];
1139      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1140    }
1141
1142  if (bswap64_p)
1143    {
1144      tree fndecl = built_in_decls[BUILT_IN_BSWAP64];
1145      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1146    }
1147
1148  FOR_EACH_BB (bb)
1149    {
1150      gimple_stmt_iterator gsi;
1151
1152      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1153        {
1154	  gimple stmt = gsi_stmt (gsi);
1155	  tree bswap_src, bswap_type;
1156	  tree bswap_tmp;
1157	  tree fndecl = NULL_TREE;
1158	  int type_size;
1159	  gimple call;
1160
1161	  if (!is_gimple_assign (stmt)
1162	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1163	    continue;
1164
1165	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1166
1167	  switch (type_size)
1168	    {
1169	    case 32:
1170	      if (bswap32_p)
1171		{
1172		  fndecl = built_in_decls[BUILT_IN_BSWAP32];
1173		  bswap_type = bswap32_type;
1174		}
1175	      break;
1176	    case 64:
1177	      if (bswap64_p)
1178		{
1179		  fndecl = built_in_decls[BUILT_IN_BSWAP64];
1180		  bswap_type = bswap64_type;
1181		}
1182	      break;
1183	    default:
1184	      continue;
1185	    }
1186
1187	  if (!fndecl)
1188	    continue;
1189
1190	  bswap_src = find_bswap (stmt);
1191
1192	  if (!bswap_src)
1193	    continue;
1194
1195	  changed = true;
1196
1197	  bswap_tmp = bswap_src;
1198
1199	  /* Convert the src expression if necessary.  */
1200	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1201	    {
1202	      gimple convert_stmt;
1203
1204	      bswap_tmp = create_tmp_var (bswap_type, "bswapsrc");
1205	      add_referenced_var (bswap_tmp);
1206	      bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1207
1208	      convert_stmt = gimple_build_assign_with_ops (
1209			       CONVERT_EXPR, bswap_tmp, bswap_src, NULL);
1210	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1211	    }
1212
1213	  call = gimple_build_call (fndecl, 1, bswap_tmp);
1214
1215	  bswap_tmp = gimple_assign_lhs (stmt);
1216
1217	  /* Convert the result if necessary.  */
1218	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1219	    {
1220	      gimple convert_stmt;
1221
1222	      bswap_tmp = create_tmp_var (bswap_type, "bswapdst");
1223	      add_referenced_var (bswap_tmp);
1224	      bswap_tmp = make_ssa_name (bswap_tmp, NULL);
1225	      convert_stmt = gimple_build_assign_with_ops (
1226		               CONVERT_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1227	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1228	    }
1229
1230	  gimple_call_set_lhs (call, bswap_tmp);
1231
1232	  if (dump_file)
1233	    {
1234	      fprintf (dump_file, "%d bit bswap implementation found at: ",
1235		       (int)type_size);
1236	      print_gimple_stmt (dump_file, stmt, 0, 0);
1237	    }
1238
1239	  gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1240	  gsi_remove (&gsi, true);
1241	}
1242    }
1243
1244  return (changed ? TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
1245	  | TODO_verify_stmts : 0);
1246}
1247
1248static bool
1249gate_optimize_bswap (void)
1250{
1251  return flag_expensive_optimizations && optimize;
1252}
1253
1254struct gimple_opt_pass pass_optimize_bswap =
1255{
1256 {
1257  GIMPLE_PASS,
1258  "bswap",				/* name */
1259  gate_optimize_bswap,                  /* gate */
1260  execute_optimize_bswap,		/* execute */
1261  NULL,					/* sub */
1262  NULL,					/* next */
1263  0,					/* static_pass_number */
1264  TV_NONE,				/* tv_id */
1265  PROP_ssa,				/* properties_required */
1266  0,					/* properties_provided */
1267  0,					/* properties_destroyed */
1268  0,					/* todo_flags_start */
1269  0                                     /* todo_flags_finish */
1270 }
1271};
1272