tree-ssa-math-opts.c revision 1.11
1/* Global, SSA-based optimizations using mathematical identities.
2   Copyright (C) 2005-2017 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3.  If not see
18<http://www.gnu.org/licenses/>.  */
19
1/* The first mini-pass in this file tries to CSE reciprocal
21   operations.  These are common in sequences such as this one:
22
23	modulus = sqrt(x*x + y*y + z*z);
24	x = x / modulus;
25	y = y / modulus;
26	z = z / modulus;
27
28   that can be optimized to
29
30	modulus = sqrt(x*x + y*y + z*z);
31        rmodulus = 1.0 / modulus;
32	x = x * rmodulus;
33	y = y * rmodulus;
34	z = z * rmodulus;
35
36   We do this for loop invariant divisors, and with this pass whenever
37   we notice that a division has the same divisor multiple times.
38
39   Of course, like in PRE, we don't insert a division if a dominator
40   already has one.  However, this cannot be done as an extension of
41   PRE for several reasons.
42
43   First of all, experiments showed that the transformation is not always
44   useful if there are only two divisions
45   by the same divisor.  This is probably because modern processors
46   can pipeline the divisions; on older, in-order processors it should
47   still be effective to optimize two divisions by the same number.
48   We make this a param, and it shall be called N in the remainder of
49   this comment.
50
51   Second, if trapping math is active, we have less freedom on where
52   to insert divisions: we can only do so in basic blocks that already
53   contain one.  (If divisions don't trap, we can instead insert
54   divisions elsewhere, in blocks that are common dominators
55   of those that have the division.)
56
57   We really don't want to compute the reciprocal unless a division will
58   be found.  To do this, we won't insert the division in a basic block
59   that has less than N divisions *post-dominating* it.
60
61   The algorithm constructs a subset of the dominator tree, holding the
62   blocks containing the divisions and their common dominators,
63   and walks it twice.  The first walk is in post-order, and it annotates
64   each block with the number of divisions that post-dominate it: this
65   gives information on where divisions can be inserted profitably.
66   The second walk is in pre-order, and it inserts divisions as explained
67   above, and replaces divisions by multiplications.
68
69   In the best case, the cost of the pass is O(n_statements).  In the
70   worst-case, the cost is due to creating the dominator tree subset,
71   with a cost of O(n_basic_blocks ^ 2); however this can only happen
72   for n_statements / n_basic_blocks statements.  So, the amortized cost
73   of creating the dominator tree subset is O(n_basic_blocks) and the
74   worst-case cost of the pass is O(n_statements * n_basic_blocks).
75
76   More practically, the cost will be small because there are few
77   divisions, and they tend to be in the same basic block, so insert_bb
78   is called very few times.
79
80   If we did this using domwalk.c, an efficient implementation would have
81   to work on all the variables in a single pass, because we could not
82   work on just a subset of the dominator tree, as we do now, and the
83   cost would also be something like O(n_statements * n_basic_blocks).
84   The data structures would be more complex in order to work on all the
85   variables in a single pass.  */
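
/* As a rough illustration of the insertion rule (assuming non-trapping
   math and a target threshold N of 2), in

	x = a / d;
	...
	y = b / d;	<- this block post-dominates the first division

   the reciprocal is emitted once, right before the first division, and
   both divisions become multiplications:

	t = 1.0 / d;
	x = a * t;
	...
	y = b * t;

   whereas in

	x = a / d;
	if (c)
	  y = b / d;

   nothing is done, because only one division post-dominates any block
   where the reciprocal could be inserted.  */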
86
87#include "config.h"
88#include "system.h"
89#include "coretypes.h"
90#include "backend.h"
91#include "target.h"
92#include "rtl.h"
93#include "tree.h"
94#include "gimple.h"
95#include "predict.h"
96#include "alloc-pool.h"
97#include "tree-pass.h"
98#include "ssa.h"
99#include "optabs-tree.h"
100#include "gimple-pretty-print.h"
101#include "alias.h"
102#include "fold-const.h"
103#include "gimple-fold.h"
104#include "gimple-iterator.h"
105#include "gimplify.h"
106#include "gimplify-me.h"
107#include "stor-layout.h"
108#include "tree-cfg.h"
109#include "tree-dfa.h"
110#include "tree-ssa.h"
111#include "builtins.h"
112#include "params.h"
113#include "internal-fn.h"
114#include "case-cfn-macros.h"
115#include "optabs-libfuncs.h"
116#include "tree-eh.h"
117#include "targhooks.h"
118
119/* This structure represents one basic block that either computes a
120   division, or is a common dominator for basic blocks that compute a
121   division.  */
122struct occurrence {
123  /* The basic block represented by this structure.  */
124  basic_block bb;
125
126  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
127     inserted in BB.  */
128  tree recip_def;
129
130  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
131     was inserted in BB.  */
132  gimple *recip_def_stmt;
133
134  /* Pointer to a list of "struct occurrence"s for blocks dominated
135     by BB.  */
136  struct occurrence *children;
137
138  /* Pointer to the next "struct occurrence" in the list of blocks
139     sharing a common dominator.  */
140  struct occurrence *next;
141
142  /* The number of divisions that are in BB before compute_merit.  The
143     number of divisions that are in BB or post-dominate it after
144     compute_merit.  */
145  int num_divisions;
146
147  /* True if the basic block has a division, false if it is a common
148     dominator for basic blocks that do.  If it is false and trapping
149     math is active, BB is not a candidate for inserting a reciprocal.  */
150  bool bb_has_division;
151};
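
/* For the example in the comment at the top of the file, where all three
   divisions by "modulus" appear in a single basic block BB, the subset
   consists of just one node: bb == BB, bb_has_division == true,
   num_divisions == 3, and children == next == NULL.  */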
152
153static struct
154{
155  /* Number of 1.0/X ops inserted.  */
156  int rdivs_inserted;
157
158  /* Number of 1.0/FUNC ops inserted.  */
159  int rfuncs_inserted;
160} reciprocal_stats;
161
162static struct
163{
164  /* Number of cexpi calls inserted.  */
165  int inserted;
166} sincos_stats;
167
168static struct
169{
170  /* Number of hand-written 16-bit nop / bswaps found.  */
171  int found_16bit;
172
173  /* Number of hand-written 32-bit nop / bswaps found.  */
174  int found_32bit;
175
176  /* Number of hand-written 64-bit nop / bswaps found.  */
177  int found_64bit;
178} nop_stats, bswap_stats;
179
180static struct
181{
182  /* Number of widening multiplication ops inserted.  */
183  int widen_mults_inserted;
184
185  /* Number of integer multiply-and-accumulate ops inserted.  */
186  int maccs_inserted;
187
188  /* Number of fp fused multiply-add ops inserted.  */
189  int fmas_inserted;
190
191  /* Number of divmod calls inserted.  */
192  int divmod_calls_inserted;
193} widen_mul_stats;
194
195/* The instance of "struct occurrence" representing the highest
196   interesting block in the dominator tree.  */
197static struct occurrence *occ_head;
198
199/* Allocation pool for getting instances of "struct occurrence".  */
200static object_allocator<occurrence> *occ_pool;
201
202
203
204/* Allocate and return a new struct occurrence for basic block BB,
205   whose children list is headed by CHILDREN.  */
206static struct occurrence *
207occ_new (basic_block bb, struct occurrence *children)
208{
209  struct occurrence *occ;
210
211  bb->aux = occ = occ_pool->allocate ();
212  memset (occ, 0, sizeof (struct occurrence));
213
214  occ->bb = bb;
215  occ->children = children;
216  return occ;
217}
218
219
220/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
221   list of "struct occurrence"s, one per basic block, having IDOM as
222   their common dominator.
223
224   We try to insert NEW_OCC as deep as possible in the tree, and we also
225   insert any other block that is a common dominator for BB and one
226   block already in the tree.  */
227
228static void
229insert_bb (struct occurrence *new_occ, basic_block idom,
230	   struct occurrence **p_head)
231{
232  struct occurrence *occ, **p_occ;
233
234  for (p_occ = p_head; (occ = *p_occ) != NULL; )
235    {
236      basic_block bb = new_occ->bb, occ_bb = occ->bb;
237      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
238      if (dom == bb)
239	{
240	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
241	     from its list.  */
242	  *p_occ = occ->next;
243	  occ->next = new_occ->children;
244	  new_occ->children = occ;
245
246	  /* Try the next block (it may also be dominated by BB).  */
247	}
248
249      else if (dom == occ_bb)
250	{
251	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
252	  insert_bb (new_occ, dom, &occ->children);
253	  return;
254	}
255
256      else if (dom != idom)
257	{
258	  gcc_assert (!dom->aux);
259
260	  /* There is a dominator between IDOM and BB, add it and make
261	     two children out of NEW_OCC and OCC.  First, remove OCC from
262	     its list.	*/
263	  *p_occ = occ->next;
264	  new_occ->next = occ;
265	  occ->next = NULL;
266
267	  /* None of the previous blocks has DOM as a dominator: if we tail
268	     recursed, we would reexamine them uselessly. Just switch BB with
269	     DOM, and go on looking for blocks dominated by DOM.  */
270          new_occ = occ_new (dom, new_occ);
271	}
272
273      else
274	{
275	  /* Nothing special, go on with the next element.  */
276	  p_occ = &occ->next;
277	}
278    }
279
280  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
281  new_occ->next = *p_head;
282  *p_head = new_occ;
283}
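
/* As an example, suppose the list at *P_HEAD contains only a node for
   block B1, and NEW_OCC represents a block B2 that neither dominates nor
   is dominated by B1.  If their nearest common dominator D is strictly
   below IDOM, the third case above creates a node for D, makes B2 and B1
   its children, and D takes B1's place on the sibling list.  */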
284
285/* Register that we found a division in BB.  */
286
287static inline void
288register_division_in (basic_block bb)
289{
290  struct occurrence *occ;
291
292  occ = (struct occurrence *) bb->aux;
293  if (!occ)
294    {
295      occ = occ_new (bb, NULL);
296      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
297    }
298
299  occ->bb_has_division = true;
300  occ->num_divisions++;
301}
302
303
304/* Compute the number of divisions that postdominate each block in OCC and
305   its children.  */
306
307static void
308compute_merit (struct occurrence *occ)
309{
310  struct occurrence *occ_child;
311  basic_block dom = occ->bb;
312
313  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
314    {
315      basic_block bb;
316      if (occ_child->children)
317        compute_merit (occ_child);
318
319      if (flag_exceptions)
320	bb = single_noncomplex_succ (dom);
321      else
322	bb = dom;
323
324      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
325        occ->num_divisions += occ_child->num_divisions;
326    }
327}
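
/* For instance, if OCC has one division of its own and a single child
   whose block contains one division and post-dominates OCC->bb (ignoring
   the flag_exceptions adjustment above), the walk leaves the child's
   num_divisions at 1 and raises OCC's to 2; if the child does not
   post-dominate OCC->bb, OCC's count stays at 1.  */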
328
329
330/* Return whether USE_STMT is a floating-point division by DEF.  */
331static inline bool
332is_division_by (gimple *use_stmt, tree def)
333{
334  return is_gimple_assign (use_stmt)
335	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
336	 && gimple_assign_rhs2 (use_stmt) == def
337	 /* Do not recognize x / x as a valid division, as we would get
338	    confused later when replacing all immediate uses of x in such
339	    a stmt.  */
340	 && gimple_assign_rhs1 (use_stmt) != def
341	 && !stmt_can_throw_internal (use_stmt);
342}
343
344/* Walk the subset of the dominator tree rooted at OCC, setting the
345   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
346   the given basic block.  The field may be left NULL, of course,
347   if it is not possible or profitable to do the optimization.
348
349   DEF_GSI is an iterator pointing at the statement defining DEF.
350   If RECIP_DEF is set, a dominator already has a computation that can
351   be used.  */
352
353static void
354insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
355		    tree def, tree recip_def, int threshold)
356{
357  tree type;
358  gassign *new_stmt;
359  gimple_stmt_iterator gsi;
360  struct occurrence *occ_child;
361
362  if (!recip_def
363      && (occ->bb_has_division || !flag_trapping_math)
364      && occ->num_divisions >= threshold)
365    {
366      /* Make a variable with the replacement and substitute it.  */
367      type = TREE_TYPE (def);
368      recip_def = create_tmp_reg (type, "reciptmp");
369      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
370				      build_one_cst (type), def);
371
372      if (occ->bb_has_division)
373        {
374          /* Case 1: insert before an existing division.  */
375          gsi = gsi_after_labels (occ->bb);
376          while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
377	    gsi_next (&gsi);
378
379          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
380        }
381      else if (def_gsi && occ->bb == def_gsi->bb)
382        {
383          /* Case 2: insert right after the definition.  Note that this will
384	     never happen if the definition statement can throw, because in
385	     that case the sole successor of the statement's basic block will
386	     dominate all the uses as well.  */
387          gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
388        }
389      else
390        {
391          /* Case 3: insert in a basic block not containing defs/uses.  */
392          gsi = gsi_after_labels (occ->bb);
393          gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
394        }
395
396      reciprocal_stats.rdivs_inserted++;
397
398      occ->recip_def_stmt = new_stmt;
399    }
400
401  occ->recip_def = recip_def;
402  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
403    insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
404}
405
406
407/* Replace the division at USE_P with a multiplication by the reciprocal, if
408   possible.  */
409
410static inline void
411replace_reciprocal (use_operand_p use_p)
412{
413  gimple *use_stmt = USE_STMT (use_p);
414  basic_block bb = gimple_bb (use_stmt);
415  struct occurrence *occ = (struct occurrence *) bb->aux;
416
417  if (optimize_bb_for_speed_p (bb)
418      && occ->recip_def && use_stmt != occ->recip_def_stmt)
419    {
420      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
421      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
422      SET_USE (use_p, occ->recip_def);
423      fold_stmt_inplace (&gsi);
424      update_stmt (use_stmt);
425    }
426}
427
428
429/* Free OCC and return the next "struct occurrence" to be freed.  */
430
431static struct occurrence *
432free_bb (struct occurrence *occ)
433{
434  struct occurrence *child, *next;
435
436  /* First get the two pointers hanging off OCC.  */
437  next = occ->next;
438  child = occ->children;
439  occ->bb->aux = NULL;
440  occ_pool->remove (occ);
441
442  /* Now ensure that we don't recurse unless it is necessary.  */
443  if (!child)
444    return next;
445  else
446    {
447      while (next)
448	next = free_bb (next);
449
450      return child;
451    }
452}
453
454
455/* Look for floating-point divisions among DEF's uses, and try to
456   replace them by multiplications with the reciprocal.  Add
457   as many statements computing the reciprocal as needed.
458
459   DEF must be a GIMPLE register of a floating-point type.  */
460
461static void
462execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
463{
464  use_operand_p use_p;
465  imm_use_iterator use_iter;
466  struct occurrence *occ;
467  int count = 0, threshold;
468
469  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
470
471  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
472    {
473      gimple *use_stmt = USE_STMT (use_p);
474      if (is_division_by (use_stmt, def))
475	{
476	  register_division_in (gimple_bb (use_stmt));
477	  count++;
478	}
479    }
480
481  /* Do the expensive part only if we can hope to optimize something.  */
482  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
483  if (count >= threshold)
484    {
485      gimple *use_stmt;
486      for (occ = occ_head; occ; occ = occ->next)
487	{
488	  compute_merit (occ);
489	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
490	}
491
492      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
493	{
494	  if (is_division_by (use_stmt, def))
495	    {
496	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
497		replace_reciprocal (use_p);
498	    }
499	}
500    }
501
502  for (occ = occ_head; occ; )
503    occ = free_bb (occ);
504
505  occ_head = NULL;
506}
507
508/* Return an internal function that implements the reciprocal of CALL,
509   or IFN_LAST if there is no such function that the target supports.  */
510
511internal_fn
512internal_fn_reciprocal (gcall *call)
513{
514  internal_fn ifn;
515
516  switch (gimple_call_combined_fn (call))
517    {
518    CASE_CFN_SQRT:
519      ifn = IFN_RSQRT;
520      break;
521
522    default:
523      return IFN_LAST;
524    }
525
526  tree_pair types = direct_internal_fn_types (ifn, call);
527  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
528    return IFN_LAST;
529
530  return ifn;
531}
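
/* For example, on a target that supports IFN_RSQRT for the operand's mode,
   the a/func(b) scan in pass_cse_reciprocals::execute below can rewrite

	t = sqrtf (b);
	a = x / t;

   into

	t = .RSQRT (b);
	a = x * t;

   provided every use of t is a division by t.  */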
532
533/* Go through all the floating-point SSA_NAMEs, and call
534   execute_cse_reciprocals_1 on each of them.  */
535namespace {
536
537const pass_data pass_data_cse_reciprocals =
538{
539  GIMPLE_PASS, /* type */
540  "recip", /* name */
541  OPTGROUP_NONE, /* optinfo_flags */
542  TV_NONE, /* tv_id */
543  PROP_ssa, /* properties_required */
544  0, /* properties_provided */
545  0, /* properties_destroyed */
546  0, /* todo_flags_start */
547  TODO_update_ssa, /* todo_flags_finish */
548};
549
550class pass_cse_reciprocals : public gimple_opt_pass
551{
552public:
553  pass_cse_reciprocals (gcc::context *ctxt)
554    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
555  {}
556
557  /* opt_pass methods: */
558  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
559  virtual unsigned int execute (function *);
560
561}; // class pass_cse_reciprocals
562
563unsigned int
564pass_cse_reciprocals::execute (function *fun)
565{
566  basic_block bb;
567  tree arg;
568
569  occ_pool = new object_allocator<occurrence> ("dominators for recip");
570
571  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
572  calculate_dominance_info (CDI_DOMINATORS);
573  calculate_dominance_info (CDI_POST_DOMINATORS);
574
575  if (flag_checking)
576    FOR_EACH_BB_FN (bb, fun)
577      gcc_assert (!bb->aux);
578
579  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
580    if (FLOAT_TYPE_P (TREE_TYPE (arg))
581	&& is_gimple_reg (arg))
582      {
583	tree name = ssa_default_def (fun, arg);
584	if (name)
585	  execute_cse_reciprocals_1 (NULL, name);
586      }
587
588  FOR_EACH_BB_FN (bb, fun)
589    {
590      tree def;
591
592      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
593	   gsi_next (&gsi))
594	{
595	  gphi *phi = gsi.phi ();
596	  def = PHI_RESULT (phi);
597	  if (! virtual_operand_p (def)
598	      && FLOAT_TYPE_P (TREE_TYPE (def)))
599	    execute_cse_reciprocals_1 (NULL, def);
600	}
601
602      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
603	   gsi_next (&gsi))
604        {
605	  gimple *stmt = gsi_stmt (gsi);
606
607	  if (gimple_has_lhs (stmt)
608	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
609	      && FLOAT_TYPE_P (TREE_TYPE (def))
610	      && TREE_CODE (def) == SSA_NAME)
611	    execute_cse_reciprocals_1 (&gsi, def);
612	}
613
614      if (optimize_bb_for_size_p (bb))
615        continue;
616
617      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
618      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
619	   gsi_next (&gsi))
620        {
621	  gimple *stmt = gsi_stmt (gsi);
622
623	  if (is_gimple_assign (stmt)
624	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
625	    {
626	      tree arg1 = gimple_assign_rhs2 (stmt);
627	      gimple *stmt1;
628
629	      if (TREE_CODE (arg1) != SSA_NAME)
630		continue;
631
632	      stmt1 = SSA_NAME_DEF_STMT (arg1);
633
634	      if (is_gimple_call (stmt1)
635		  && gimple_call_lhs (stmt1))
636		{
637		  bool fail;
638		  imm_use_iterator ui;
639		  use_operand_p use_p;
640		  tree fndecl = NULL_TREE;
641
642		  gcall *call = as_a <gcall *> (stmt1);
643		  internal_fn ifn = internal_fn_reciprocal (call);
644		  if (ifn == IFN_LAST)
645		    {
646		      fndecl = gimple_call_fndecl (call);
647		      if (!fndecl
648			  || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
649			continue;
650		      fndecl = targetm.builtin_reciprocal (fndecl);
651		      if (!fndecl)
652			continue;
653		    }
654
655		  /* Check that all uses of the SSA name are divisions,
656		     otherwise replacing the defining statement will do
657		     the wrong thing.  */
658		  fail = false;
659		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
660		    {
661		      gimple *stmt2 = USE_STMT (use_p);
662		      if (is_gimple_debug (stmt2))
663			continue;
664		      if (!is_gimple_assign (stmt2)
665			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
666			  || gimple_assign_rhs1 (stmt2) == arg1
667			  || gimple_assign_rhs2 (stmt2) != arg1)
668			{
669			  fail = true;
670			  break;
671			}
672		    }
673		  if (fail)
674		    continue;
675
676		  gimple_replace_ssa_lhs (call, arg1);
677		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
678		    {
679		      auto_vec<tree, 4> args;
680		      for (unsigned int i = 0;
681			   i < gimple_call_num_args (call); i++)
682			args.safe_push (gimple_call_arg (call, i));
683		      gcall *stmt2;
684		      if (ifn == IFN_LAST)
685			stmt2 = gimple_build_call_vec (fndecl, args);
686		      else
687			stmt2 = gimple_build_call_internal_vec (ifn, args);
688		      gimple_call_set_lhs (stmt2, arg1);
689		      if (gimple_vdef (call))
690			{
691			  gimple_set_vdef (stmt2, gimple_vdef (call));
692			  SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
693			}
694		      gimple_set_vuse (stmt2, gimple_vuse (call));
695		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
696		      gsi_replace (&gsi2, stmt2, true);
697		    }
698		  else
699		    {
700		      if (ifn == IFN_LAST)
701			gimple_call_set_fndecl (call, fndecl);
702		      else
703			gimple_call_set_internal_fn (call, ifn);
704		      update_stmt (call);
705		    }
706		  reciprocal_stats.rfuncs_inserted++;
707
708		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
709		    {
710		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
711		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
712		      fold_stmt_inplace (&gsi);
713		      update_stmt (stmt);
714		    }
715		}
716	    }
717	}
718    }
719
720  statistics_counter_event (fun, "reciprocal divs inserted",
721			    reciprocal_stats.rdivs_inserted);
722  statistics_counter_event (fun, "reciprocal functions inserted",
723			    reciprocal_stats.rfuncs_inserted);
724
725  free_dominance_info (CDI_DOMINATORS);
726  free_dominance_info (CDI_POST_DOMINATORS);
727  delete occ_pool;
728  return 0;
729}
730
731} // anon namespace
732
733gimple_opt_pass *
734make_pass_cse_reciprocals (gcc::context *ctxt)
735{
736  return new pass_cse_reciprocals (ctxt);
737}
738
739/* Record an occurrence of statement USE_STMT in the vector of statements
740   STMTS if its basic block is dominated by *TOP_BB, dominates it, or
741   *TOP_BB is not yet initialized.  Returns true if the occurrence was
742   pushed on the vector.  Adjusts *TOP_BB to be the basic block dominating
743   all statements in the vector.  */
744
745static bool
746maybe_record_sincos (vec<gimple *> *stmts,
747		     basic_block *top_bb, gimple *use_stmt)
748{
749  basic_block use_bb = gimple_bb (use_stmt);
750  if (*top_bb
751      && (*top_bb == use_bb
752	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
753    stmts->safe_push (use_stmt);
754  else if (!*top_bb
755	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
756    {
757      stmts->safe_push (use_stmt);
758      *top_bb = use_bb;
759    }
760  else
761    return false;
762
763  return true;
764}
765
766/* Look for sin, cos and cexpi calls with the same argument NAME and
767   create a single call to cexpi, CSEing the result.
768   We first walk over all immediate uses of the argument, collecting in a
769   vector the statements that we can CSE; in a second pass we replace each
770   statement's rhs with a REALPART or IMAGPART expression on the result of
771   the cexpi call, which we insert before the use statement that
772   dominates all other candidates.  */
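
/* For example (a sketch of the effect on the GIMPLE statements):

	c = cos (a);			sincostmp = cexpi (a);
	s = sin (a);	   becomes	c = REALPART_EXPR <sincostmp>;
					s = IMAGPART_EXPR <sincostmp>;  */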
773
774static bool
775execute_cse_sincos_1 (tree name)
776{
777  gimple_stmt_iterator gsi;
778  imm_use_iterator use_iter;
779  tree fndecl, res, type;
780  gimple *def_stmt, *use_stmt, *stmt;
781  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
782  auto_vec<gimple *> stmts;
783  basic_block top_bb = NULL;
784  int i;
785  bool cfg_changed = false;
786
787  type = TREE_TYPE (name);
788  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
789    {
790      if (gimple_code (use_stmt) != GIMPLE_CALL
791	  || !gimple_call_lhs (use_stmt))
792	continue;
793
794      switch (gimple_call_combined_fn (use_stmt))
795	{
796	CASE_CFN_COS:
797	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
798	  break;
799
800	CASE_CFN_SIN:
801	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
802	  break;
803
804	CASE_CFN_CEXPI:
805	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
806	  break;
807
808	default:;
809	}
810    }
811
812  if (seen_cos + seen_sin + seen_cexpi <= 1)
813    return false;
814
815  /* Simply insert cexpi at the beginning of top_bb but not earlier than
816     the name def statement.  */
817  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
818  if (!fndecl)
819    return false;
820  stmt = gimple_build_call (fndecl, 1, name);
821  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
822  gimple_call_set_lhs (stmt, res);
823
824  def_stmt = SSA_NAME_DEF_STMT (name);
825  if (!SSA_NAME_IS_DEFAULT_DEF (name)
826      && gimple_code (def_stmt) != GIMPLE_PHI
827      && gimple_bb (def_stmt) == top_bb)
828    {
829      gsi = gsi_for_stmt (def_stmt);
830      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
831    }
832  else
833    {
834      gsi = gsi_after_labels (top_bb);
835      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
836    }
837  sincos_stats.inserted++;
838
839  /* And adjust the recorded old call sites.  */
840  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
841    {
842      tree rhs = NULL;
843
844      switch (gimple_call_combined_fn (use_stmt))
845	{
846	CASE_CFN_COS:
847	  rhs = fold_build1 (REALPART_EXPR, type, res);
848	  break;
849
850	CASE_CFN_SIN:
851	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
852	  break;
853
854	CASE_CFN_CEXPI:
855	  rhs = res;
856	  break;
857
858	default:;
859	  gcc_unreachable ();
860	}
861
862	/* Replace call with a copy.  */
863	stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
864
865	gsi = gsi_for_stmt (use_stmt);
866	gsi_replace (&gsi, stmt, true);
867	if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
868	  cfg_changed = true;
869    }
870
871  return cfg_changed;
872}
873
874/* To evaluate powi(x,n), the floating point value x raised to the
875   constant integer exponent n, we use a hybrid algorithm that
876   combines the "window method" with look-up tables.  For an
877   introduction to exponentiation algorithms and "addition chains",
878   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
879   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
880   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
881   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
882
883/* Provide a default value for POWI_MAX_MULTS, the maximum number of
884   multiplications to inline before calling the system library's pow
885   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
886   so this default never requires calling pow, powf or powl.  */
887
888#ifndef POWI_MAX_MULTS
889#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
890#endif
891
892/* The size of the "optimal power tree" lookup table.  All
893   exponents less than this value are simply looked up in the
894   powi_table below.  This threshold is also used to size the
895   cache of pseudo registers that hold intermediate results.  */
896#define POWI_TABLE_SIZE 256
897
898/* The size, in bits of the window, used in the "window method"
899   exponentiation algorithm.  This is equivalent to a radix of
900   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
901#define POWI_WINDOW_SIZE 3
902
903/* The following table is an efficient representation of an
904   "optimal power tree".  For each value, i, the corresponding
905   value, j, in the table states that an optimal evaluation
906   sequence for calculating pow(x,i) can be found by evaluating
907   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
908   100 integers is given in Knuth's "Seminumerical algorithms".  */
909
910static const unsigned char powi_table[POWI_TABLE_SIZE] =
911  {
912      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
913      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
914      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
915     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
916     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
917     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
918     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
919     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
920     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
921     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
922     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
923     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
924     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
925     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
926     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
927     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
928     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
929     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
930     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
931     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
932     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
933     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
934     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
935     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
936     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
937    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
938    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
939    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
940    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
941    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
942    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
943    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
944  };
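
/* For instance, powi_table[13] == 10, so x**13 is evaluated as
   x**10 * x**3.  Expanding recursively (x**10 = x**5 * x**5,
   x**5 = x**3 * x**2, x**3 = x**2 * x, x**2 = x * x) takes five
   multiplications in total, which is also the value returned by
   powi_cost (13).  */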
945
946
947/* Return the number of multiplications required to calculate
948   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
949   subroutine of powi_cost.  CACHE is an array indicating
950   which exponents have already been calculated.  */
951
952static int
953powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
954{
955  /* If we've already calculated this exponent, then this evaluation
956     doesn't require any additional multiplications.  */
957  if (cache[n])
958    return 0;
959
960  cache[n] = true;
961  return powi_lookup_cost (n - powi_table[n], cache)
962	 + powi_lookup_cost (powi_table[n], cache) + 1;
963}
964
965/* Return the number of multiplications required to calculate
966   powi(x,n) for an arbitrary x, given the exponent N.  This
967   function needs to be kept in sync with powi_as_mults below.  */
968
969static int
970powi_cost (HOST_WIDE_INT n)
971{
972  bool cache[POWI_TABLE_SIZE];
973  unsigned HOST_WIDE_INT digit;
974  unsigned HOST_WIDE_INT val;
975  int result;
976
977  if (n == 0)
978    return 0;
979
980  /* Ignore the reciprocal when calculating the cost.  */
981  val = (n < 0) ? -n : n;
982
983  /* Initialize the exponent cache.  */
984  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
985  cache[1] = true;
986
987  result = 0;
988
989  while (val >= POWI_TABLE_SIZE)
990    {
991      if (val & 1)
992	{
993	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
994	  result += powi_lookup_cost (digit, cache)
995		    + POWI_WINDOW_SIZE + 1;
996	  val >>= POWI_WINDOW_SIZE;
997	}
998      else
999	{
1000	  val >>= 1;
1001	  result++;
1002	}
1003    }
1004
1005  return result + powi_lookup_cost (val, cache);
1006}
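
/* As an illustration, powi_cost (261) proceeds as follows: 261 is at least
   POWI_TABLE_SIZE and odd, so the low POWI_WINDOW_SIZE bits form the digit
   5, contributing powi_lookup_cost (5) == 3 plus POWI_WINDOW_SIZE + 1 == 4;
   the remaining exponent 261 >> 3 == 32 then costs another 4 through the
   table, for a total of 11 multiplications.  */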
1007
1008/* Recursive subroutine of powi_as_mults.  This function takes the
1009   array CACHE of already calculated powers and an exponent N, and
1010   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
1011
1012static tree
1013powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
1014		 HOST_WIDE_INT n, tree *cache)
1015{
1016  tree op0, op1, ssa_target;
1017  unsigned HOST_WIDE_INT digit;
1018  gassign *mult_stmt;
1019
1020  if (n < POWI_TABLE_SIZE && cache[n])
1021    return cache[n];
1022
1023  ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1024
1025  if (n < POWI_TABLE_SIZE)
1026    {
1027      cache[n] = ssa_target;
1028      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1029      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1030    }
1031  else if (n & 1)
1032    {
1033      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1034      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1035      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1036    }
1037  else
1038    {
1039      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1040      op1 = op0;
1041    }
1042
1043  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1044  gimple_set_location (mult_stmt, loc);
1045  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1046
1047  return ssa_target;
1048}
1049
1050/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1051   This function needs to be kept in sync with powi_cost above.  */
1052
1053static tree
1054powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1055	       tree arg0, HOST_WIDE_INT n)
1056{
1057  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1058  gassign *div_stmt;
1059  tree target;
1060
1061  if (n == 0)
1062    return build_real (type, dconst1);
1063
1064  memset (cache, 0,  sizeof (cache));
1065  cache[1] = arg0;
1066
1067  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1068  if (n >= 0)
1069    return result;
1070
1071  /* If the original exponent was negative, reciprocate the result.  */
1072  target = make_temp_ssa_name (type, NULL, "powmult");
1073  div_stmt = gimple_build_assign (target, RDIV_EXPR,
1074				  build_real (type, dconst1), result);
1075  gimple_set_location (div_stmt, loc);
1076  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1077
1078  return target;
1079}
1080
1081/* ARG0 and N are the two arguments to a powi builtin in GSI with
1082   location info LOC.  If the arguments are appropriate, create an
1083   equivalent sequence of statements prior to GSI using an optimal
1084   number of multiplications, and return an expession holding the
1085   result.  */
1086
1087static tree
1088gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1089			    tree arg0, HOST_WIDE_INT n)
1090{
1091  /* Avoid largest negative number.  */
1092  if (n != -n
1093      && ((n >= -1 && n <= 2)
1094	  || (optimize_function_for_speed_p (cfun)
1095	      && powi_cost (n) <= POWI_MAX_MULTS)))
1096    return powi_as_mults (gsi, loc, arg0, n);
1097
1098  return NULL_TREE;
1099}
1100
1101/* Build a gimple call statement that calls FN with argument ARG.
1102   Set the lhs of the call statement to a fresh SSA name.  Insert the
1103   statement prior to GSI's current position, and return the fresh
1104   SSA name.  */
1105
1106static tree
1107build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1108		       tree fn, tree arg)
1109{
1110  gcall *call_stmt;
1111  tree ssa_target;
1112
1113  call_stmt = gimple_build_call (fn, 1, arg);
1114  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1115  gimple_set_lhs (call_stmt, ssa_target);
1116  gimple_set_location (call_stmt, loc);
1117  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1118
1119  return ssa_target;
1120}
1121
1122/* Build a gimple binary operation with the given CODE and arguments
1123   ARG0, ARG1, assigning the result to a new SSA name based on NAME.
1124   Insert the statement prior to GSI's current position, and
1125   return the fresh SSA name.  */
1126
1127static tree
1128build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1129			const char *name, enum tree_code code,
1130			tree arg0, tree arg1)
1131{
1132  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1133  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1134  gimple_set_location (stmt, loc);
1135  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1136  return result;
1137}
1138
1139/* Build a gimple reference operation with the given CODE and argument
1140   ARG0, assigning the result to a new SSA name of TYPE with NAME.
1141   Insert the statement prior to GSI's current position, and return
1142   the fresh SSA name.  */
1143
1144static inline tree
1145build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1146		      const char *name, enum tree_code code, tree arg0)
1147{
1148  tree result = make_temp_ssa_name (type, NULL, name);
1149  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
1150  gimple_set_location (stmt, loc);
1151  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1152  return result;
1153}
1154
1155/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1156   prior to GSI's current position, and return the fresh SSA name.  */
1157
1158static tree
1159build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1160		       tree type, tree val)
1161{
1162  tree result = make_ssa_name (type);
1163  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1164  gimple_set_location (stmt, loc);
1165  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1166  return result;
1167}
1168
1169struct pow_synth_sqrt_info
1170{
1171  bool *factors;
1172  unsigned int deepest;
1173  unsigned int num_mults;
1174};
1175
1176/* Return true iff the real value C can be represented as a
1177   sum of powers of 0.5 up to N.  That is:
1178   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
1179   Record in INFO the various parameters of the synthesis algorithm such
1180   as the factors a[i], the maximum 0.5 power and the number of
1181   multiplications that will be required.  */
1182
1183bool
1184representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
1185				 struct pow_synth_sqrt_info *info)
1186{
1187  REAL_VALUE_TYPE factor = dconsthalf;
1188  REAL_VALUE_TYPE remainder = c;
1189
1190  info->deepest = 0;
1191  info->num_mults = 0;
1192  memset (info->factors, 0, n * sizeof (bool));
1193
1194  for (unsigned i = 0; i < n; i++)
1195    {
1196      REAL_VALUE_TYPE res;
1197
1198      /* If something inexact happened bail out now.  */
1199      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
1200	return false;
1201
1202      /* We have hit zero.  The number is representable as a sum
1203         of powers of 0.5.  */
1204      if (real_equal (&res, &dconst0))
1205	{
1206	  info->factors[i] = true;
1207	  info->deepest = i + 1;
1208	  return true;
1209	}
1210      else if (!REAL_VALUE_NEGATIVE (res))
1211	{
1212	  remainder = res;
1213	  info->factors[i] = true;
1214	  info->num_mults++;
1215	}
1216      else
1217	info->factors[i] = false;
1218
1219      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
1220    }
1221  return false;
1222}
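
/* For example, with C == 0.625 and N == 3 the loop above finds
   0.625 == 0.5**1 + 0.5**3, so the function returns true with
   factors == {1, 0, 1}, deepest == 3 and num_mults == 1 (one
   multiplication combines sqrt (x) with sqrt (sqrt (sqrt (x)))).  */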
1223
1224/* Return the tree corresponding to FN being applied
1225   to ARG N times at GSI and LOC.
1226   Look up previous results from CACHE if need be.
1227   cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1228
1229static tree
1230get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1231	      tree fn, location_t loc, tree *cache)
1232{
1233  tree res = cache[n];
1234  if (!res)
1235    {
1236      tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1237      res = build_and_insert_call (gsi, loc, fn, prev);
1238      cache[n] = res;
1239    }
1240
1241  return res;
1242}
1243
1244/* Print to STREAM the repeated application of function FNAME to ARG
1245   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1246   "foo (foo (x))".  */
1247
1248static void
1249print_nested_fn (FILE* stream, const char *fname, const char* arg,
1250		 unsigned int n)
1251{
1252  if (n == 0)
1253    fprintf (stream, "%s", arg);
1254  else
1255    {
1256      fprintf (stream, "%s (", fname);
1257      print_nested_fn (stream, fname, arg, n - 1);
1258      fprintf (stream, ")");
1259    }
1260}
1261
1262/* Print to STREAM the fractional sequence of sqrt chains
1263   applied to ARG, described by INFO.  Used for the dump file.  */
1264
1265static void
1266dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1267			        struct pow_synth_sqrt_info *info)
1268{
1269  for (unsigned int i = 0; i < info->deepest; i++)
1270    {
1271      bool is_set = info->factors[i];
1272      if (is_set)
1273	{
1274	  print_nested_fn (stream, "sqrt", arg, i + 1);
1275	  if (i != info->deepest - 1)
1276	    fprintf (stream, " * ");
1277	}
1278    }
1279}
1280
1281/* Print to STREAM a representation of raising ARG to an integer
1282   power N.  Used for the dump file.  */
1283
1284static void
1285dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1286{
1287  if (n > 1)
1288    fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1289  else if (n == 1)
1290    fprintf (stream, "%s", arg);
1291}
1292
1293/* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1294   square roots.  Place at GSI and LOC.  Limit the maximum depth
1295   of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1296   result of the expanded sequence or NULL_TREE if the expansion failed.
1297
1298   This routine assumes that ARG1 is a real number with a fractional part
1299   (the integer exponent case will have been handled earlier in
1300   gimple_expand_builtin_pow).
1301
1302   For ARG1 > 0.0:
1303   * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1304     FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1305                    FRAC_PART == ARG1 - WHOLE_PART:
1306     Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1307     POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1308     if it can be expressed as such, that is if FRAC_PART satisfies:
1309     FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1310     where integer a[i] is either 0 or 1.
1311
1312     Example:
1313     POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1314       --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1315
1316   For ARG1 < 0.0 there are two approaches:
1317   * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1318         is calculated as above.
1319
1320     Example:
1321     POW (x, -5.625) == 1.0 / POW (x, 5.625)
1322       -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1323
1324   * (B) : WHOLE_PART := - ceil (abs (ARG1))
1325           FRAC_PART  := ARG1 - WHOLE_PART
1326     and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1327     Example:
1328     POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1329       --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1330
1331   For ARG1 < 0.0 we choose between (A) and (B) depending on
1332   how many multiplications we'd have to do.
1333   So, for the example in (B): POW (x, -5.875), if we were to
1334   follow algorithm (A) we would produce:
1335   1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1336   which contains more multiplications than approach (B).
1337
1338   Hopefully, this approach will eliminate potentially expensive POW library
1339   calls when unsafe floating point math is enabled and allow the compiler to
1340   further optimise the multiplies, square roots and divides produced by this
1341   function.  */
1342
1343static tree
1344expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1345		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1346{
1347  tree type = TREE_TYPE (arg0);
1348  machine_mode mode = TYPE_MODE (type);
1349  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1350  bool one_over = true;
1351
1352  if (!sqrtfn)
1353    return NULL_TREE;
1354
1355  if (TREE_CODE (arg1) != REAL_CST)
1356    return NULL_TREE;
1357
1358  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1359
1360  gcc_assert (max_depth > 0);
1361  tree *cache = XALLOCAVEC (tree, max_depth + 1);
1362
1363  struct pow_synth_sqrt_info synth_info;
1364  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1365  synth_info.deepest = 0;
1366  synth_info.num_mults = 0;
1367
1368  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1369  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1370
1371  /* The whole and fractional parts of exp.  */
1372  REAL_VALUE_TYPE whole_part;
1373  REAL_VALUE_TYPE frac_part;
1374
1375  real_floor (&whole_part, mode, &exp);
1376  real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1377
1378
1379  REAL_VALUE_TYPE ceil_whole = dconst0;
1380  REAL_VALUE_TYPE ceil_fract = dconst0;
1381
1382  if (neg_exp)
1383    {
1384      real_ceil (&ceil_whole, mode, &exp);
1385      real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1386    }
1387
1388  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1389    return NULL_TREE;
1390
1391  /* Check whether it's more profitable to not use 1.0 / ...  */
1392  if (neg_exp)
1393    {
1394      struct pow_synth_sqrt_info alt_synth_info;
1395      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1396      alt_synth_info.deepest = 0;
1397      alt_synth_info.num_mults = 0;
1398
1399      if (representable_as_half_series_p (ceil_fract, max_depth,
1400					   &alt_synth_info)
1401	  && alt_synth_info.deepest <= synth_info.deepest
1402	  && alt_synth_info.num_mults < synth_info.num_mults)
1403	{
1404	  whole_part = ceil_whole;
1405	  frac_part = ceil_fract;
1406	  synth_info.deepest = alt_synth_info.deepest;
1407	  synth_info.num_mults = alt_synth_info.num_mults;
1408	  memcpy (synth_info.factors, alt_synth_info.factors,
1409		  (max_depth + 1) * sizeof (bool));
1410	  one_over = false;
1411	}
1412    }
1413
1414  HOST_WIDE_INT n = real_to_integer (&whole_part);
1415  REAL_VALUE_TYPE cint;
1416  real_from_integer (&cint, VOIDmode, n, SIGNED);
1417
1418  if (!real_identical (&whole_part, &cint))
1419    return NULL_TREE;
1420
1421  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1422    return NULL_TREE;
1423
1424  memset (cache, 0, (max_depth + 1) * sizeof (tree));
1425
1426  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1427
1428  /* Calculate the integer part of the exponent.  */
1429  if (n > 1)
1430    {
1431      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1432      if (!integer_res)
1433	return NULL_TREE;
1434    }
1435
1436  if (dump_file)
1437    {
1438      char string[64];
1439
1440      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1441      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1442
1443      if (neg_exp)
1444	{
1445	  if (one_over)
1446	    {
1447	      fprintf (dump_file, "1.0 / (");
1448	      dump_integer_part (dump_file, "x", n);
1449	      if (n > 0)
1450	        fprintf (dump_file, " * ");
1451	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1452	      fprintf (dump_file, ")");
1453	    }
1454	  else
1455	    {
1456	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1457	      fprintf (dump_file, " / (");
1458	      dump_integer_part (dump_file, "x", n);
1459	      fprintf (dump_file, ")");
1460	    }
1461	}
1462      else
1463	{
1464	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1465	  if (n > 0)
1466	    fprintf (dump_file, " * ");
1467	  dump_integer_part (dump_file, "x", n);
1468	}
1469
1470      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1471    }
1472
1473
1474  tree fract_res = NULL_TREE;
1475  cache[0] = arg0;
1476
1477  /* Calculate the fractional part of the exponent.  */
1478  for (unsigned i = 0; i < synth_info.deepest; i++)
1479    {
1480      if (synth_info.factors[i])
1481	{
1482	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1483
1484	  if (!fract_res)
1485	      fract_res = sqrt_chain;
1486
1487	  else
1488	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1489					   fract_res, sqrt_chain);
1490	}
1491    }
1492
1493  tree res = NULL_TREE;
1494
1495  if (neg_exp)
1496    {
1497      if (one_over)
1498	{
1499	  if (n > 0)
1500	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1501					   fract_res, integer_res);
1502	  else
1503	    res = fract_res;
1504
1505	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1506					  build_real (type, dconst1), res);
1507	}
1508      else
1509	{
1510	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1511					 fract_res, integer_res);
1512	}
1513    }
1514  else
1515    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1516				   fract_res, integer_res);
1517  return res;
1518}
1519
1520/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1521   with location info LOC.  If possible, create an equivalent and
1522   less expensive sequence of statements prior to GSI, and return an
1523   expression holding the result.  */
1524
1525static tree
1526gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1527			   tree arg0, tree arg1)
1528{
1529  REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1530  REAL_VALUE_TYPE c2, dconst3;
1531  HOST_WIDE_INT n;
1532  tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1533  machine_mode mode;
1534  bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1535  bool hw_sqrt_exists, c_is_int, c2_is_int;
1536
1537  dconst1_4 = dconst1;
1538  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1539
1540  /* If the exponent isn't a constant, there's nothing of interest
1541     to be done.  */
1542  if (TREE_CODE (arg1) != REAL_CST)
1543    return NULL_TREE;
1544
1545  /* Don't perform the operation if flag_signaling_nans is on
1546     and the operand is a signaling NaN.  */
1547  if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1548      && ((TREE_CODE (arg0) == REAL_CST
1549	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1550	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1551    return NULL_TREE;
1552
1553  /* If the exponent is equivalent to an integer, expand to an optimal
1554     multiplication sequence when profitable.  */
1555  c = TREE_REAL_CST (arg1);
1556  n = real_to_integer (&c);
1557  real_from_integer (&cint, VOIDmode, n, SIGNED);
1558  c_is_int = real_identical (&c, &cint);
1559
1560  if (c_is_int
1561      && ((n >= -1 && n <= 2)
1562	  || (flag_unsafe_math_optimizations
1563	      && speed_p
1564	      && powi_cost (n) <= POWI_MAX_MULTS)))
1565    return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1566
1567  /* Attempt various optimizations using sqrt and cbrt.  */
1568  type = TREE_TYPE (arg0);
1569  mode = TYPE_MODE (type);
1570  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1571
1572  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1573     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1574     sqrt(-0) = -0.  */
1575  if (sqrtfn
1576      && real_equal (&c, &dconsthalf)
1577      && !HONOR_SIGNED_ZEROS (mode))
1578    return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1579
1580  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1581
1582  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1583     optimizations since 1./3. is not exactly representable.  If x
1584     is negative and finite, the correct value of pow(x,1./3.) is
1585     a NaN with the "invalid" exception raised, because the value
1586     of 1./3. actually has an even denominator.  The correct value
1587     of cbrt(x) is a negative real value.  */
1588  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1589  dconst1_3 = real_value_truncate (mode, dconst_third ());
1590
1591  if (flag_unsafe_math_optimizations
1592      && cbrtfn
1593      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1594      && real_equal (&c, &dconst1_3))
1595    return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1596
1597  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1598     if we don't have a hardware sqrt insn.  */
1599  dconst1_6 = dconst1_3;
1600  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1601
1602  if (flag_unsafe_math_optimizations
1603      && sqrtfn
1604      && cbrtfn
1605      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1606      && speed_p
1607      && hw_sqrt_exists
1608      && real_equal (&c, &dconst1_6))
1609    {
1610      /* sqrt(x)  */
1611      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1612
1613      /* cbrt(sqrt(x))  */
1614      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1615    }
1616
1617
1618  /* Attempt to expand the POW as a product of square root chains.
1619     Expand the 0.25 case even when optimising for size.  */
1620  if (flag_unsafe_math_optimizations
1621      && sqrtfn
1622      && hw_sqrt_exists
1623      && (speed_p || real_equal (&c, &dconst1_4))
1624      && !HONOR_SIGNED_ZEROS (mode))
1625    {
1626      unsigned int max_depth = speed_p
1627				? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
1628				: 2;
1629
1630      tree expand_with_sqrts
1631	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1632
1633      if (expand_with_sqrts)
1634	return expand_with_sqrts;
1635    }
1636
1637  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1638  n = real_to_integer (&c2);
1639  real_from_integer (&cint, VOIDmode, n, SIGNED);
1640  c2_is_int = real_identical (&c2, &cint);
1641
1642  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1643
1644     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1645     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1646
1647     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1648     different from pow(x, 1./3.) due to rounding and behavior with
1649     negative x, we need to constrain this transformation to unsafe
1650     math and positive x or finite math.  */
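  /* For instance, c = 2./3. gives n = 2, producing cbrt (x) * cbrt (x),
     while c = -2./3. gives 1.0 / (cbrt (x) * cbrt (x)).  */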
1651  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1652  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1653  real_round (&c2, mode, &c2);
1654  n = real_to_integer (&c2);
1655  real_from_integer (&cint, VOIDmode, n, SIGNED);
1656  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1657  real_convert (&c2, mode, &c2);
1658
1659  if (flag_unsafe_math_optimizations
1660      && cbrtfn
1661      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1662      && real_identical (&c2, &c)
1663      && !c2_is_int
1664      && optimize_function_for_speed_p (cfun)
1665      && powi_cost (n / 3) <= POWI_MAX_MULTS)
1666    {
1667      tree powi_x_ndiv3 = NULL_TREE;
1668
1669      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1670         possible or profitable, give up.  Skip the degenerate case when
1671         abs(n) < 3, where the result is always 1.  */
1672      if (absu_hwi (n) >= 3)
1673	{
1674	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1675						     abs_hwi (n / 3));
1676	  if (!powi_x_ndiv3)
1677	    return NULL_TREE;
1678	}
1679
1680      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1681         as that creates an unnecessary variable.  Instead, just produce
1682         either cbrt(x) or cbrt(x) * cbrt(x).  */
1683      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1684
1685      if (absu_hwi (n) % 3 == 1)
1686	powi_cbrt_x = cbrt_x;
1687      else
1688	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1689					      cbrt_x, cbrt_x);
1690
1691      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1692      if (absu_hwi (n) < 3)
1693	result = powi_cbrt_x;
1694      else
1695	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1696					 powi_x_ndiv3, powi_cbrt_x);
1697
1698      /* If n is negative, reciprocate the result.  */
1699      if (n < 0)
1700	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1701					 build_real (type, dconst1), result);
1702
1703      return result;
1704    }
1705
1706  /* No optimizations succeeded.  */
1707  return NULL_TREE;
1708}
1709
1710/* ARG is the argument to a cabs builtin call in GSI with location info
1711   LOC.  Create a sequence of statements prior to GSI that calculates
1712   sqrt(R*R + I*I), where R and I are the real and imaginary components
1713   of ARG, respectively.  Return an expression holding the result.  */
1714
1715static tree
1716gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1717{
1718  tree real_part, imag_part, addend1, addend2, sum, result;
1719  tree type = TREE_TYPE (TREE_TYPE (arg));
1720  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1721  machine_mode mode = TYPE_MODE (type);
1722
1723  if (!flag_unsafe_math_optimizations
1724      || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1725      || !sqrtfn
1726      || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1727    return NULL_TREE;
1728
1729  real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1730				    REALPART_EXPR, arg);
1731  addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1732				    real_part, real_part);
1733  imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1734				    IMAGPART_EXPR, arg);
1735  addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1736				    imag_part, imag_part);
1737  sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1738  result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1739
1740  return result;
1741}
1742
1743/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1744   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1745   an optimal number of multiplies, when n is a constant.  */
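
/* For instance, powi (x, 6) needs only three multiplies:
   t = x * x; t3 = t * x; result = t3 * t3.  */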
1746
1747namespace {
1748
1749const pass_data pass_data_cse_sincos =
1750{
1751  GIMPLE_PASS, /* type */
1752  "sincos", /* name */
1753  OPTGROUP_NONE, /* optinfo_flags */
1754  TV_NONE, /* tv_id */
1755  PROP_ssa, /* properties_required */
1756  PROP_gimple_opt_math, /* properties_provided */
1757  0, /* properties_destroyed */
1758  0, /* todo_flags_start */
1759  TODO_update_ssa, /* todo_flags_finish */
1760};
1761
1762class pass_cse_sincos : public gimple_opt_pass
1763{
1764public:
1765  pass_cse_sincos (gcc::context *ctxt)
1766    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1767  {}
1768
1769  /* opt_pass methods: */
1770  virtual bool gate (function *)
1771    {
1772      /* We no longer require either sincos or cexp, since powi expansion
1773	 piggybacks on this pass.  */
1774      return optimize;
1775    }
1776
1777  virtual unsigned int execute (function *);
1778
1779}; // class pass_cse_sincos
1780
1781unsigned int
1782pass_cse_sincos::execute (function *fun)
1783{
1784  basic_block bb;
1785  bool cfg_changed = false;
1786
1787  calculate_dominance_info (CDI_DOMINATORS);
1788  memset (&sincos_stats, 0, sizeof (sincos_stats));
1789
1790  FOR_EACH_BB_FN (bb, fun)
1791    {
1792      gimple_stmt_iterator gsi;
1793      bool cleanup_eh = false;
1794
1795      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1796        {
1797	  gimple *stmt = gsi_stmt (gsi);
1798
1799	  /* Only the last stmt in a bb could throw, no need to call
1800	     gimple_purge_dead_eh_edges if we change something in the middle
1801	     of a basic block.  */
1802	  cleanup_eh = false;
1803
1804	  if (is_gimple_call (stmt)
1805	      && gimple_call_lhs (stmt))
1806	    {
1807	      tree arg, arg0, arg1, result;
1808	      HOST_WIDE_INT n;
1809	      location_t loc;
1810
1811	      switch (gimple_call_combined_fn (stmt))
1812		{
1813		CASE_CFN_COS:
1814		CASE_CFN_SIN:
1815		CASE_CFN_CEXPI:
1816		  /* Make sure we have either sincos or cexp.  */
1817		  if (!targetm.libc_has_function (function_c99_math_complex)
1818		      && !targetm.libc_has_function (function_sincos))
1819		    break;
1820
1821		  arg = gimple_call_arg (stmt, 0);
1822		  if (TREE_CODE (arg) == SSA_NAME)
1823		    cfg_changed |= execute_cse_sincos_1 (arg);
1824		  break;
1825
1826		CASE_CFN_POW:
1827		  arg0 = gimple_call_arg (stmt, 0);
1828		  arg1 = gimple_call_arg (stmt, 1);
1829
1830		  loc = gimple_location (stmt);
1831		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1832
1833		  if (result)
1834		    {
1835		      tree lhs = gimple_get_lhs (stmt);
1836		      gassign *new_stmt = gimple_build_assign (lhs, result);
1837		      gimple_set_location (new_stmt, loc);
1838		      unlink_stmt_vdef (stmt);
1839		      gsi_replace (&gsi, new_stmt, true);
1840		      cleanup_eh = true;
1841		      if (gimple_vdef (stmt))
1842			release_ssa_name (gimple_vdef (stmt));
1843		    }
1844		  break;
1845
1846		CASE_CFN_POWI:
1847		  arg0 = gimple_call_arg (stmt, 0);
1848		  arg1 = gimple_call_arg (stmt, 1);
1849		  loc = gimple_location (stmt);
1850
1851		  if (real_minus_onep (arg0))
1852		    {
1853                      tree t0, t1, cond, one, minus_one;
1854		      gassign *stmt;
1855
1856		      t0 = TREE_TYPE (arg0);
1857		      t1 = TREE_TYPE (arg1);
1858		      one = build_real (t0, dconst1);
1859		      minus_one = build_real (t0, dconstm1);
1860
1861		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1862		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1863						  arg1, build_int_cst (t1, 1));
1864		      gimple_set_location (stmt, loc);
1865		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1866
1867		      result = make_temp_ssa_name (t0, NULL, "powi");
1868		      stmt = gimple_build_assign (result, COND_EXPR, cond,
1869						  minus_one, one);
1870		      gimple_set_location (stmt, loc);
1871		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1872		    }
1873		  else
1874		    {
1875		      if (!tree_fits_shwi_p (arg1))
1876			break;
1877
1878		      n = tree_to_shwi (arg1);
1879		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1880		    }
1881
1882		  if (result)
1883		    {
1884		      tree lhs = gimple_get_lhs (stmt);
1885		      gassign *new_stmt = gimple_build_assign (lhs, result);
1886		      gimple_set_location (new_stmt, loc);
1887		      unlink_stmt_vdef (stmt);
1888		      gsi_replace (&gsi, new_stmt, true);
1889		      cleanup_eh = true;
1890		      if (gimple_vdef (stmt))
1891			release_ssa_name (gimple_vdef (stmt));
1892		    }
1893		  break;
1894
1895		CASE_CFN_CABS:
1896		  arg0 = gimple_call_arg (stmt, 0);
1897		  loc = gimple_location (stmt);
1898		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1899
1900		  if (result)
1901		    {
1902		      tree lhs = gimple_get_lhs (stmt);
1903		      gassign *new_stmt = gimple_build_assign (lhs, result);
1904		      gimple_set_location (new_stmt, loc);
1905		      unlink_stmt_vdef (stmt);
1906		      gsi_replace (&gsi, new_stmt, true);
1907		      cleanup_eh = true;
1908		      if (gimple_vdef (stmt))
1909			release_ssa_name (gimple_vdef (stmt));
1910		    }
1911		  break;
1912
1913		default:;
1914		}
1915	    }
1916	}
1917      if (cleanup_eh)
1918	cfg_changed |= gimple_purge_dead_eh_edges (bb);
1919    }
1920
1921  statistics_counter_event (fun, "sincos statements inserted",
1922			    sincos_stats.inserted);
1923
1924  return cfg_changed ? TODO_cleanup_cfg : 0;
1925}
1926
1927} // anon namespace
1928
1929gimple_opt_pass *
1930make_pass_cse_sincos (gcc::context *ctxt)
1931{
1932  return new pass_cse_sincos (ctxt);
1933}
1934
1935/* A symbolic number structure is used to detect byte permutation and selection
1936   patterns of a source.  To achieve that, its field N contains an artificial
1937   number consisting of BITS_PER_MARKER sized markers tracking where each
1938   byte comes from in the source:
1939
1940   0	   - target byte has the value 0
1941   FF	   - target byte has an unknown value (e.g. due to sign extension)
1942   1..size - marker value is the byte index in the source (0 for lsb).
1943
1944   To detect permutations on memory sources (arrays and structures), a symbolic
1945   number also records:
1946   - a base address BASE_ADDR and an OFFSET giving the address of the source;
1947   - a range which gives the difference between the highest and lowest accessed
1948     memory location to make such a symbolic number;
1949   - the address SRC of the source element of lowest address as a convenience
1950     to easily get BASE_ADDR + offset + lowest bytepos.
1951
1952   Note 1: the range is different from size as size reflects the size of the
1953   type of the current expression.  For instance, for an array char a[],
1954   (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1955   (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1956   time a range of 1.
1957
1958   Note 2: for non-memory sources, range holds the same value as size.
1959
1960   Note 3: SRC points to the SSA_NAME in case of non-memory source.  */
1961
1962struct symbolic_number {
1963  uint64_t n;
1964  tree type;
1965  tree base_addr;
1966  tree offset;
1967  HOST_WIDE_INT bytepos;
1968  tree src;
1969  tree alias_set;
1970  tree vuse;
1971  unsigned HOST_WIDE_INT range;
1972};
1973
1974#define BITS_PER_MARKER 8
1975#define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1976#define MARKER_BYTE_UNKNOWN MARKER_MASK
1977#define HEAD_MARKER(n, size) \
1978  ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1979
1980/* The number which the find_bswap_or_nop_1 result should match in
1981   order to have a nop.  The number is masked according to the size of
1982   the symbolic number before using it.  */
1983#define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1984  (uint64_t)0x08070605 << 32 | 0x04030201)
1985
1986/* The number which the find_bswap_or_nop_1 result should match in
1987   order to have a byte swap.  The number is masked according to the
1988   size of the symbolic number before using it.  */
1989#define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1990  (uint64_t)0x01020304 << 32 | 0x05060708)
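
/* For example, for a 4-byte symbolic number these constants reduce to
   0x04030201 (CMPNOP: each result byte comes from the same position in
   the source) and 0x01020304 (CMPXCHG: the byte order is fully
   reversed).  */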
1991
1992/* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1993   number N.  Return false if the requested operation is not permitted
1994   on a symbolic number.  */
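
/* For example, for a 4-byte value the symbolic number starts out as
   0x04030201; an LSHIFT_EXPR by 8 bits turns it into 0x03020100: the low
   result byte is now known to be zero and the remaining source bytes each
   move up one position.  */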
1995
1996static inline bool
1997do_shift_rotate (enum tree_code code,
1998		 struct symbolic_number *n,
1999		 int count)
2000{
2001  int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2002  unsigned head_marker;
2003
2004  if (count % BITS_PER_UNIT != 0)
2005    return false;
2006  count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2007
2008  /* Zero out the extra bits of N in order to avoid them being shifted
2009     into the significant bits.  */
2010  if (size < 64 / BITS_PER_MARKER)
2011    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2012
2013  switch (code)
2014    {
2015    case LSHIFT_EXPR:
2016      n->n <<= count;
2017      break;
2018    case RSHIFT_EXPR:
2019      head_marker = HEAD_MARKER (n->n, size);
2020      n->n >>= count;
2021      /* Arithmetic shift of signed type: result is dependent on the value.  */
2022      if (!TYPE_UNSIGNED (n->type) && head_marker)
2023	for (i = 0; i < count / BITS_PER_MARKER; i++)
2024	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2025		  << ((size - 1 - i) * BITS_PER_MARKER);
2026      break;
2027    case LROTATE_EXPR:
2028      n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
2029      break;
2030    case RROTATE_EXPR:
2031      n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
2032      break;
2033    default:
2034      return false;
2035    }
2036  /* Zero unused bits for size.  */
2037  if (size < 64 / BITS_PER_MARKER)
2038    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2039  return true;
2040}
2041
2042/* Perform sanity checking for the symbolic number N and the gimple
2043   statement STMT.  */
2044
2045static inline bool
2046verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt)
2047{
2048  tree lhs_type;
2049
2050  lhs_type = gimple_expr_type (stmt);
2051
2052  if (TREE_CODE (lhs_type) != INTEGER_TYPE)
2053    return false;
2054
2055  if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
2056    return false;
2057
2058  return true;
2059}
2060
2061/* Initialize the symbolic number N for the bswap pass from the base element
2062   SRC manipulated by the bitwise OR expression.  */
2063
2064static bool
2065init_symbolic_number (struct symbolic_number *n, tree src)
2066{
2067  int size;
2068
2069  if (! INTEGRAL_TYPE_P (TREE_TYPE (src)))
2070    return false;
2071
2072  n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
2073  n->src = src;
2074
2075  /* Set up the symbolic number N by setting each byte marker to a value
2076     between 1 and the byte size of SRC.  The highest order byte is set to
2077     that size and the lowest order byte to 1.  */
2078  n->type = TREE_TYPE (src);
2079  size = TYPE_PRECISION (n->type);
2080  if (size % BITS_PER_UNIT != 0)
2081    return false;
2082  size /= BITS_PER_UNIT;
2083  if (size > 64 / BITS_PER_MARKER)
2084    return false;
2085  n->range = size;
2086  n->n = CMPNOP;
2087
2088  if (size < 64 / BITS_PER_MARKER)
2089    n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
2090
2091  return true;
2092}
2093
2094/* Check whether STMT might be a byte swap or a nop from a memory source and
2095   return the answer.  If so, REF is that memory source, and the base of
2096   the accessed memory area and the access offset from it are recorded in N.  */
2097
2098bool
2099find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n)
2100{
2101  /* A leaf node is an array or component ref.  Memorize its base and
2102     offset from that base to compare against other such leaf nodes.  */
2103  HOST_WIDE_INT bitsize, bitpos;
2104  machine_mode mode;
2105  int unsignedp, reversep, volatilep;
2106  tree offset, base_addr;
2107
2108  /* Not prepared to handle PDP endian.  */
2109  if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2110    return false;
2111
2112  if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2113    return false;
2114
2115  base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2116				   &unsignedp, &reversep, &volatilep);
2117
2118  if (TREE_CODE (base_addr) == MEM_REF)
2119    {
2120      offset_int bit_offset = 0;
2121      tree off = TREE_OPERAND (base_addr, 1);
2122
2123      if (!integer_zerop (off))
2124	{
2125	  offset_int boff, coff = mem_ref_offset (base_addr);
2126	  boff = coff << LOG2_BITS_PER_UNIT;
2127	  bit_offset += boff;
2128	}
2129
2130      base_addr = TREE_OPERAND (base_addr, 0);
2131
2132      /* Avoid returning a negative bitpos as this may wreak havoc later.  */
2133      if (wi::neg_p (bit_offset))
2134	{
2135	  offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2136	  offset_int tem = bit_offset.and_not (mask);
2137	  /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2138	     Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
2139	  bit_offset -= tem;
2140	  tem >>= LOG2_BITS_PER_UNIT;
2141	  if (offset)
2142	    offset = size_binop (PLUS_EXPR, offset,
2143				    wide_int_to_tree (sizetype, tem));
2144	  else
2145	    offset = wide_int_to_tree (sizetype, tem);
2146	}
2147
2148      bitpos += bit_offset.to_shwi ();
2149    }
2150
2151  if (bitpos % BITS_PER_UNIT)
2152    return false;
2153  if (bitsize % BITS_PER_UNIT)
2154    return false;
2155  if (reversep)
2156    return false;
2157
2158  if (!init_symbolic_number (n, ref))
2159    return false;
2160  n->base_addr = base_addr;
2161  n->offset = offset;
2162  n->bytepos = bitpos / BITS_PER_UNIT;
2163  n->alias_set = reference_alias_ptr_type (ref);
2164  n->vuse = gimple_vuse (stmt);
2165  return true;
2166}
2167
2168/* Compute the symbolic number N representing the result of a bitwise OR on the
2169   two symbolic numbers N1 and N2 whose source statements are respectively
2170   SOURCE_STMT1 and SOURCE_STMT2.  */
2171
2172static gimple *
2173perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2174			gimple *source_stmt2, struct symbolic_number *n2,
2175			struct symbolic_number *n)
2176{
2177  int i, size;
2178  uint64_t mask;
2179  gimple *source_stmt;
2180  struct symbolic_number *n_start;
2181
2182  tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2183  if (TREE_CODE (rhs1) == BIT_FIELD_REF
2184      && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2185    rhs1 = TREE_OPERAND (rhs1, 0);
2186  tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2187  if (TREE_CODE (rhs2) == BIT_FIELD_REF
2188      && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2189    rhs2 = TREE_OPERAND (rhs2, 0);
2190
2191  /* The sources are different: cancel the bswap if they are not memory
2192     locations with the same base (array, structure, ...).  */
2193  if (rhs1 != rhs2)
2194    {
2195      uint64_t inc;
2196      HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
2197      struct symbolic_number *toinc_n_ptr, *n_end;
2198      basic_block bb1, bb2;
2199
2200      if (!n1->base_addr || !n2->base_addr
2201	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
2202	return NULL;
2203
2204      if (!n1->offset != !n2->offset
2205	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
2206	return NULL;
2207
2208      if (n1->bytepos < n2->bytepos)
2209	{
2210	  n_start = n1;
2211	  start_sub = n2->bytepos - n1->bytepos;
2212	}
2213      else
2214	{
2215	  n_start = n2;
2216	  start_sub = n1->bytepos - n2->bytepos;
2217	}
2218
2219      bb1 = gimple_bb (source_stmt1);
2220      bb2 = gimple_bb (source_stmt2);
2221      if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
2222	source_stmt = source_stmt1;
2223      else
2224	source_stmt = source_stmt2;
2225
2226      /* Find the highest address at which a load is performed and
2227	 compute related info.  */
2228      end1 = n1->bytepos + (n1->range - 1);
2229      end2 = n2->bytepos + (n2->range - 1);
2230      if (end1 < end2)
2231	{
2232	  end = end2;
2233	  end_sub = end2 - end1;
2234	}
2235      else
2236	{
2237	  end = end1;
2238	  end_sub = end1 - end2;
2239	}
2240      n_end = (end2 > end1) ? n2 : n1;
2241
2242      /* Find symbolic number whose lsb is the most significant.  */
2243      if (BYTES_BIG_ENDIAN)
2244	toinc_n_ptr = (n_end == n1) ? n2 : n1;
2245      else
2246	toinc_n_ptr = (n_start == n1) ? n2 : n1;
2247
2248      n->range = end - n_start->bytepos + 1;
2249
2250      /* Check that the range of memory covered can be represented by
2251	 a symbolic number.  */
2252      if (n->range > 64 / BITS_PER_MARKER)
2253	return NULL;
2254
2255      /* Reinterpret byte marks in symbolic number holding the value of
2256	 bigger weight according to target endianness.  */
2257      inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
2258      size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
2259      for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
2260	{
2261	  unsigned marker
2262	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
2263	  if (marker && marker != MARKER_BYTE_UNKNOWN)
2264	    toinc_n_ptr->n += inc;
2265	}
2266    }
2267  else
2268    {
2269      n->range = n1->range;
2270      n_start = n1;
2271      source_stmt = source_stmt1;
2272    }
2273
2274  if (!n1->alias_set
2275      || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
2276    n->alias_set = n1->alias_set;
2277  else
2278    n->alias_set = ptr_type_node;
2279  n->vuse = n_start->vuse;
2280  n->base_addr = n_start->base_addr;
2281  n->offset = n_start->offset;
2282  n->src = n_start->src;
2283  n->bytepos = n_start->bytepos;
2284  n->type = n_start->type;
2285  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2286
2287  for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
2288    {
2289      uint64_t masked1, masked2;
2290
2291      masked1 = n1->n & mask;
2292      masked2 = n2->n & mask;
2293      if (masked1 && masked2 && masked1 != masked2)
2294	return NULL;
2295    }
2296  n->n = n1->n | n2->n;
2297
2298  return source_stmt;
2299}
2300
2301/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
2302   the operation given by the rhs of STMT on the result.  If the operation
2303   can be performed successfully, the function returns a gimple stmt whose
2304   rhs's first tree is the expression of the source operand; otherwise it
2305   returns NULL.  */
2306
2307static gimple *
2308find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit)
2309{
2310  enum tree_code code;
2311  tree rhs1, rhs2 = NULL;
2312  gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1;
2313  enum gimple_rhs_class rhs_class;
2314
2315  if (!limit || !is_gimple_assign (stmt))
2316    return NULL;
2317
2318  rhs1 = gimple_assign_rhs1 (stmt);
2319
2320  if (find_bswap_or_nop_load (stmt, rhs1, n))
2321    return stmt;
2322
2323  /* Handle BIT_FIELD_REF.  */
2324  if (TREE_CODE (rhs1) == BIT_FIELD_REF
2325      && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2326    {
2327      unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1));
2328      unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2));
2329      if (bitpos % BITS_PER_UNIT == 0
2330	  && bitsize % BITS_PER_UNIT == 0
2331	  && init_symbolic_number (n, TREE_OPERAND (rhs1, 0)))
2332	{
2333	  /* Handle big-endian bit numbering in BIT_FIELD_REF.  */
2334	  if (BYTES_BIG_ENDIAN)
2335	    bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize;
2336
2337	  /* Shift.  */
2338	  if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos))
2339	    return NULL;
2340
2341	  /* Mask.  */
2342	  uint64_t mask = 0;
2343	  uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2344	  for (unsigned i = 0; i < bitsize / BITS_PER_UNIT;
2345	       i++, tmp <<= BITS_PER_UNIT)
2346	    mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2347	  n->n &= mask;
2348
2349	  /* Convert.  */
2350	  n->type = TREE_TYPE (rhs1);
2351	  if (!n->base_addr)
2352	    n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2353
2354	  return verify_symbolic_number_p (n, stmt) ? stmt : NULL;
2355	}
2356
2357      return NULL;
2358    }
2359
2360  if (TREE_CODE (rhs1) != SSA_NAME)
2361    return NULL;
2362
2363  code = gimple_assign_rhs_code (stmt);
2364  rhs_class = gimple_assign_rhs_class (stmt);
2365  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2366
2367  if (rhs_class == GIMPLE_BINARY_RHS)
2368    rhs2 = gimple_assign_rhs2 (stmt);
2369
2370  /* Handle unary rhs and binary rhs with integer constants as second
2371     operand.  */
2372
2373  if (rhs_class == GIMPLE_UNARY_RHS
2374      || (rhs_class == GIMPLE_BINARY_RHS
2375	  && TREE_CODE (rhs2) == INTEGER_CST))
2376    {
2377      if (code != BIT_AND_EXPR
2378	  && code != LSHIFT_EXPR
2379	  && code != RSHIFT_EXPR
2380	  && code != LROTATE_EXPR
2381	  && code != RROTATE_EXPR
2382	  && !CONVERT_EXPR_CODE_P (code))
2383	return NULL;
2384
2385      source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2386
2387      /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2388	 we have to initialize the symbolic number.  */
2389      if (!source_stmt1)
2390	{
2391	  if (gimple_assign_load_p (stmt)
2392	      || !init_symbolic_number (n, rhs1))
2393	    return NULL;
2394	  source_stmt1 = stmt;
2395	}
2396
2397      switch (code)
2398	{
2399	case BIT_AND_EXPR:
2400	  {
2401	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2402	    uint64_t val = int_cst_value (rhs2), mask = 0;
2403	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2404
2405	    /* Only constants masking full bytes are allowed.  */
2406	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2407	      if ((val & tmp) != 0 && (val & tmp) != tmp)
2408		return NULL;
2409	      else if (val & tmp)
2410		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2411
2412	    n->n &= mask;
2413	  }
2414	  break;
2415	case LSHIFT_EXPR:
2416	case RSHIFT_EXPR:
2417	case LROTATE_EXPR:
2418	case RROTATE_EXPR:
2419	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2420	    return NULL;
2421	  break;
2422	CASE_CONVERT:
2423	  {
2424	    int i, type_size, old_type_size;
2425	    tree type;
2426
2427	    type = gimple_expr_type (stmt);
2428	    type_size = TYPE_PRECISION (type);
2429	    if (type_size % BITS_PER_UNIT != 0)
2430	      return NULL;
2431	    type_size /= BITS_PER_UNIT;
2432	    if (type_size > 64 / BITS_PER_MARKER)
2433	      return NULL;
2434
2435	    /* Sign extension: result is dependent on the value.  */
2436	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2437	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2438		&& HEAD_MARKER (n->n, old_type_size))
2439	      for (i = 0; i < type_size - old_type_size; i++)
2440		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2441			<< ((type_size - 1 - i) * BITS_PER_MARKER);
2442
2443	    if (type_size < 64 / BITS_PER_MARKER)
2444	      {
2445		/* If STMT casts to a smaller type mask out the bits not
2446		   belonging to the target type.  */
2447		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2448	      }
2449	    n->type = type;
2450	    if (!n->base_addr)
2451	      n->range = type_size;
2452	  }
2453	  break;
2454	default:
2455	  return NULL;
2456	};
2457      return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2458    }
2459
2460  /* Handle binary rhs.  */
2461
2462  if (rhs_class == GIMPLE_BINARY_RHS)
2463    {
2464      struct symbolic_number n1, n2;
2465      gimple *source_stmt, *source_stmt2;
2466
2467      if (code != BIT_IOR_EXPR)
2468	return NULL;
2469
2470      if (TREE_CODE (rhs2) != SSA_NAME)
2471	return NULL;
2472
2473      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2474
2475      switch (code)
2476	{
2477	case BIT_IOR_EXPR:
2478	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2479
2480	  if (!source_stmt1)
2481	    return NULL;
2482
2483	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2484
2485	  if (!source_stmt2)
2486	    return NULL;
2487
2488	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2489	    return NULL;
2490
2491	  if (!n1.vuse != !n2.vuse
2492	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2493	    return NULL;
2494
2495	  source_stmt
2496	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2497
2498	  if (!source_stmt)
2499	    return NULL;
2500
2501	  if (!verify_symbolic_number_p (n, stmt))
2502	    return NULL;
2503
2504	  break;
2505	default:
2506	  return NULL;
2507	}
2508      return source_stmt;
2509    }
2510  return NULL;
2511}
2512
2513/* Check whether STMT completes a bswap implementation, or a read in a given
2514   endianness, consisting of ORs, SHIFTs and ANDs, and set *BSWAP
2515   accordingly.  It also sets N to describe the kind of operation
2516   performed: the size of the resulting expression and whether it works on
2517   a memory source, and if so its alias set and vuse.  Finally, the
2518   function returns a stmt whose rhs's first tree is the source
2519   expression.  */
2520
2521static gimple *
2522find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
2523{
2524  unsigned rsize;
2525  uint64_t tmpn, mask;
2526/* The number which the find_bswap_or_nop_1 result should match in order
2527   to have a full byte swap.  The number is shifted to the right
2528   according to the size of the symbolic number before using it.  */
2529  uint64_t cmpxchg = CMPXCHG;
2530  uint64_t cmpnop = CMPNOP;
2531
2532  gimple *ins_stmt;
2533  int limit;
2534
2535  /* The last parameter determines the search depth limit.  It usually
2536     correlates directly to the number n of bytes to be touched.  We
2537     increase that number by log2(n) + 1 here in order to also
2538     cover signed -> unsigned conversions of the src operand as can be seen
2539     in libgcc, and for the initial shift/and operation of the src operand.  */
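  /* For instance, for a 4-byte result the limit starts at 4 and ends up
     as 4 + 1 + 2 = 7.  */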
2540  limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2541  limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2542  ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2543
2544  if (!ins_stmt)
2545    return NULL;
2546
2547  /* Find real size of result (highest non-zero byte).  */
2548  if (n->base_addr)
2549    for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2550  else
2551    rsize = n->range;
2552
2553  /* Zero out the bits corresponding to untouched bytes in the original gimple
2554     expression.  */
2555  if (n->range < (int) sizeof (int64_t))
2556    {
2557      mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2558      cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2559      cmpnop &= mask;
2560    }
2561
2562  /* Zero out the bits corresponding to unused bytes in the result of the
2563     gimple expression.  */
2564  if (rsize < n->range)
2565    {
2566      if (BYTES_BIG_ENDIAN)
2567	{
2568	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2569	  cmpxchg &= mask;
2570	  cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2571	}
2572      else
2573	{
2574	  mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2575	  cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2576	  cmpnop &= mask;
2577	}
2578      n->range = rsize;
2579    }
2580
2581  /* A complete byte swap should make the symbolic number start with
2582     the largest digit in the highest order byte.  An unchanged symbolic
2583     number indicates a read with the same endianness as the target.  */
2584  if (n->n == cmpnop)
2585    *bswap = false;
2586  else if (n->n == cmpxchg)
2587    *bswap = true;
2588  else
2589    return NULL;
2590
2591  /* Useless bit manipulation performed by code.  */
2592  if (!n->base_addr && n->n == cmpnop)
2593    return NULL;
2594
2595  n->range *= BITS_PER_UNIT;
2596  return ins_stmt;
2597}
2598
2599namespace {
2600
2601const pass_data pass_data_optimize_bswap =
2602{
2603  GIMPLE_PASS, /* type */
2604  "bswap", /* name */
2605  OPTGROUP_NONE, /* optinfo_flags */
2606  TV_NONE, /* tv_id */
2607  PROP_ssa, /* properties_required */
2608  0, /* properties_provided */
2609  0, /* properties_destroyed */
2610  0, /* todo_flags_start */
2611  0, /* todo_flags_finish */
2612};
2613
2614class pass_optimize_bswap : public gimple_opt_pass
2615{
2616public:
2617  pass_optimize_bswap (gcc::context *ctxt)
2618    : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2619  {}
2620
2621  /* opt_pass methods: */
2622  virtual bool gate (function *)
2623    {
2624      return flag_expensive_optimizations && optimize;
2625    }
2626
2627  virtual unsigned int execute (function *);
2628
2629}; // class pass_optimize_bswap
2630
2631/* Perform the bswap optimization: replace the expression computed in the rhs
2632   of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2633   Which of these alternatives replaces the rhs is given by N->base_addr (non
2634   null if a load is needed) and BSWAP.  The type, VUSE and alias set of the
2635   load to perform are also given in N, while the built-in bswap call is given
2636   by FNDECL.  Finally, if a load is involved, INS_STMT refers to one of the
2637   load statements used to construct the rhs in CUR_STMT, and N->range gives
2638   the size of the rhs expression for maintaining some statistics.
2639
2640   Note that if the replacement involves a load, CUR_STMT is moved just before
2641   INS_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2642   changing basic block.  */
2643
2644static bool
2645bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2646	       tree bswap_type, tree load_type, struct symbolic_number *n,
2647	       bool bswap)
2648{
2649  gimple_stmt_iterator gsi;
2650  tree src, tmp, tgt;
2651  gimple *bswap_stmt;
2652
2653  gsi = gsi_for_stmt (cur_stmt);
2654  src = n->src;
2655  tgt = gimple_assign_lhs (cur_stmt);
2656
2657  /* Need to load the value from memory first.  */
2658  if (n->base_addr)
2659    {
2660      gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2661      tree addr_expr, addr_tmp, val_expr, val_tmp;
2662      tree load_offset_ptr, aligned_load_type;
2663      gimple *addr_stmt, *load_stmt;
2664      unsigned align;
2665      HOST_WIDE_INT load_offset = 0;
2666      basic_block ins_bb, cur_bb;
2667
2668      ins_bb = gimple_bb (ins_stmt);
2669      cur_bb = gimple_bb (cur_stmt);
2670      if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2671	return false;
2672
2673      align = get_object_alignment (src);
2674
2675      /* Move cur_stmt just before one of the original load statements
2676	 to ensure it has the same VUSE.  See PR61517 for what could
2677	 go wrong.  */
2678      if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2679	reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2680      gsi_move_before (&gsi, &gsi_ins);
2681      gsi = gsi_for_stmt (cur_stmt);
2682
2683      /* Compute address to load from and cast according to the size
2684	 of the load.  */
2685      addr_expr = build_fold_addr_expr (unshare_expr (src));
2686      if (is_gimple_mem_ref_addr (addr_expr))
2687	addr_tmp = addr_expr;
2688      else
2689	{
2690	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2691					 "load_src");
2692	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2693	  gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2694	}
2695
2696      /* Perform the load.  */
2697      aligned_load_type = load_type;
2698      if (align < TYPE_ALIGN (load_type))
2699	aligned_load_type = build_aligned_type (load_type, align);
2700      load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2701      val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2702			      load_offset_ptr);
2703
2704      if (!bswap)
2705	{
2706	  if (n->range == 16)
2707	    nop_stats.found_16bit++;
2708	  else if (n->range == 32)
2709	    nop_stats.found_32bit++;
2710	  else
2711	    {
2712	      gcc_assert (n->range == 64);
2713	      nop_stats.found_64bit++;
2714	    }
2715
2716	  /* Convert the result of load if necessary.  */
2717	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2718	    {
2719	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2720					    "load_dst");
2721	      load_stmt = gimple_build_assign (val_tmp, val_expr);
2722	      gimple_set_vuse (load_stmt, n->vuse);
2723	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2724	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2725	    }
2726	  else
2727	    {
2728	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2729	      gimple_set_vuse (cur_stmt, n->vuse);
2730	    }
2731	  update_stmt (cur_stmt);
2732
2733	  if (dump_file)
2734	    {
2735	      fprintf (dump_file,
2736		       "%d bit load in target endianness found at: ",
2737		       (int) n->range);
2738	      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2739	    }
2740	  return true;
2741	}
2742      else
2743	{
2744	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2745	  load_stmt = gimple_build_assign (val_tmp, val_expr);
2746	  gimple_set_vuse (load_stmt, n->vuse);
2747	  gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2748	}
2749      src = val_tmp;
2750    }
2751  else if (TREE_CODE (src) == BIT_FIELD_REF)
2752    src = TREE_OPERAND (src, 0);
2753
2754  if (n->range == 16)
2755    bswap_stats.found_16bit++;
2756  else if (n->range == 32)
2757    bswap_stats.found_32bit++;
2758  else
2759    {
2760      gcc_assert (n->range == 64);
2761      bswap_stats.found_64bit++;
2762    }
2763
2764  tmp = src;
2765
2766  /* Convert the src expression if necessary.  */
2767  if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2768    {
2769      gimple *convert_stmt;
2770
2771      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2772      convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2773      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2774    }
2775
2776  /* The canonical form for a 16 bit bswap is a rotate expression.  Only 16 bit
2777     values are handled this way because rotating a 2N bit value by N bits is
2778     generally not equivalent to a bswap.  For instance 0x01020304 r>> 16
2779     gives 0x03040102, while a bswap of that value is 0x04030201.  */
2780  if (bswap && n->range == 16)
2781    {
2782      tree count = build_int_cst (NULL, BITS_PER_UNIT);
2783      src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2784      bswap_stmt = gimple_build_assign (NULL, src);
2785    }
2786  else
2787    bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2788
2789  tmp = tgt;
2790
2791  /* Convert the result if necessary.  */
2792  if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2793    {
2794      gimple *convert_stmt;
2795
2796      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2797      convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2798      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2799    }
2800
2801  gimple_set_lhs (bswap_stmt, tmp);
2802
2803  if (dump_file)
2804    {
2805      fprintf (dump_file, "%d bit bswap implementation found at: ",
2806	       (int) n->range);
2807      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2808    }
2809
2810  gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2811  gsi_remove (&gsi, true);
2812  return true;
2813}
2814
2815/* Find manual byte swap implementations as well as loads in a given
2816   endianness.  Byte swaps are turned into a bswap builtin invocation,
2817   while endian loads are converted to a bswap builtin invocation or a
2818   simple load according to the target endianness.  */
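
/* As a sketch of the kind of user code this pass recognizes (an
   illustration, not part of this file), a manual byte swap of a 32-bit
   unsigned value x such as

     ((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8)
     | ((x & 0x00ff0000) >> 8) | ((x & 0xff000000) >> 24)

   is rewritten into a single __builtin_bswap32 (x) call, provided the
   target has a bswap pattern for SImode.  */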
2819
2820unsigned int
2821pass_optimize_bswap::execute (function *fun)
2822{
2823  basic_block bb;
2824  bool bswap32_p, bswap64_p;
2825  bool changed = false;
2826  tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2827
2828  if (BITS_PER_UNIT != 8)
2829    return 0;
2830
2831  bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2832	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2833  bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2834	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2835		   || (bswap32_p && word_mode == SImode)));
2836
2837  /* Determine the argument type of the builtins.  The code later on
2838     assumes that the return and argument type are the same.  */
2839  if (bswap32_p)
2840    {
2841      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2842      bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2843    }
2844
2845  if (bswap64_p)
2846    {
2847      tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2848      bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2849    }
2850
2851  memset (&nop_stats, 0, sizeof (nop_stats));
2852  memset (&bswap_stats, 0, sizeof (bswap_stats));
2853  calculate_dominance_info (CDI_DOMINATORS);
2854
2855  FOR_EACH_BB_FN (bb, fun)
2856    {
2857      gimple_stmt_iterator gsi;
2858
2859      /* We do a reverse scan for bswap patterns to make sure we get the
2860	 widest match. As bswap pattern matching doesn't handle previously
2861	 inserted smaller bswap replacements as sub-patterns, the wider
2862	 variant wouldn't be detected.  */
2863      for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2864        {
2865	  gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2866	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2867	  enum tree_code code;
2868	  struct symbolic_number n;
2869	  bool bswap;
2870
2871	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2872	     might be moved to a different basic block by bswap_replace and gsi
2873	     must not point to it in that case.  Doing the gsi_prev here makes
2874	     sure that gsi points to the statement previous to cur_stmt while
2875	     still guaranteeing that all statements in this basic block are
2876	     considered.  */
2877	  gsi_prev (&gsi);
2878
2879	  if (!is_gimple_assign (cur_stmt))
2880	    continue;
2881
2882	  code = gimple_assign_rhs_code (cur_stmt);
2883	  switch (code)
2884	    {
2885	    case LROTATE_EXPR:
2886	    case RROTATE_EXPR:
2887	      if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2888		  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2889		     % BITS_PER_UNIT)
2890		continue;
2891	      /* Fall through.  */
2892	    case BIT_IOR_EXPR:
2893	      break;
2894	    default:
2895	      continue;
2896	    }
2897
2898	  ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2899
2900	  if (!ins_stmt)
2901	    continue;
2902
2903	  switch (n.range)
2904	    {
2905	    case 16:
2906	      /* Already in canonical form, nothing to do.  */
2907	      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2908		continue;
2909	      load_type = bswap_type = uint16_type_node;
2910	      break;
2911	    case 32:
2912	      load_type = uint32_type_node;
2913	      if (bswap32_p)
2914		{
2915		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2916		  bswap_type = bswap32_type;
2917		}
2918	      break;
2919	    case 64:
2920	      load_type = uint64_type_node;
2921	      if (bswap64_p)
2922		{
2923		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2924		  bswap_type = bswap64_type;
2925		}
2926	      break;
2927	    default:
2928	      continue;
2929	    }
2930
2931	  if (bswap && !fndecl && n.range != 16)
2932	    continue;
2933
2934	  if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2935			     &n, bswap))
2936	    changed = true;
2937	}
2938    }
2939
2940  statistics_counter_event (fun, "16-bit nop implementations found",
2941			    nop_stats.found_16bit);
2942  statistics_counter_event (fun, "32-bit nop implementations found",
2943			    nop_stats.found_32bit);
2944  statistics_counter_event (fun, "64-bit nop implementations found",
2945			    nop_stats.found_64bit);
2946  statistics_counter_event (fun, "16-bit bswap implementations found",
2947			    bswap_stats.found_16bit);
2948  statistics_counter_event (fun, "32-bit bswap implementations found",
2949			    bswap_stats.found_32bit);
2950  statistics_counter_event (fun, "64-bit bswap implementations found",
2951			    bswap_stats.found_64bit);
2952
2953  return (changed ? TODO_update_ssa : 0);
2954}
2955
2956} // anon namespace
2957
2958gimple_opt_pass *
2959make_pass_optimize_bswap (gcc::context *ctxt)
2960{
2961  return new pass_optimize_bswap (ctxt);
2962}
2963
2964/* Return true if STMT is a type conversion operation that can be stripped
2965   when used in a widening multiply operation.  */
2966static bool
2967widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2968{
2969  enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2970
2971  if (TREE_CODE (result_type) == INTEGER_TYPE)
2972    {
2973      tree op_type;
2974      tree inner_op_type;
2975
2976      if (!CONVERT_EXPR_CODE_P (rhs_code))
2977	return false;
2978
2979      op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2980
2981      /* If the type of OP has the same precision as the result, then
2982	 we can strip this conversion.  The multiply operation will be
2983	 selected to create the correct extension as a by-product.  */
2984      if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2985	return true;
2986
2987      /* We can also strip a conversion if it preserves the signed-ness of
2988	 the operation and doesn't narrow the range.  */
2989      inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2990
2991      /* If the inner-most type is unsigned, then we can strip any
2992	 intermediate widening operation.  If it's signed, then the
2993	 intermediate widening operation must also be signed.  */
2994      if ((TYPE_UNSIGNED (inner_op_type)
2995	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2996	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2997	return true;
2998
2999      return false;
3000    }
3001
3002  return rhs_code == FIXED_CONVERT_EXPR;
3003}
3004
3005/* Return true if RHS is a suitable operand for a widening multiplication,
3006   assuming a target type of TYPE.
3007   There are two cases:
3008
3009     - RHS makes some value at least twice as wide.  Store that value
3010       in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
3011
3012     - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
3013       but leave *TYPE_OUT untouched.  */
3014
3015static bool
3016is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
3017			tree *new_rhs_out)
3018{
3019  gimple *stmt;
3020  tree type1, rhs1;
3021
3022  if (TREE_CODE (rhs) == SSA_NAME)
3023    {
3024      stmt = SSA_NAME_DEF_STMT (rhs);
3025      if (is_gimple_assign (stmt))
3026	{
3027	  if (! widening_mult_conversion_strippable_p (type, stmt))
3028	    rhs1 = rhs;
3029	  else
3030	    {
3031	      rhs1 = gimple_assign_rhs1 (stmt);
3032
3033	      if (TREE_CODE (rhs1) == INTEGER_CST)
3034		{
3035		  *new_rhs_out = rhs1;
3036		  *type_out = NULL;
3037		  return true;
3038		}
3039	    }
3040	}
3041      else
3042	rhs1 = rhs;
3043
3044      type1 = TREE_TYPE (rhs1);
3045
3046      if (TREE_CODE (type1) != TREE_CODE (type)
3047	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
3048	return false;
3049
3050      *new_rhs_out = rhs1;
3051      *type_out = type1;
3052      return true;
3053    }
3054
3055  if (TREE_CODE (rhs) == INTEGER_CST)
3056    {
3057      *new_rhs_out = rhs;
3058      *type_out = NULL;
3059      return true;
3060    }
3061
3062  return false;
3063}
3064
3065/* Return true if STMT performs a widening multiplication, assuming the
3066   output type is TYPE.  If so, store the unwidened types of the operands
3067   in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
3068   *RHS2_OUT such that converting those operands to types *TYPE1_OUT
3069   and *TYPE2_OUT would give the operands of the multiplication.  */
3070
3071static bool
3072is_widening_mult_p (gimple *stmt,
3073		    tree *type1_out, tree *rhs1_out,
3074		    tree *type2_out, tree *rhs2_out)
3075{
3076  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3077
3078  if (TREE_CODE (type) == INTEGER_TYPE)
3079    {
3080      if (TYPE_OVERFLOW_TRAPS (type))
3081	return false;
3082    }
3083  else if (TREE_CODE (type) != FIXED_POINT_TYPE)
3084    return false;
3085
3086  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
3087			       rhs1_out))
3088    return false;
3089
3090  if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
3091			       rhs2_out))
3092    return false;
3093
3094  if (*type1_out == NULL)
3095    {
3096      if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
3097	return false;
3098      *type1_out = *type2_out;
3099    }
3100
3101  if (*type2_out == NULL)
3102    {
3103      if (!int_fits_type_p (*rhs2_out, *type1_out))
3104	return false;
3105      *type2_out = *type1_out;
3106    }
3107
3108  /* Ensure that the larger of the two operands comes first. */
3109  if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
3110    {
3111      std::swap (*type1_out, *type2_out);
3112      std::swap (*rhs1_out, *rhs2_out);
3113    }
3114
3115  return true;
3116}
3117
3118/* Process a single gimple statement STMT, which has a MULT_EXPR as
3119   its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
3120   value is true iff we converted the statement.  */
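
/* For example, with 32-bit int operands and a 64-bit long long result,

     long long z = (long long) x * (long long) y;

   can be rewritten to use WIDEN_MULT_EXPR on the narrow operands instead
   of widening both to 64 bits, provided the target has a suitable
   widening multiply instruction.  */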
3121
3122static bool
3123convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3124{
3125  tree lhs, rhs1, rhs2, type, type1, type2;
3126  enum insn_code handler;
3127  machine_mode to_mode, from_mode, actual_mode;
3128  optab op;
3129  int actual_precision;
3130  location_t loc = gimple_location (stmt);
3131  bool from_unsigned1, from_unsigned2;
3132
3133  lhs = gimple_assign_lhs (stmt);
3134  type = TREE_TYPE (lhs);
3135  if (TREE_CODE (type) != INTEGER_TYPE)
3136    return false;
3137
3138  if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3139    return false;
3140
3141  to_mode = TYPE_MODE (type);
3142  from_mode = TYPE_MODE (type1);
3143  from_unsigned1 = TYPE_UNSIGNED (type1);
3144  from_unsigned2 = TYPE_UNSIGNED (type2);
3145
3146  if (from_unsigned1 && from_unsigned2)
3147    op = umul_widen_optab;
3148  else if (!from_unsigned1 && !from_unsigned2)
3149    op = smul_widen_optab;
3150  else
3151    op = usmul_widen_optab;
3152
3153  handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3154						  0, &actual_mode);
3155
3156  if (handler == CODE_FOR_nothing)
3157    {
3158      if (op != smul_widen_optab)
3159	{
3160	  /* We can use a signed multiply with unsigned types as long as
3161	     there is a wider mode to use, or it is the smaller of the two
3162	     types that is unsigned.  Note that type1 >= type2, always.  */
3163	  if ((TYPE_UNSIGNED (type1)
3164	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3165	      || (TYPE_UNSIGNED (type2)
3166		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3167	    {
3168	      from_mode = GET_MODE_WIDER_MODE (from_mode);
3169	      if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3170		return false;
3171	    }
3172
3173	  op = smul_widen_optab;
3174	  handler = find_widening_optab_handler_and_mode (op, to_mode,
3175							  from_mode, 0,
3176							  &actual_mode);
3177
3178	  if (handler == CODE_FOR_nothing)
3179	    return false;
3180
3181	  from_unsigned1 = from_unsigned2 = false;
3182	}
3183      else
3184	return false;
3185    }
3186
3187  /* Ensure that the inputs to the handler are in the correct precision
3188     for the opcode.  This will be the full mode size.  */
3189  actual_precision = GET_MODE_PRECISION (actual_mode);
3190  if (2 * actual_precision > TYPE_PRECISION (type))
3191    return false;
3192  if (actual_precision != TYPE_PRECISION (type1)
3193      || from_unsigned1 != TYPE_UNSIGNED (type1))
3194    rhs1 = build_and_insert_cast (gsi, loc,
3195				  build_nonstandard_integer_type
3196				    (actual_precision, from_unsigned1), rhs1);
3197  if (actual_precision != TYPE_PRECISION (type2)
3198      || from_unsigned2 != TYPE_UNSIGNED (type2))
3199    rhs2 = build_and_insert_cast (gsi, loc,
3200				  build_nonstandard_integer_type
3201				    (actual_precision, from_unsigned2), rhs2);
3202
3203  /* Handle constants.  */
3204  if (TREE_CODE (rhs1) == INTEGER_CST)
3205    rhs1 = fold_convert (type1, rhs1);
3206  if (TREE_CODE (rhs2) == INTEGER_CST)
3207    rhs2 = fold_convert (type2, rhs2);
3208
3209  gimple_assign_set_rhs1 (stmt, rhs1);
3210  gimple_assign_set_rhs2 (stmt, rhs2);
3211  gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3212  update_stmt (stmt);
3213  widen_mul_stats.widen_mults_inserted++;
3214  return true;
3215}
3216
3217/* Process a single gimple statement STMT, which is found at the
3218   iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3219   rhs (given by CODE), and try to convert it into a
3220   WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
3221   is true iff we converted the statement.  */
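
/* For example, with 32-bit int operands and a 64-bit long long
   accumulator,

     acc = acc + (long long) x * (long long) y;

   can become a WIDEN_MULT_PLUS_EXPR when the target provides a widening
   multiply-and-accumulate instruction.  */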
3222
3223static bool
3224convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3225			    enum tree_code code)
3226{
3227  gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3228  gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3229  tree type, type1, type2, optype;
3230  tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3231  enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3232  optab this_optab;
3233  enum tree_code wmult_code;
3234  enum insn_code handler;
3235  machine_mode to_mode, from_mode, actual_mode;
3236  location_t loc = gimple_location (stmt);
3237  int actual_precision;
3238  bool from_unsigned1, from_unsigned2;
3239
3240  lhs = gimple_assign_lhs (stmt);
3241  type = TREE_TYPE (lhs);
3242  if (TREE_CODE (type) != INTEGER_TYPE
3243      && TREE_CODE (type) != FIXED_POINT_TYPE)
3244    return false;
3245
3246  if (code == MINUS_EXPR)
3247    wmult_code = WIDEN_MULT_MINUS_EXPR;
3248  else
3249    wmult_code = WIDEN_MULT_PLUS_EXPR;
3250
3251  rhs1 = gimple_assign_rhs1 (stmt);
3252  rhs2 = gimple_assign_rhs2 (stmt);
3253
3254  if (TREE_CODE (rhs1) == SSA_NAME)
3255    {
3256      rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3257      if (is_gimple_assign (rhs1_stmt))
3258	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3259    }
3260
3261  if (TREE_CODE (rhs2) == SSA_NAME)
3262    {
3263      rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3264      if (is_gimple_assign (rhs2_stmt))
3265	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3266    }
3267
3268  /* Allow for one conversion statement between the multiply
3269     and addition/subtraction statement.  If there is more than
3270     one conversion then we assume they would invalidate this
3271     transformation.  If that's not the case then they should have
3272     been folded before now.  */
3273  if (CONVERT_EXPR_CODE_P (rhs1_code))
3274    {
3275      conv1_stmt = rhs1_stmt;
3276      rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3277      if (TREE_CODE (rhs1) == SSA_NAME)
3278	{
3279	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3280	  if (is_gimple_assign (rhs1_stmt))
3281	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3282	}
3283      else
3284	return false;
3285    }
3286  if (CONVERT_EXPR_CODE_P (rhs2_code))
3287    {
3288      conv2_stmt = rhs2_stmt;
3289      rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3290      if (TREE_CODE (rhs2) == SSA_NAME)
3291	{
3292	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3293	  if (is_gimple_assign (rhs2_stmt))
3294	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3295	}
3296      else
3297	return false;
3298    }
3299
3300  /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3301     is_widening_mult_p, but we still need the operands it returns.
3302
3303     It might also appear that it would be sufficient to use the existing
3304     operands of the widening multiply, but that would limit the choice of
3305     multiply-and-accumulate instructions.
3306
3307     If the widened-multiplication result has more than one use, it is
3308     probably wiser not to do the conversion.  */
3309  if (code == PLUS_EXPR
3310      && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3311    {
3312      if (!has_single_use (rhs1)
3313	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3314				  &type2, &mult_rhs2))
3315	return false;
3316      add_rhs = rhs2;
3317      conv_stmt = conv1_stmt;
3318    }
3319  else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3320    {
3321      if (!has_single_use (rhs2)
3322	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3323				  &type2, &mult_rhs2))
3324	return false;
3325      add_rhs = rhs1;
3326      conv_stmt = conv2_stmt;
3327    }
3328  else
3329    return false;
3330
3331  to_mode = TYPE_MODE (type);
3332  from_mode = TYPE_MODE (type1);
3333  from_unsigned1 = TYPE_UNSIGNED (type1);
3334  from_unsigned2 = TYPE_UNSIGNED (type2);
3335  optype = type1;
3336
3337  /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
3338  if (from_unsigned1 != from_unsigned2)
3339    {
3340      if (!INTEGRAL_TYPE_P (type))
3341	return false;
3342      /* We can use a signed multiply with unsigned types as long as
3343	 there is a wider mode to use, or it is the smaller of the two
3344	 types that is unsigned.  Note that type1 >= type2, always.  */
3345      if ((from_unsigned1
3346	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3347	  || (from_unsigned2
3348	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3349	{
3350	  from_mode = GET_MODE_WIDER_MODE (from_mode);
3351	  if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3352	    return false;
3353	}
3354
3355      from_unsigned1 = from_unsigned2 = false;
3356      optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3357					       false);
3358    }
3359
3360  /* If there was a conversion between the multiply and addition
3361     then we need to make sure it fits a multiply-and-accumulate.
3362     There should be a single mode change which does not change the
3363     value.  */
3364  if (conv_stmt)
3365    {
3366      /* We use the original, unmodified data types for this.  */
3367      tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3368      tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3369      int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3370      bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3371
3372      if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3373	{
3374	  /* Conversion is a truncate.  */
3375	  if (TYPE_PRECISION (to_type) < data_size)
3376	    return false;
3377	}
3378      else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3379	{
3380	  /* Conversion is an extend.  Check it's the right sort.  */
3381	  if (TYPE_UNSIGNED (from_type) != is_unsigned
3382	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3383	    return false;
3384	}
3385      /* else convert is a no-op for our purposes.  */
3386    }
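
  /* Illustrative sketch of the check above: if the multiply result was
     truncated before the addition, as in
	_1 = a_s16 w* b_s16;
	_2 = (short) _1;
	lhs = _2 + c;
     the truncation to 16 bits may change the 32-bit product (16 < 16 + 16),
     so it is rejected.  An extension of the product to a wider type with
     matching signedness is accepted as value-preserving.  */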
3387
3388  /* Verify that the machine can perform a widening multiply
3389     accumulate in this mode/signedness combination, otherwise
3390     this transformation is likely to pessimize code.  */
3391  this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3392  handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3393						  from_mode, 0, &actual_mode);
3394
3395  if (handler == CODE_FOR_nothing)
3396    return false;
3397
3398  /* Ensure that the inputs to the handler are in the correct precision
3399     for the opcode.  This will be the full mode size.  */
3400  actual_precision = GET_MODE_PRECISION (actual_mode);
3401  if (actual_precision != TYPE_PRECISION (type1)
3402      || from_unsigned1 != TYPE_UNSIGNED (type1))
3403    mult_rhs1 = build_and_insert_cast (gsi, loc,
3404				       build_nonstandard_integer_type
3405				         (actual_precision, from_unsigned1),
3406				       mult_rhs1);
3407  if (actual_precision != TYPE_PRECISION (type2)
3408      || from_unsigned2 != TYPE_UNSIGNED (type2))
3409    mult_rhs2 = build_and_insert_cast (gsi, loc,
3410				       build_nonstandard_integer_type
3411					 (actual_precision, from_unsigned2),
3412				       mult_rhs2);
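
  /* For instance (a sketch): if the widening optab found above takes
     32-bit inputs while type1/type2 are 16-bit types, the casts just
     inserted widen mult_rhs1 and mult_rhs2 to 32-bit types of the
     required signedness first.  */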
3413
3414  if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3415    add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3416
3417  /* Handle constants.  */
3418  if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3419    mult_rhs1 = fold_convert (type1, mult_rhs1);
3420  if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3421    mult_rhs2 = fold_convert (type2, mult_rhs2);
3422
3423  gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3424				  add_rhs);
3425  update_stmt (gsi_stmt (*gsi));
3426  widen_mul_stats.maccs_inserted++;
3427  return true;
3428}
3429
3430/* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3431   with uses in additions and subtractions to form fused multiply-add
3432   operations.  Returns true if successful and MUL_STMT should be removed.  */
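
/* For example (a sketch, operand names invented):
     tmp = a * b;
     x = tmp + c;
     y = d - tmp;
   becomes
     x = FMA <a, b, c>;
     y = FMA <a', b, d>;   where a' = -a is a separately inserted negate
   provided every use of tmp can be converted, so that the multiplication
   itself becomes dead.  */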
3433
3434static bool
3435convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3436{
3437  tree mul_result = gimple_get_lhs (mul_stmt);
3438  tree type = TREE_TYPE (mul_result);
3439  gimple *use_stmt, *neguse_stmt;
3440  gassign *fma_stmt;
3441  use_operand_p use_p;
3442  imm_use_iterator imm_iter;
3443
3444  if (FLOAT_TYPE_P (type)
3445      && flag_fp_contract_mode == FP_CONTRACT_OFF)
3446    return false;
3447
3448  /* We don't want to do bitfield reduction ops.  */
3449  if (INTEGRAL_TYPE_P (type)
3450      && (TYPE_PRECISION (type)
3451	  != GET_MODE_PRECISION (TYPE_MODE (type))
3452	  || TYPE_OVERFLOW_TRAPS (type)))
3453    return false;
3454
3455  /* If the target doesn't support it, don't generate it.  We assume that
3456     if fma isn't available then fms, fnma or fnms are not either.  */
3457  if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3458    return false;
3459
3460  /* If the multiplication has zero uses, it is kept around probably because
3461     of -fnon-call-exceptions.  Don't optimize it away in that case;
3462     that is DCE's job.  */
3463  if (has_zero_uses (mul_result))
3464    return false;
3465
3466  /* Make sure that the multiplication statement becomes dead after
3467     the transformation, i.e. that all uses are transformed to FMAs.
3468     This means we assume that an FMA operation has the same cost
3469     as an addition.  */
3470  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3471    {
3472      enum tree_code use_code;
3473      tree result = mul_result;
3474      bool negate_p = false;
3475
3476      use_stmt = USE_STMT (use_p);
3477
3478      if (is_gimple_debug (use_stmt))
3479	continue;
3480
3481      /* For now restrict these operations to single basic blocks.  In theory
3482	 we would want to support sinking the multiplication in
3483	 m = a*b;
3484	 if ()
3485	   ma = m + c;
3486	 else
3487	   d = m;
3488	 to form an fma in the then block and sink the multiplication to the
3489	 else block.  */
3490      if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3491	return false;
3492
3493      if (!is_gimple_assign (use_stmt))
3494	return false;
3495
3496      use_code = gimple_assign_rhs_code (use_stmt);
3497
3498      /* A negate on the multiplication leads to FNMA.  */
3499      if (use_code == NEGATE_EXPR)
3500	{
3501	  ssa_op_iter iter;
3502	  use_operand_p usep;
3503
3504	  result = gimple_assign_lhs (use_stmt);
3505
3506	  /* Make sure the negate statement becomes dead with this
3507	     single transformation.  */
3508	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
3509			       &use_p, &neguse_stmt))
3510	    return false;
3511
3512	  /* Make sure the multiplication isn't also used on that stmt.  */
3513	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3514	    if (USE_FROM_PTR (usep) == mul_result)
3515	      return false;
3516
3517	  /* Re-validate.  */
3518	  use_stmt = neguse_stmt;
3519	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3520	    return false;
3521	  if (!is_gimple_assign (use_stmt))
3522	    return false;
3523
3524	  use_code = gimple_assign_rhs_code (use_stmt);
3525	  negate_p = true;
3526	}
3527
3528      switch (use_code)
3529	{
3530	case MINUS_EXPR:
3531	  if (gimple_assign_rhs2 (use_stmt) == result)
3532	    negate_p = !negate_p;
3533	  break;
3534	case PLUS_EXPR:
3535	  break;
3536	default:
3537	  /* FMA can only be formed from PLUS and MINUS.  */
3538	  return false;
3539	}
3540
3541      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3542	 by a MULT_EXPR that we'll visit later, we might be able to
3543	 get a more profitable match with fnma.
3544	 OTOH, if we don't, a negate / fma pair likely has lower latency
3545	 than a mult / subtract pair.  */
3546      if (use_code == MINUS_EXPR && !negate_p
3547	  && gimple_assign_rhs1 (use_stmt) == result
3548	  && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3549	  && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3550	{
3551	  tree rhs2 = gimple_assign_rhs2 (use_stmt);
3552
3553	  if (TREE_CODE (rhs2) == SSA_NAME)
3554	    {
3555	      gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3556	      if (has_single_use (rhs2)
3557		  && is_gimple_assign (stmt2)
3558		  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3559	      return false;
3560		return false;
3561	}
3562
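
      /* Illustrative sketch of the deferral above: for x = (a*b) - (c*d)
	 on a target with fnma but not fms, returning false here lets the
	 later visit of the c*d multiply form
	   tmp = a * b;  x = FMA <c', d, tmp>;   with c' = -c
	 (an fnma), instead of negating the c*d product and feeding a
	 plain fma.  */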
3563      /* We can't handle a * b + a * b.  */
3564      if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3565	return false;
3566
3567      /* While it is possible to validate whether or not the exact form
3568	 that we've recognized is available in the backend, the assumption
3569	 is that the transformation is never a loss.  For instance, suppose
3570	 the target only has the plain FMA pattern available.  Consider
3571	 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3572	 is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3573	 still have 3 operations, but in the FMA form the two NEGs are
3574	 independent and could be run in parallel.  */
3575    }
3576
3577  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
3578    {
3579      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3580      enum tree_code use_code;
3581      tree addop, mulop1 = op1, result = mul_result;
3582      bool negate_p = false;
3583
3584      if (is_gimple_debug (use_stmt))
3585	continue;
3586
3587      use_code = gimple_assign_rhs_code (use_stmt);
3588      if (use_code == NEGATE_EXPR)
3589	{
3590	  result = gimple_assign_lhs (use_stmt);
3591	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
3592	  gsi_remove (&gsi, true);
3593	  release_defs (use_stmt);
3594
3595	  use_stmt = neguse_stmt;
3596	  gsi = gsi_for_stmt (use_stmt);
3597	  use_code = gimple_assign_rhs_code (use_stmt);
3598	  negate_p = true;
3599	}
3600
3601      if (gimple_assign_rhs1 (use_stmt) == result)
3602	{
3603	  addop = gimple_assign_rhs2 (use_stmt);
3604	  /* a * b - c -> a * b + (-c)  */
3605	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3606	    addop = force_gimple_operand_gsi (&gsi,
3607					      build1 (NEGATE_EXPR,
3608						      type, addop),
3609					      true, NULL_TREE, true,
3610					      GSI_SAME_STMT);
3611	}
3612      else
3613	{
3614	  addop = gimple_assign_rhs1 (use_stmt);
3615	  /* a - b * c -> (-b) * c + a */
3616	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3617	    negate_p = !negate_p;
3618	}
3619
3620      if (negate_p)
3621	mulop1 = force_gimple_operand_gsi (&gsi,
3622					   build1 (NEGATE_EXPR,
3623						   type, mulop1),
3624					   true, NULL_TREE, true,
3625					   GSI_SAME_STMT);
3626
3627      fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
3628				      FMA_EXPR, mulop1, op2, addop);
3629      gsi_replace (&gsi, fma_stmt, true);
3630      widen_mul_stats.fmas_inserted++;
3631    }
3632
3633  return true;
3634}
3635
3636
3637/* Helper function of match_uaddsub_overflow.  Return 1
3638   if USE_STMT is an unsigned overflow check of the form ovf != 0
3639   for STMT, -1 if USE_STMT is the inverted check ovf == 0,
3640   and 0 otherwise.  */
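
/* For instance (an illustrative sketch): with STMT being r = y - z, a use
   "if (r > y)" is the overflow check for ovf != 0, so 1 is returned, while
   "if (y >= r)" checks ovf == 0 and yields -1; any other use yields 0.  */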
3641
3642static int
3643uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3644{
3645  enum tree_code ccode = ERROR_MARK;
3646  tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3647  if (gimple_code (use_stmt) == GIMPLE_COND)
3648    {
3649      ccode = gimple_cond_code (use_stmt);
3650      crhs1 = gimple_cond_lhs (use_stmt);
3651      crhs2 = gimple_cond_rhs (use_stmt);
3652    }
3653  else if (is_gimple_assign (use_stmt))
3654    {
3655      if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3656	{
3657	  ccode = gimple_assign_rhs_code (use_stmt);
3658	  crhs1 = gimple_assign_rhs1 (use_stmt);
3659	  crhs2 = gimple_assign_rhs2 (use_stmt);
3660	}
3661      else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3662	{
3663	  tree cond = gimple_assign_rhs1 (use_stmt);
3664	  if (COMPARISON_CLASS_P (cond))
3665	    {
3666	      ccode = TREE_CODE (cond);
3667	      crhs1 = TREE_OPERAND (cond, 0);
3668	      crhs2 = TREE_OPERAND (cond, 1);
3669	    }
3670	  else
3671	    return 0;
3672	}
3673      else
3674	return 0;
3675    }
3676  else
3677    return 0;
3678
3679  if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3680    return 0;
3681
3682  enum tree_code code = gimple_assign_rhs_code (stmt);
3683  tree lhs = gimple_assign_lhs (stmt);
3684  tree rhs1 = gimple_assign_rhs1 (stmt);
3685  tree rhs2 = gimple_assign_rhs2 (stmt);
3686
3687  switch (ccode)
3688    {
3689    case GT_EXPR:
3690    case LE_EXPR:
3691      /* r = a - b; r > a or r <= a
3692	 r = a + b; a > r or a <= r or b > r or b <= r.  */
3693      if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3694	  || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3695	      && crhs2 == lhs))
3696	return ccode == GT_EXPR ? 1 : -1;
3697      break;
3698    case LT_EXPR:
3699    case GE_EXPR:
3700      /* r = a - b; a < r or a >= r
3701	 r = a + b; r < a or r >= a or r < b or r >= b.  */
3702      if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3703	  || (code == PLUS_EXPR && crhs1 == lhs
3704	      && (crhs2 == rhs1 || crhs2 == rhs2)))
3705	return ccode == LT_EXPR ? 1 : -1;
3706      break;
3707    default:
3708      break;
3709    }
3710  return 0;
3711}
3712
3713/* Recognize for unsigned x
3714   x = y - z;
3715   if (x > y)
3716   where there are other uses of x and replace it with
3717   _7 = SUB_OVERFLOW (y, z);
3718   x = REALPART_EXPR <_7>;
3719   _8 = IMAGPART_EXPR <_7>;
3720   if (_8)
3721   and similarly for addition.  */
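
/* A sketch of the corresponding addition case (illustrative only; the
   comparison forms accepted are those listed in uaddsub_overflow_check_p):
   x = y + z;
   if (x < y)
   becomes
   _7 = ADD_OVERFLOW (y, z);
   x = REALPART_EXPR <_7>;
   _8 = IMAGPART_EXPR <_7>;
   if (_8)  */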
3722
3723static bool
3724match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3725			enum tree_code code)
3726{
3727  tree lhs = gimple_assign_lhs (stmt);
3728  tree type = TREE_TYPE (lhs);
3729  use_operand_p use_p;
3730  imm_use_iterator iter;
3731  bool use_seen = false;
3732  bool ovf_use_seen = false;
3733  gimple *use_stmt;
3734
3735  gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3736  if (!INTEGRAL_TYPE_P (type)
3737      || !TYPE_UNSIGNED (type)
3738      || has_zero_uses (lhs)
3739      || has_single_use (lhs)
3740      || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3741			TYPE_MODE (type)) == CODE_FOR_nothing)
3742    return false;
3743
3744  FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3745    {
3746      use_stmt = USE_STMT (use_p);
3747      if (is_gimple_debug (use_stmt))
3748	continue;
3749
3750      if (uaddsub_overflow_check_p (stmt, use_stmt))
3751	ovf_use_seen = true;
3752      else
3753	use_seen = true;
3754      if (ovf_use_seen && use_seen)
3755	break;
3756    }
3757
3758  if (!ovf_use_seen || !use_seen)
3759    return false;
3760
3761  tree ctype = build_complex_type (type);
3762  tree rhs1 = gimple_assign_rhs1 (stmt);
3763  tree rhs2 = gimple_assign_rhs2 (stmt);
3764  gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3765					 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3766					 2, rhs1, rhs2);
3767  tree ctmp = make_ssa_name (ctype);
3768  gimple_call_set_lhs (g, ctmp);
3769  gsi_insert_before (gsi, g, GSI_SAME_STMT);
3770  gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3771				     build1 (REALPART_EXPR, type, ctmp));
3772  gsi_replace (gsi, g2, true);
3773  tree ovf = make_ssa_name (type);
3774  g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3775			    build1 (IMAGPART_EXPR, type, ctmp));
3776  gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3777
3778  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3779    {
3780      if (is_gimple_debug (use_stmt))
3781	continue;
3782
3783      int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3784      if (ovf_use == 0)
3785	continue;
3786      if (gimple_code (use_stmt) == GIMPLE_COND)
3787	{
3788	  gcond *cond_stmt = as_a <gcond *> (use_stmt);
3789	  gimple_cond_set_lhs (cond_stmt, ovf);
3790	  gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3791	  gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3792	}
3793      else
3794	{
3795	  gcc_checking_assert (is_gimple_assign (use_stmt));
3796	  if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3797	    {
3798	      gimple_assign_set_rhs1 (use_stmt, ovf);
3799	      gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3800	      gimple_assign_set_rhs_code (use_stmt,
3801					  ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3802	    }
3803	  else
3804	    {
3805	      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3806				   == COND_EXPR);
3807	      tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3808				  boolean_type_node, ovf,
3809				  build_int_cst (type, 0));
3810	      gimple_assign_set_rhs1 (use_stmt, cond);
3811	    }
3812	}
3813      update_stmt (use_stmt);
3814    }
3815  return true;
3816}
3817
3818/* Return true if target has support for divmod.  */
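
/* In other words (see the checks below): either the target provides a
   hardware divmod<mode>4/udivmod<mode>4 pattern, or it has a divmod
   libfunc, no plain division insn in this or any wider mode, and it
   implements the expand_divmod_libfunc hook.  */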
3819
3820static bool
3821target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3822{
3823  /* If target supports hardware divmod insn, use it for divmod.  */
3824  if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3825    return true;
3826
3827  /* Check if libfunc for divmod is available.  */
3828  rtx libfunc = optab_libfunc (divmod_optab, mode);
3829  if (libfunc != NULL_RTX)
3830    {
3831      /* If optab_handler exists for div_optab, perhaps in a wider mode,
3832	 we don't want to use the libfunc even if it exists for the given mode.  */
3833      for (machine_mode div_mode = mode;
3834	   div_mode != VOIDmode;
3835	   div_mode = GET_MODE_WIDER_MODE (div_mode))
3836	if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3837	  return false;
3838
3839      return targetm.expand_divmod_libfunc != NULL;
3840    }
3841
3842  return false;
3843}
3844
3845/* Check if stmt is a candidate for the divmod transform.  */
3846
3847static bool
3848divmod_candidate_p (gassign *stmt)
3849{
3850  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3851  enum machine_mode mode = TYPE_MODE (type);
3852  optab divmod_optab, div_optab;
3853
3854  if (TYPE_UNSIGNED (type))
3855    {
3856      divmod_optab = udivmod_optab;
3857      div_optab = udiv_optab;
3858    }
3859  else
3860    {
3861      divmod_optab = sdivmod_optab;
3862      div_optab = sdiv_optab;
3863    }
3864
3865  tree op1 = gimple_assign_rhs1 (stmt);
3866  tree op2 = gimple_assign_rhs2 (stmt);
3867
3868  /* Disable the transform if either operand is a constant, since
3869     division-by-constant may have specialized expansion.  */
3870  if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3871    return false;
3872
3873  /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3874     expand using the [su]divv optabs.  */
3875  if (TYPE_OVERFLOW_TRAPS (type))
3876    return false;
3877
3878  if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3879    return false;
3880
3881  return true;
3882}
3883
3884/* This function looks for:
3885   t1 = a TRUNC_DIV_EXPR b;
3886   t2 = a TRUNC_MOD_EXPR b;
3887   and transforms it to the following sequence:
3888   complex_tmp = DIVMOD (a, b);
3889   t1 = REALPART_EXPR(complex_tmp);
3890   t2 = IMAGPART_EXPR(complex_tmp);
3891   For conditions enabling the transform see divmod_candidate_p().
3892
3893   The transform has three parts:
3894   1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
3895      other trunc_div_expr and trunc_mod_expr stmts.
3896   2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3897      to stmts vector.
3898   3) Insert DIVMOD call just before top_stmt and update entries in
3899      stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3900      IMAGPART_EXPR for mod).  */
3901
3902static bool
3903convert_to_divmod (gassign *stmt)
3904{
3905  if (stmt_can_throw_internal (stmt)
3906      || !divmod_candidate_p (stmt))
3907    return false;
3908
3909  tree op1 = gimple_assign_rhs1 (stmt);
3910  tree op2 = gimple_assign_rhs2 (stmt);
3911
3912  imm_use_iterator use_iter;
3913  gimple *use_stmt;
3914  auto_vec<gimple *> stmts;
3915
3916  gimple *top_stmt = stmt;
3917  basic_block top_bb = gimple_bb (stmt);
3918
3919  /* Part 1: Try to set top_stmt to "topmost" stmt that dominates
3920     at least stmt and possibly other trunc_div/trunc_mod stmts
3921     having the same operands as stmt.  */
3922
3923  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3924    {
3925      if (is_gimple_assign (use_stmt)
3926	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3927	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3928	  && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3929	  && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3930	{
3931	  if (stmt_can_throw_internal (use_stmt))
3932	    continue;
3933
3934	  basic_block bb = gimple_bb (use_stmt);
3935
3936	  if (bb == top_bb)
3937	    {
3938	      if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3939		top_stmt = use_stmt;
3940	    }
3941	  else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3942	    {
3943	      top_bb = bb;
3944	      top_stmt = use_stmt;
3945	    }
3946	}
3947    }
3948
3949  tree top_op1 = gimple_assign_rhs1 (top_stmt);
3950  tree top_op2 = gimple_assign_rhs2 (top_stmt);
3951
3952  stmts.safe_push (top_stmt);
3953  bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3954
3955  /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3956     to the stmts vector.  This loop will always add stmt itself, since
3957     gimple_bb (top_stmt) dominates gimple_bb (stmt), so it ends up
3958     adding at least a single trunc_mod_expr stmt.  */
3959
3960  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3961    {
3962      if (is_gimple_assign (use_stmt)
3963	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3964	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3965	  && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3966	  && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3967	{
3968	  if (use_stmt == top_stmt
3969	      || stmt_can_throw_internal (use_stmt)
3970	      || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3971	    continue;
3972
3973	  stmts.safe_push (use_stmt);
3974	  if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3975	    div_seen = true;
3976	}
3977    }
3978
3979  if (!div_seen)
3980    return false;
3981
3982  /* Part 3: Create a call to the internal function DIVMOD:
3983     divmod_tmp = DIVMOD (op1, op2).  */
3984
3985  gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3986  tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3987				 call_stmt, "divmod_tmp");
3988  gimple_call_set_lhs (call_stmt, res);
3989
3990  /* Insert the call before top_stmt.  */
3991  gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3992  gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3993
3994  widen_mul_stats.divmod_calls_inserted++;
3995
3996  /* Update all statements in stmts vector:
3997     lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3998     lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */
3999
4000  for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
4001    {
4002      tree new_rhs;
4003
4004      switch (gimple_assign_rhs_code (use_stmt))
4005	{
4006	  case TRUNC_DIV_EXPR:
4007	    new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
4008	    break;
4009
4010	  case TRUNC_MOD_EXPR:
4011	    new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
4012	    break;
4013
4014	  default:
4015	    gcc_unreachable ();
4016	}
4017
4018      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
4019      gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
4020      update_stmt (use_stmt);
4021    }
4022
4023  return true;
4024}
4025
4026/* Find integer multiplications where the operands are extended from
4027   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
4028   where appropriate.  */
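
/* For example (a sketch): with 32-bit int operands a and b,
     long long r = (long long) a * (long long) b;
   can be rewritten to use WIDEN_MULT_EXPR (r = a w* b) when the target
   provides a widening multiply for that mode pair.  */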
4029
4030namespace {
4031
4032const pass_data pass_data_optimize_widening_mul =
4033{
4034  GIMPLE_PASS, /* type */
4035  "widening_mul", /* name */
4036  OPTGROUP_NONE, /* optinfo_flags */
4037  TV_NONE, /* tv_id */
4038  PROP_ssa, /* properties_required */
4039  0, /* properties_provided */
4040  0, /* properties_destroyed */
4041  0, /* todo_flags_start */
4042  TODO_update_ssa, /* todo_flags_finish */
4043};
4044
4045class pass_optimize_widening_mul : public gimple_opt_pass
4046{
4047public:
4048  pass_optimize_widening_mul (gcc::context *ctxt)
4049    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
4050  {}
4051
4052  /* opt_pass methods: */
4053  virtual bool gate (function *)
4054    {
4055      return flag_expensive_optimizations && optimize;
4056    }
4057
4058  virtual unsigned int execute (function *);
4059
4060}; // class pass_optimize_widening_mul
4061
4062unsigned int
4063pass_optimize_widening_mul::execute (function *fun)
4064{
4065  basic_block bb;
4066  bool cfg_changed = false;
4067
4068  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
4069  calculate_dominance_info (CDI_DOMINATORS);
4070  renumber_gimple_stmt_uids ();
4071
4072  FOR_EACH_BB_FN (bb, fun)
4073    {
4074      gimple_stmt_iterator gsi;
4075
4076      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
4077        {
4078	  gimple *stmt = gsi_stmt (gsi);
4079	  enum tree_code code;
4080
4081	  if (is_gimple_assign (stmt))
4082	    {
4083	      code = gimple_assign_rhs_code (stmt);
4084	      switch (code)
4085		{
4086		case MULT_EXPR:
4087		  if (!convert_mult_to_widen (stmt, &gsi)
4088		      && convert_mult_to_fma (stmt,
4089					      gimple_assign_rhs1 (stmt),
4090					      gimple_assign_rhs2 (stmt)))
4091		    {
4092		      gsi_remove (&gsi, true);
4093		      release_defs (stmt);
4094		      continue;
4095		    }
4096		  break;
4097
4098		case PLUS_EXPR:
4099		case MINUS_EXPR:
4100		  if (!convert_plusminus_to_widen (&gsi, stmt, code))
4101		    match_uaddsub_overflow (&gsi, stmt, code);
4102		  break;
4103
4104		case TRUNC_MOD_EXPR:
4105		  convert_to_divmod (as_a<gassign *> (stmt));
4106		  break;
4107
4108		default:;
4109		}
4110	    }
4111	  else if (is_gimple_call (stmt)
4112		   && gimple_call_lhs (stmt))
4113	    {
4114	      tree fndecl = gimple_call_fndecl (stmt);
4115	      if (fndecl
4116		  && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
4117		{
4118		  switch (DECL_FUNCTION_CODE (fndecl))
4119		    {
4120		      case BUILT_IN_POWF:
4121		      case BUILT_IN_POW:
4122		      case BUILT_IN_POWL:
4123			if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
4124			    && real_equal
4125			         (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
4126				  &dconst2)
4127			    && convert_mult_to_fma (stmt,
4128						    gimple_call_arg (stmt, 0),
4129						    gimple_call_arg (stmt, 0)))
4130			  {
4131			    unlink_stmt_vdef (stmt);
4132			    if (gsi_remove (&gsi, true)
4133				&& gimple_purge_dead_eh_edges (bb))
4134			      cfg_changed = true;
4135			    release_defs (stmt);
4136			    continue;
4137			  }
4138			break;
4139
4140		      default:;
4141		    }
4142		}
4143	    }
4144	  gsi_next (&gsi);
4145	}
4146    }
4147
4148  statistics_counter_event (fun, "widening multiplications inserted",
4149			    widen_mul_stats.widen_mults_inserted);
4150  statistics_counter_event (fun, "widening maccs inserted",
4151			    widen_mul_stats.maccs_inserted);
4152  statistics_counter_event (fun, "fused multiply-adds inserted",
4153			    widen_mul_stats.fmas_inserted);
4154  statistics_counter_event (fun, "divmod calls inserted",
4155			    widen_mul_stats.divmod_calls_inserted);
4156
4157  return cfg_changed ? TODO_cleanup_cfg : 0;
4158}
4159
4160} // anon namespace
4161
4162gimple_opt_pass *
4163make_pass_optimize_widening_mul (gcc::context *ctxt)
4164{
4165  return new pass_optimize_widening_mul (ctxt);
4166}
4167