1169689Skan/* Global, SSA-based optimizations using mathematical identities.
2169689Skan   Copyright (C) 2005 Free Software Foundation, Inc.
3169689Skan
4169689SkanThis file is part of GCC.
5169689Skan
6169689SkanGCC is free software; you can redistribute it and/or modify it
7169689Skanunder the terms of the GNU General Public License as published by the
8169689SkanFree Software Foundation; either version 2, or (at your option) any
9169689Skanlater version.
10169689Skan
11169689SkanGCC is distributed in the hope that it will be useful, but WITHOUT
12169689SkanANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13169689SkanFITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14169689Skanfor more details.
15169689Skan
16169689SkanYou should have received a copy of the GNU General Public License
17169689Skanalong with GCC; see the file COPYING.  If not, write to the Free
18169689SkanSoftware Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
19169689Skan02110-1301, USA.  */
20169689Skan
21169689Skan/* Currently, the only mini-pass in this file tries to CSE reciprocal
22169689Skan   operations.  These are common in sequences such as this one:
23169689Skan
24169689Skan	modulus = sqrt(x*x + y*y + z*z);
25169689Skan	x = x / modulus;
26169689Skan	y = y / modulus;
27169689Skan	z = z / modulus;
28169689Skan
29169689Skan   that can be optimized to
30169689Skan
31169689Skan	modulus = sqrt(x*x + y*y + z*z);
32169689Skan        rmodulus = 1.0 / modulus;
33169689Skan	x = x * rmodulus;
34169689Skan	y = y * rmodulus;
35169689Skan	z = z * rmodulus;
36169689Skan
37169689Skan   We do this for loop invariant divisors, and with this pass whenever
38169689Skan   we notice that a division has the same divisor multiple times.
39169689Skan
40169689Skan   Of course, like in PRE, we don't insert a division if a dominator
41169689Skan   already has one.  However, this cannot be done as an extension of
42169689Skan   PRE for several reasons.
43169689Skan
44169689Skan   First of all, with some experiments it was found out that the
45169689Skan   transformation is not always useful if there are only two divisions
46169689Skan   hy the same divisor.  This is probably because modern processors
47169689Skan   can pipeline the divisions; on older, in-order processors it should
48169689Skan   still be effective to optimize two divisions by the same number.
49169689Skan   We make this a param, and it shall be called N in the remainder of
50169689Skan   this comment.
51169689Skan
52169689Skan   Second, if trapping math is active, we have less freedom on where
53169689Skan   to insert divisions: we can only do so in basic blocks that already
54169689Skan   contain one.  (If divisions don't trap, instead, we can insert
55169689Skan   divisions elsewhere, which will be in blocks that are common dominators
56169689Skan   of those that have the division).
57169689Skan
58169689Skan   We really don't want to compute the reciprocal unless a division will
59169689Skan   be found.  To do this, we won't insert the division in a basic block
60169689Skan   that has less than N divisions *post-dominating* it.
61169689Skan
62169689Skan   The algorithm constructs a subset of the dominator tree, holding the
63169689Skan   blocks containing the divisions and the common dominators to them,
64169689Skan   and walk it twice.  The first walk is in post-order, and it annotates
65169689Skan   each block with the number of divisions that post-dominate it: this
66169689Skan   gives information on where divisions can be inserted profitably.
67169689Skan   The second walk is in pre-order, and it inserts divisions as explained
68169689Skan   above, and replaces divisions by multiplications.
69169689Skan
70169689Skan   In the best case, the cost of the pass is O(n_statements).  In the
71169689Skan   worst-case, the cost is due to creating the dominator tree subset,
72169689Skan   with a cost of O(n_basic_blocks ^ 2); however this can only happen
73169689Skan   for n_statements / n_basic_blocks statements.  So, the amortized cost
74169689Skan   of creating the dominator tree subset is O(n_basic_blocks) and the
75169689Skan   worst-case cost of the pass is O(n_statements * n_basic_blocks).
76169689Skan
77169689Skan   More practically, the cost will be small because there are few
78169689Skan   divisions, and they tend to be in the same basic block, so insert_bb
79169689Skan   is called very few times.
80169689Skan
81169689Skan   If we did this using domwalk.c, an efficient implementation would have
82169689Skan   to work on all the variables in a single pass, because we could not
83169689Skan   work on just a subset of the dominator tree, as we do now, and the
84169689Skan   cost would also be something like O(n_statements * n_basic_blocks).
85169689Skan   The data structures would be more complex in order to work on all the
86169689Skan   variables in a single pass.  */
87169689Skan
88169689Skan#include "config.h"
89169689Skan#include "system.h"
90169689Skan#include "coretypes.h"
91169689Skan#include "tm.h"
92169689Skan#include "flags.h"
93169689Skan#include "tree.h"
94169689Skan#include "tree-flow.h"
95169689Skan#include "real.h"
96169689Skan#include "timevar.h"
97169689Skan#include "tree-pass.h"
98169689Skan#include "alloc-pool.h"
99169689Skan#include "basic-block.h"
100169689Skan#include "target.h"
101169689Skan
102169689Skan
103169689Skan/* This structure represents one basic block that either computes a
104169689Skan   division, or is a common dominator for basic block that compute a
105169689Skan   division.  */
106169689Skanstruct occurrence {
107169689Skan  /* The basic block represented by this structure.  */
108169689Skan  basic_block bb;
109169689Skan
110169689Skan  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
111169689Skan     inserted in BB.  */
112169689Skan  tree recip_def;
113169689Skan
114169689Skan  /* If non-NULL, the MODIFY_EXPR for a reciprocal computation that
115169689Skan     was inserted in BB.  */
116169689Skan  tree recip_def_stmt;
117169689Skan
118169689Skan  /* Pointer to a list of "struct occurrence"s for blocks dominated
119169689Skan     by BB.  */
120169689Skan  struct occurrence *children;
121169689Skan
122169689Skan  /* Pointer to the next "struct occurrence"s in the list of blocks
123169689Skan     sharing a common dominator.  */
124169689Skan  struct occurrence *next;
125169689Skan
126169689Skan  /* The number of divisions that are in BB before compute_merit.  The
127169689Skan     number of divisions that are in BB or post-dominate it after
128169689Skan     compute_merit.  */
129169689Skan  int num_divisions;
130169689Skan
131169689Skan  /* True if the basic block has a division, false if it is a common
132169689Skan     dominator for basic blocks that do.  If it is false and trapping
133169689Skan     math is active, BB is not a candidate for inserting a reciprocal.  */
134169689Skan  bool bb_has_division;
135169689Skan};
136169689Skan
137169689Skan
138169689Skan/* The instance of "struct occurrence" representing the highest
139169689Skan   interesting block in the dominator tree.  */
140169689Skanstatic struct occurrence *occ_head;
141169689Skan
142169689Skan/* Allocation pool for getting instances of "struct occurrence".  */
143169689Skanstatic alloc_pool occ_pool;
144169689Skan
145169689Skan
146169689Skan
147169689Skan/* Allocate and return a new struct occurrence for basic block BB, and
148169689Skan   whose children list is headed by CHILDREN.  */
149169689Skanstatic struct occurrence *
150169689Skanocc_new (basic_block bb, struct occurrence *children)
151169689Skan{
152169689Skan  struct occurrence *occ;
153169689Skan
154169689Skan  occ = bb->aux = pool_alloc (occ_pool);
155169689Skan  memset (occ, 0, sizeof (struct occurrence));
156169689Skan
157169689Skan  occ->bb = bb;
158169689Skan  occ->children = children;
159169689Skan  return occ;
160169689Skan}
161169689Skan
162169689Skan
163169689Skan/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
164169689Skan   list of "struct occurrence"s, one per basic block, having IDOM as
165169689Skan   their common dominator.
166169689Skan
167169689Skan   We try to insert NEW_OCC as deep as possible in the tree, and we also
168169689Skan   insert any other block that is a common dominator for BB and one
169169689Skan   block already in the tree.  */
170169689Skan
171169689Skanstatic void
172169689Skaninsert_bb (struct occurrence *new_occ, basic_block idom,
173169689Skan	   struct occurrence **p_head)
174169689Skan{
175169689Skan  struct occurrence *occ, **p_occ;
176169689Skan
177169689Skan  for (p_occ = p_head; (occ = *p_occ) != NULL; )
178169689Skan    {
179169689Skan      basic_block bb = new_occ->bb, occ_bb = occ->bb;
180169689Skan      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
181169689Skan      if (dom == bb)
182169689Skan	{
183169689Skan	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
184169689Skan	     from its list.  */
185169689Skan	  *p_occ = occ->next;
186169689Skan	  occ->next = new_occ->children;
187169689Skan	  new_occ->children = occ;
188169689Skan
189169689Skan	  /* Try the next block (it may as well be dominated by BB).  */
190169689Skan	}
191169689Skan
192169689Skan      else if (dom == occ_bb)
193169689Skan	{
194169689Skan	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
195169689Skan	  insert_bb (new_occ, dom, &occ->children);
196169689Skan	  return;
197169689Skan	}
198169689Skan
199169689Skan      else if (dom != idom)
200169689Skan	{
201169689Skan	  gcc_assert (!dom->aux);
202169689Skan
203169689Skan	  /* There is a dominator between IDOM and BB, add it and make
204169689Skan	     two children out of NEW_OCC and OCC.  First, remove OCC from
205169689Skan	     its list.	*/
206169689Skan	  *p_occ = occ->next;
207169689Skan	  new_occ->next = occ;
208169689Skan	  occ->next = NULL;
209169689Skan
210169689Skan	  /* None of the previous blocks has DOM as a dominator: if we tail
211169689Skan	     recursed, we would reexamine them uselessly. Just switch BB with
212169689Skan	     DOM, and go on looking for blocks dominated by DOM.  */
213169689Skan          new_occ = occ_new (dom, new_occ);
214169689Skan	}
215169689Skan
216169689Skan      else
217169689Skan	{
218169689Skan	  /* Nothing special, go on with the next element.  */
219169689Skan	  p_occ = &occ->next;
220169689Skan	}
221169689Skan    }
222169689Skan
223169689Skan  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
224169689Skan  new_occ->next = *p_head;
225169689Skan  *p_head = new_occ;
226169689Skan}
227169689Skan
228169689Skan/* Register that we found a division in BB.  */
229169689Skan
230169689Skanstatic inline void
231169689Skanregister_division_in (basic_block bb)
232169689Skan{
233169689Skan  struct occurrence *occ;
234169689Skan
235169689Skan  occ = (struct occurrence *) bb->aux;
236169689Skan  if (!occ)
237169689Skan    {
238169689Skan      occ = occ_new (bb, NULL);
239169689Skan      insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
240169689Skan    }
241169689Skan
242169689Skan  occ->bb_has_division = true;
243169689Skan  occ->num_divisions++;
244169689Skan}
245169689Skan
246169689Skan
247169689Skan/* Compute the number of divisions that postdominate each block in OCC and
248169689Skan   its children.  */
249169689Skan
250169689Skanstatic void
251169689Skancompute_merit (struct occurrence *occ)
252169689Skan{
253169689Skan  struct occurrence *occ_child;
254169689Skan  basic_block dom = occ->bb;
255169689Skan
256169689Skan  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
257169689Skan    {
258169689Skan      basic_block bb;
259169689Skan      if (occ_child->children)
260169689Skan        compute_merit (occ_child);
261169689Skan
262169689Skan      if (flag_exceptions)
263169689Skan	bb = single_noncomplex_succ (dom);
264169689Skan      else
265169689Skan	bb = dom;
266169689Skan
267169689Skan      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
268169689Skan        occ->num_divisions += occ_child->num_divisions;
269169689Skan    }
270169689Skan}
271169689Skan
272169689Skan
273169689Skan/* Return whether USE_STMT is a floating-point division by DEF.  */
274169689Skanstatic inline bool
275169689Skanis_division_by (tree use_stmt, tree def)
276169689Skan{
277169689Skan  return TREE_CODE (use_stmt) == MODIFY_EXPR
278169689Skan	 && TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR
279169689Skan	 && TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def;
280169689Skan}
281169689Skan
282169689Skan/* Walk the subset of the dominator tree rooted at OCC, setting the
283169689Skan   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
284169689Skan   the given basic block.  The field may be left NULL, of course,
285169689Skan   if it is not possible or profitable to do the optimization.
286169689Skan
287169689Skan   DEF_BSI is an iterator pointing at the statement defining DEF.
288169689Skan   If RECIP_DEF is set, a dominator already has a computation that can
289169689Skan   be used.  */
290169689Skan
291169689Skanstatic void
292169689Skaninsert_reciprocals (block_stmt_iterator *def_bsi, struct occurrence *occ,
293169689Skan		    tree def, tree recip_def, int threshold)
294169689Skan{
295169689Skan  tree type, new_stmt;
296169689Skan  block_stmt_iterator bsi;
297169689Skan  struct occurrence *occ_child;
298169689Skan
299169689Skan  if (!recip_def
300169689Skan      && (occ->bb_has_division || !flag_trapping_math)
301169689Skan      && occ->num_divisions >= threshold)
302169689Skan    {
303169689Skan      /* Make a variable with the replacement and substitute it.  */
304169689Skan      type = TREE_TYPE (def);
305169689Skan      recip_def = make_rename_temp (type, "reciptmp");
306169689Skan      new_stmt = build2 (MODIFY_EXPR, void_type_node, recip_def,
307169689Skan		         fold_build2 (RDIV_EXPR, type, build_one_cst (type),
308169689Skan				      def));
309169689Skan
310169689Skan
311169689Skan      if (occ->bb_has_division)
312169689Skan        {
313169689Skan          /* Case 1: insert before an existing division.  */
314169689Skan          bsi = bsi_after_labels (occ->bb);
315169689Skan          while (!bsi_end_p (bsi) && !is_division_by (bsi_stmt (bsi), def))
316169689Skan	    bsi_next (&bsi);
317169689Skan
318169689Skan          bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT);
319169689Skan        }
320169689Skan      else if (def_bsi && occ->bb == def_bsi->bb)
321169689Skan        {
322169689Skan          /* Case 2: insert right after the definition.  Note that this will
323169689Skan	     never happen if the definition statement can throw, because in
324169689Skan	     that case the sole successor of the statement's basic block will
325169689Skan	     dominate all the uses as well.  */
326169689Skan          bsi_insert_after (def_bsi, new_stmt, BSI_NEW_STMT);
327169689Skan        }
328169689Skan      else
329169689Skan        {
330169689Skan          /* Case 3: insert in a basic block not containing defs/uses.  */
331169689Skan          bsi = bsi_after_labels (occ->bb);
332169689Skan          bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT);
333169689Skan        }
334169689Skan
335169689Skan      occ->recip_def_stmt = new_stmt;
336169689Skan    }
337169689Skan
338169689Skan  occ->recip_def = recip_def;
339169689Skan  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
340169689Skan    insert_reciprocals (def_bsi, occ_child, def, recip_def, threshold);
341169689Skan}
342169689Skan
343169689Skan
344169689Skan/* Replace the division at USE_P with a multiplication by the reciprocal, if
345169689Skan   possible.  */
346169689Skan
347169689Skanstatic inline void
348169689Skanreplace_reciprocal (use_operand_p use_p)
349169689Skan{
350169689Skan  tree use_stmt = USE_STMT (use_p);
351169689Skan  basic_block bb = bb_for_stmt (use_stmt);
352169689Skan  struct occurrence *occ = (struct occurrence *) bb->aux;
353169689Skan
354169689Skan  if (occ->recip_def && use_stmt != occ->recip_def_stmt)
355169689Skan    {
356169689Skan      TREE_SET_CODE (TREE_OPERAND (use_stmt, 1), MULT_EXPR);
357169689Skan      SET_USE (use_p, occ->recip_def);
358169689Skan      fold_stmt_inplace (use_stmt);
359169689Skan      update_stmt (use_stmt);
360169689Skan    }
361169689Skan}
362169689Skan
363169689Skan
364169689Skan/* Free OCC and return one more "struct occurrence" to be freed.  */
365169689Skan
366169689Skanstatic struct occurrence *
367169689Skanfree_bb (struct occurrence *occ)
368169689Skan{
369169689Skan  struct occurrence *child, *next;
370169689Skan
371169689Skan  /* First get the two pointers hanging off OCC.  */
372169689Skan  next = occ->next;
373169689Skan  child = occ->children;
374169689Skan  occ->bb->aux = NULL;
375169689Skan  pool_free (occ_pool, occ);
376169689Skan
377169689Skan  /* Now ensure that we don't recurse unless it is necessary.  */
378169689Skan  if (!child)
379169689Skan    return next;
380169689Skan  else
381169689Skan    {
382169689Skan      while (next)
383169689Skan	next = free_bb (next);
384169689Skan
385169689Skan      return child;
386169689Skan    }
387169689Skan}
388169689Skan
389169689Skan
390169689Skan/* Look for floating-point divisions among DEF's uses, and try to
391169689Skan   replace them by multiplications with the reciprocal.  Add
392169689Skan   as many statements computing the reciprocal as needed.
393169689Skan
394169689Skan   DEF must be a GIMPLE register of a floating-point type.  */
395169689Skan
396169689Skanstatic void
397169689Skanexecute_cse_reciprocals_1 (block_stmt_iterator *def_bsi, tree def)
398169689Skan{
399169689Skan  use_operand_p use_p;
400169689Skan  imm_use_iterator use_iter;
401169689Skan  struct occurrence *occ;
402169689Skan  int count = 0, threshold;
403169689Skan
404169689Skan  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
405169689Skan
406169689Skan  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
407169689Skan    {
408169689Skan      tree use_stmt = USE_STMT (use_p);
409169689Skan      if (is_division_by (use_stmt, def))
410169689Skan	{
411169689Skan	  register_division_in (bb_for_stmt (use_stmt));
412169689Skan	  count++;
413169689Skan	}
414169689Skan    }
415169689Skan
416169689Skan  /* Do the expensive part only if we can hope to optimize something.  */
417169689Skan  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
418169689Skan  if (count >= threshold)
419169689Skan    {
420169689Skan      tree use_stmt;
421169689Skan      for (occ = occ_head; occ; occ = occ->next)
422169689Skan	{
423169689Skan	  compute_merit (occ);
424169689Skan	  insert_reciprocals (def_bsi, occ, def, NULL, threshold);
425169689Skan	}
426169689Skan
427169689Skan      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
428169689Skan	{
429169689Skan	  if (is_division_by (use_stmt, def))
430169689Skan	    {
431169689Skan	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
432169689Skan		replace_reciprocal (use_p);
433169689Skan	    }
434169689Skan	}
435169689Skan    }
436169689Skan
437169689Skan  for (occ = occ_head; occ; )
438169689Skan    occ = free_bb (occ);
439169689Skan
440169689Skan  occ_head = NULL;
441169689Skan}
442169689Skan
443169689Skan
444169689Skanstatic bool
445169689Skangate_cse_reciprocals (void)
446169689Skan{
447169689Skan  return optimize && !optimize_size && flag_unsafe_math_optimizations;
448169689Skan}
449169689Skan
450169689Skan
451169689Skan/* Go through all the floating-point SSA_NAMEs, and call
452169689Skan   execute_cse_reciprocals_1 on each of them.  */
453169689Skanstatic unsigned int
454169689Skanexecute_cse_reciprocals (void)
455169689Skan{
456169689Skan  basic_block bb;
457169689Skan  tree arg;
458169689Skan
459169689Skan  occ_pool = create_alloc_pool ("dominators for recip",
460169689Skan				sizeof (struct occurrence),
461169689Skan				n_basic_blocks / 3 + 1);
462169689Skan
463169689Skan  calculate_dominance_info (CDI_DOMINATORS);
464169689Skan  calculate_dominance_info (CDI_POST_DOMINATORS);
465169689Skan
466169689Skan#ifdef ENABLE_CHECKING
467169689Skan  FOR_EACH_BB (bb)
468169689Skan    gcc_assert (!bb->aux);
469169689Skan#endif
470169689Skan
471169689Skan  for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = TREE_CHAIN (arg))
472169689Skan    if (default_def (arg)
473169689Skan	&& FLOAT_TYPE_P (TREE_TYPE (arg))
474169689Skan	&& is_gimple_reg (arg))
475169689Skan      execute_cse_reciprocals_1 (NULL, default_def (arg));
476169689Skan
477169689Skan  FOR_EACH_BB (bb)
478169689Skan    {
479169689Skan      block_stmt_iterator bsi;
480169689Skan      tree phi, def;
481169689Skan
482169689Skan      for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi))
483169689Skan	{
484169689Skan	  def = PHI_RESULT (phi);
485169689Skan	  if (FLOAT_TYPE_P (TREE_TYPE (def))
486169689Skan	      && is_gimple_reg (def))
487169689Skan	    execute_cse_reciprocals_1 (NULL, def);
488169689Skan	}
489169689Skan
490169689Skan      for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
491169689Skan        {
492169689Skan	  tree stmt = bsi_stmt (bsi);
493169689Skan	  if (TREE_CODE (stmt) == MODIFY_EXPR
494169689Skan	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
495169689Skan	      && FLOAT_TYPE_P (TREE_TYPE (def))
496169689Skan	      && TREE_CODE (def) == SSA_NAME)
497169689Skan	    execute_cse_reciprocals_1 (&bsi, def);
498169689Skan	}
499169689Skan    }
500169689Skan
501169689Skan  free_dominance_info (CDI_DOMINATORS);
502169689Skan  free_dominance_info (CDI_POST_DOMINATORS);
503169689Skan  free_alloc_pool (occ_pool);
504169689Skan  return 0;
505169689Skan}
506169689Skan
507169689Skanstruct tree_opt_pass pass_cse_reciprocals =
508169689Skan{
509169689Skan  "recip",				/* name */
510169689Skan  gate_cse_reciprocals,			/* gate */
511169689Skan  execute_cse_reciprocals,		/* execute */
512169689Skan  NULL,					/* sub */
513169689Skan  NULL,					/* next */
514169689Skan  0,					/* static_pass_number */
515169689Skan  0,					/* tv_id */
516169689Skan  PROP_ssa,				/* properties_required */
517169689Skan  0,					/* properties_provided */
518169689Skan  0,					/* properties_destroyed */
519169689Skan  0,					/* todo_flags_start */
520169689Skan  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
521169689Skan    | TODO_verify_stmts,                /* todo_flags_finish */
522169689Skan  0				        /* letter */
523169689Skan};
524