1169689Skan/* Global, SSA-based optimizations using mathematical identities. 2169689Skan Copyright (C) 2005 Free Software Foundation, Inc. 3169689Skan 4169689SkanThis file is part of GCC. 5169689Skan 6169689SkanGCC is free software; you can redistribute it and/or modify it 7169689Skanunder the terms of the GNU General Public License as published by the 8169689SkanFree Software Foundation; either version 2, or (at your option) any 9169689Skanlater version. 10169689Skan 11169689SkanGCC is distributed in the hope that it will be useful, but WITHOUT 12169689SkanANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13169689SkanFITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14169689Skanfor more details. 15169689Skan 16169689SkanYou should have received a copy of the GNU General Public License 17169689Skanalong with GCC; see the file COPYING. If not, write to the Free 18169689SkanSoftware Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 19169689Skan02110-1301, USA. */ 20169689Skan 21169689Skan/* Currently, the only mini-pass in this file tries to CSE reciprocal 22169689Skan operations. These are common in sequences such as this one: 23169689Skan 24169689Skan modulus = sqrt(x*x + y*y + z*z); 25169689Skan x = x / modulus; 26169689Skan y = y / modulus; 27169689Skan z = z / modulus; 28169689Skan 29169689Skan that can be optimized to 30169689Skan 31169689Skan modulus = sqrt(x*x + y*y + z*z); 32169689Skan rmodulus = 1.0 / modulus; 33169689Skan x = x * rmodulus; 34169689Skan y = y * rmodulus; 35169689Skan z = z * rmodulus; 36169689Skan 37169689Skan We do this for loop invariant divisors, and with this pass whenever 38169689Skan we notice that a division has the same divisor multiple times. 39169689Skan 40169689Skan Of course, like in PRE, we don't insert a division if a dominator 41169689Skan already has one. However, this cannot be done as an extension of 42169689Skan PRE for several reasons. 43169689Skan 44169689Skan First of all, with some experiments it was found out that the 45169689Skan transformation is not always useful if there are only two divisions 46169689Skan hy the same divisor. This is probably because modern processors 47169689Skan can pipeline the divisions; on older, in-order processors it should 48169689Skan still be effective to optimize two divisions by the same number. 49169689Skan We make this a param, and it shall be called N in the remainder of 50169689Skan this comment. 51169689Skan 52169689Skan Second, if trapping math is active, we have less freedom on where 53169689Skan to insert divisions: we can only do so in basic blocks that already 54169689Skan contain one. (If divisions don't trap, instead, we can insert 55169689Skan divisions elsewhere, which will be in blocks that are common dominators 56169689Skan of those that have the division). 57169689Skan 58169689Skan We really don't want to compute the reciprocal unless a division will 59169689Skan be found. To do this, we won't insert the division in a basic block 60169689Skan that has less than N divisions *post-dominating* it. 61169689Skan 62169689Skan The algorithm constructs a subset of the dominator tree, holding the 63169689Skan blocks containing the divisions and the common dominators to them, 64169689Skan and walk it twice. The first walk is in post-order, and it annotates 65169689Skan each block with the number of divisions that post-dominate it: this 66169689Skan gives information on where divisions can be inserted profitably. 67169689Skan The second walk is in pre-order, and it inserts divisions as explained 68169689Skan above, and replaces divisions by multiplications. 69169689Skan 70169689Skan In the best case, the cost of the pass is O(n_statements). In the 71169689Skan worst-case, the cost is due to creating the dominator tree subset, 72169689Skan with a cost of O(n_basic_blocks ^ 2); however this can only happen 73169689Skan for n_statements / n_basic_blocks statements. So, the amortized cost 74169689Skan of creating the dominator tree subset is O(n_basic_blocks) and the 75169689Skan worst-case cost of the pass is O(n_statements * n_basic_blocks). 76169689Skan 77169689Skan More practically, the cost will be small because there are few 78169689Skan divisions, and they tend to be in the same basic block, so insert_bb 79169689Skan is called very few times. 80169689Skan 81169689Skan If we did this using domwalk.c, an efficient implementation would have 82169689Skan to work on all the variables in a single pass, because we could not 83169689Skan work on just a subset of the dominator tree, as we do now, and the 84169689Skan cost would also be something like O(n_statements * n_basic_blocks). 85169689Skan The data structures would be more complex in order to work on all the 86169689Skan variables in a single pass. */ 87169689Skan 88169689Skan#include "config.h" 89169689Skan#include "system.h" 90169689Skan#include "coretypes.h" 91169689Skan#include "tm.h" 92169689Skan#include "flags.h" 93169689Skan#include "tree.h" 94169689Skan#include "tree-flow.h" 95169689Skan#include "real.h" 96169689Skan#include "timevar.h" 97169689Skan#include "tree-pass.h" 98169689Skan#include "alloc-pool.h" 99169689Skan#include "basic-block.h" 100169689Skan#include "target.h" 101169689Skan 102169689Skan 103169689Skan/* This structure represents one basic block that either computes a 104169689Skan division, or is a common dominator for basic block that compute a 105169689Skan division. */ 106169689Skanstruct occurrence { 107169689Skan /* The basic block represented by this structure. */ 108169689Skan basic_block bb; 109169689Skan 110169689Skan /* If non-NULL, the SSA_NAME holding the definition for a reciprocal 111169689Skan inserted in BB. */ 112169689Skan tree recip_def; 113169689Skan 114169689Skan /* If non-NULL, the MODIFY_EXPR for a reciprocal computation that 115169689Skan was inserted in BB. */ 116169689Skan tree recip_def_stmt; 117169689Skan 118169689Skan /* Pointer to a list of "struct occurrence"s for blocks dominated 119169689Skan by BB. */ 120169689Skan struct occurrence *children; 121169689Skan 122169689Skan /* Pointer to the next "struct occurrence"s in the list of blocks 123169689Skan sharing a common dominator. */ 124169689Skan struct occurrence *next; 125169689Skan 126169689Skan /* The number of divisions that are in BB before compute_merit. The 127169689Skan number of divisions that are in BB or post-dominate it after 128169689Skan compute_merit. */ 129169689Skan int num_divisions; 130169689Skan 131169689Skan /* True if the basic block has a division, false if it is a common 132169689Skan dominator for basic blocks that do. If it is false and trapping 133169689Skan math is active, BB is not a candidate for inserting a reciprocal. */ 134169689Skan bool bb_has_division; 135169689Skan}; 136169689Skan 137169689Skan 138169689Skan/* The instance of "struct occurrence" representing the highest 139169689Skan interesting block in the dominator tree. */ 140169689Skanstatic struct occurrence *occ_head; 141169689Skan 142169689Skan/* Allocation pool for getting instances of "struct occurrence". */ 143169689Skanstatic alloc_pool occ_pool; 144169689Skan 145169689Skan 146169689Skan 147169689Skan/* Allocate and return a new struct occurrence for basic block BB, and 148169689Skan whose children list is headed by CHILDREN. */ 149169689Skanstatic struct occurrence * 150169689Skanocc_new (basic_block bb, struct occurrence *children) 151169689Skan{ 152169689Skan struct occurrence *occ; 153169689Skan 154169689Skan occ = bb->aux = pool_alloc (occ_pool); 155169689Skan memset (occ, 0, sizeof (struct occurrence)); 156169689Skan 157169689Skan occ->bb = bb; 158169689Skan occ->children = children; 159169689Skan return occ; 160169689Skan} 161169689Skan 162169689Skan 163169689Skan/* Insert NEW_OCC into our subset of the dominator tree. P_HEAD points to a 164169689Skan list of "struct occurrence"s, one per basic block, having IDOM as 165169689Skan their common dominator. 166169689Skan 167169689Skan We try to insert NEW_OCC as deep as possible in the tree, and we also 168169689Skan insert any other block that is a common dominator for BB and one 169169689Skan block already in the tree. */ 170169689Skan 171169689Skanstatic void 172169689Skaninsert_bb (struct occurrence *new_occ, basic_block idom, 173169689Skan struct occurrence **p_head) 174169689Skan{ 175169689Skan struct occurrence *occ, **p_occ; 176169689Skan 177169689Skan for (p_occ = p_head; (occ = *p_occ) != NULL; ) 178169689Skan { 179169689Skan basic_block bb = new_occ->bb, occ_bb = occ->bb; 180169689Skan basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb); 181169689Skan if (dom == bb) 182169689Skan { 183169689Skan /* BB dominates OCC_BB. OCC becomes NEW_OCC's child: remove OCC 184169689Skan from its list. */ 185169689Skan *p_occ = occ->next; 186169689Skan occ->next = new_occ->children; 187169689Skan new_occ->children = occ; 188169689Skan 189169689Skan /* Try the next block (it may as well be dominated by BB). */ 190169689Skan } 191169689Skan 192169689Skan else if (dom == occ_bb) 193169689Skan { 194169689Skan /* OCC_BB dominates BB. Tail recurse to look deeper. */ 195169689Skan insert_bb (new_occ, dom, &occ->children); 196169689Skan return; 197169689Skan } 198169689Skan 199169689Skan else if (dom != idom) 200169689Skan { 201169689Skan gcc_assert (!dom->aux); 202169689Skan 203169689Skan /* There is a dominator between IDOM and BB, add it and make 204169689Skan two children out of NEW_OCC and OCC. First, remove OCC from 205169689Skan its list. */ 206169689Skan *p_occ = occ->next; 207169689Skan new_occ->next = occ; 208169689Skan occ->next = NULL; 209169689Skan 210169689Skan /* None of the previous blocks has DOM as a dominator: if we tail 211169689Skan recursed, we would reexamine them uselessly. Just switch BB with 212169689Skan DOM, and go on looking for blocks dominated by DOM. */ 213169689Skan new_occ = occ_new (dom, new_occ); 214169689Skan } 215169689Skan 216169689Skan else 217169689Skan { 218169689Skan /* Nothing special, go on with the next element. */ 219169689Skan p_occ = &occ->next; 220169689Skan } 221169689Skan } 222169689Skan 223169689Skan /* No place was found as a child of IDOM. Make BB a sibling of IDOM. */ 224169689Skan new_occ->next = *p_head; 225169689Skan *p_head = new_occ; 226169689Skan} 227169689Skan 228169689Skan/* Register that we found a division in BB. */ 229169689Skan 230169689Skanstatic inline void 231169689Skanregister_division_in (basic_block bb) 232169689Skan{ 233169689Skan struct occurrence *occ; 234169689Skan 235169689Skan occ = (struct occurrence *) bb->aux; 236169689Skan if (!occ) 237169689Skan { 238169689Skan occ = occ_new (bb, NULL); 239169689Skan insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head); 240169689Skan } 241169689Skan 242169689Skan occ->bb_has_division = true; 243169689Skan occ->num_divisions++; 244169689Skan} 245169689Skan 246169689Skan 247169689Skan/* Compute the number of divisions that postdominate each block in OCC and 248169689Skan its children. */ 249169689Skan 250169689Skanstatic void 251169689Skancompute_merit (struct occurrence *occ) 252169689Skan{ 253169689Skan struct occurrence *occ_child; 254169689Skan basic_block dom = occ->bb; 255169689Skan 256169689Skan for (occ_child = occ->children; occ_child; occ_child = occ_child->next) 257169689Skan { 258169689Skan basic_block bb; 259169689Skan if (occ_child->children) 260169689Skan compute_merit (occ_child); 261169689Skan 262169689Skan if (flag_exceptions) 263169689Skan bb = single_noncomplex_succ (dom); 264169689Skan else 265169689Skan bb = dom; 266169689Skan 267169689Skan if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb)) 268169689Skan occ->num_divisions += occ_child->num_divisions; 269169689Skan } 270169689Skan} 271169689Skan 272169689Skan 273169689Skan/* Return whether USE_STMT is a floating-point division by DEF. */ 274169689Skanstatic inline bool 275169689Skanis_division_by (tree use_stmt, tree def) 276169689Skan{ 277169689Skan return TREE_CODE (use_stmt) == MODIFY_EXPR 278169689Skan && TREE_CODE (TREE_OPERAND (use_stmt, 1)) == RDIV_EXPR 279169689Skan && TREE_OPERAND (TREE_OPERAND (use_stmt, 1), 1) == def; 280169689Skan} 281169689Skan 282169689Skan/* Walk the subset of the dominator tree rooted at OCC, setting the 283169689Skan RECIP_DEF field to a definition of 1.0 / DEF that can be used in 284169689Skan the given basic block. The field may be left NULL, of course, 285169689Skan if it is not possible or profitable to do the optimization. 286169689Skan 287169689Skan DEF_BSI is an iterator pointing at the statement defining DEF. 288169689Skan If RECIP_DEF is set, a dominator already has a computation that can 289169689Skan be used. */ 290169689Skan 291169689Skanstatic void 292169689Skaninsert_reciprocals (block_stmt_iterator *def_bsi, struct occurrence *occ, 293169689Skan tree def, tree recip_def, int threshold) 294169689Skan{ 295169689Skan tree type, new_stmt; 296169689Skan block_stmt_iterator bsi; 297169689Skan struct occurrence *occ_child; 298169689Skan 299169689Skan if (!recip_def 300169689Skan && (occ->bb_has_division || !flag_trapping_math) 301169689Skan && occ->num_divisions >= threshold) 302169689Skan { 303169689Skan /* Make a variable with the replacement and substitute it. */ 304169689Skan type = TREE_TYPE (def); 305169689Skan recip_def = make_rename_temp (type, "reciptmp"); 306169689Skan new_stmt = build2 (MODIFY_EXPR, void_type_node, recip_def, 307169689Skan fold_build2 (RDIV_EXPR, type, build_one_cst (type), 308169689Skan def)); 309169689Skan 310169689Skan 311169689Skan if (occ->bb_has_division) 312169689Skan { 313169689Skan /* Case 1: insert before an existing division. */ 314169689Skan bsi = bsi_after_labels (occ->bb); 315169689Skan while (!bsi_end_p (bsi) && !is_division_by (bsi_stmt (bsi), def)) 316169689Skan bsi_next (&bsi); 317169689Skan 318169689Skan bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT); 319169689Skan } 320169689Skan else if (def_bsi && occ->bb == def_bsi->bb) 321169689Skan { 322169689Skan /* Case 2: insert right after the definition. Note that this will 323169689Skan never happen if the definition statement can throw, because in 324169689Skan that case the sole successor of the statement's basic block will 325169689Skan dominate all the uses as well. */ 326169689Skan bsi_insert_after (def_bsi, new_stmt, BSI_NEW_STMT); 327169689Skan } 328169689Skan else 329169689Skan { 330169689Skan /* Case 3: insert in a basic block not containing defs/uses. */ 331169689Skan bsi = bsi_after_labels (occ->bb); 332169689Skan bsi_insert_before (&bsi, new_stmt, BSI_SAME_STMT); 333169689Skan } 334169689Skan 335169689Skan occ->recip_def_stmt = new_stmt; 336169689Skan } 337169689Skan 338169689Skan occ->recip_def = recip_def; 339169689Skan for (occ_child = occ->children; occ_child; occ_child = occ_child->next) 340169689Skan insert_reciprocals (def_bsi, occ_child, def, recip_def, threshold); 341169689Skan} 342169689Skan 343169689Skan 344169689Skan/* Replace the division at USE_P with a multiplication by the reciprocal, if 345169689Skan possible. */ 346169689Skan 347169689Skanstatic inline void 348169689Skanreplace_reciprocal (use_operand_p use_p) 349169689Skan{ 350169689Skan tree use_stmt = USE_STMT (use_p); 351169689Skan basic_block bb = bb_for_stmt (use_stmt); 352169689Skan struct occurrence *occ = (struct occurrence *) bb->aux; 353169689Skan 354169689Skan if (occ->recip_def && use_stmt != occ->recip_def_stmt) 355169689Skan { 356169689Skan TREE_SET_CODE (TREE_OPERAND (use_stmt, 1), MULT_EXPR); 357169689Skan SET_USE (use_p, occ->recip_def); 358169689Skan fold_stmt_inplace (use_stmt); 359169689Skan update_stmt (use_stmt); 360169689Skan } 361169689Skan} 362169689Skan 363169689Skan 364169689Skan/* Free OCC and return one more "struct occurrence" to be freed. */ 365169689Skan 366169689Skanstatic struct occurrence * 367169689Skanfree_bb (struct occurrence *occ) 368169689Skan{ 369169689Skan struct occurrence *child, *next; 370169689Skan 371169689Skan /* First get the two pointers hanging off OCC. */ 372169689Skan next = occ->next; 373169689Skan child = occ->children; 374169689Skan occ->bb->aux = NULL; 375169689Skan pool_free (occ_pool, occ); 376169689Skan 377169689Skan /* Now ensure that we don't recurse unless it is necessary. */ 378169689Skan if (!child) 379169689Skan return next; 380169689Skan else 381169689Skan { 382169689Skan while (next) 383169689Skan next = free_bb (next); 384169689Skan 385169689Skan return child; 386169689Skan } 387169689Skan} 388169689Skan 389169689Skan 390169689Skan/* Look for floating-point divisions among DEF's uses, and try to 391169689Skan replace them by multiplications with the reciprocal. Add 392169689Skan as many statements computing the reciprocal as needed. 393169689Skan 394169689Skan DEF must be a GIMPLE register of a floating-point type. */ 395169689Skan 396169689Skanstatic void 397169689Skanexecute_cse_reciprocals_1 (block_stmt_iterator *def_bsi, tree def) 398169689Skan{ 399169689Skan use_operand_p use_p; 400169689Skan imm_use_iterator use_iter; 401169689Skan struct occurrence *occ; 402169689Skan int count = 0, threshold; 403169689Skan 404169689Skan gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def)); 405169689Skan 406169689Skan FOR_EACH_IMM_USE_FAST (use_p, use_iter, def) 407169689Skan { 408169689Skan tree use_stmt = USE_STMT (use_p); 409169689Skan if (is_division_by (use_stmt, def)) 410169689Skan { 411169689Skan register_division_in (bb_for_stmt (use_stmt)); 412169689Skan count++; 413169689Skan } 414169689Skan } 415169689Skan 416169689Skan /* Do the expensive part only if we can hope to optimize something. */ 417169689Skan threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def))); 418169689Skan if (count >= threshold) 419169689Skan { 420169689Skan tree use_stmt; 421169689Skan for (occ = occ_head; occ; occ = occ->next) 422169689Skan { 423169689Skan compute_merit (occ); 424169689Skan insert_reciprocals (def_bsi, occ, def, NULL, threshold); 425169689Skan } 426169689Skan 427169689Skan FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def) 428169689Skan { 429169689Skan if (is_division_by (use_stmt, def)) 430169689Skan { 431169689Skan FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter) 432169689Skan replace_reciprocal (use_p); 433169689Skan } 434169689Skan } 435169689Skan } 436169689Skan 437169689Skan for (occ = occ_head; occ; ) 438169689Skan occ = free_bb (occ); 439169689Skan 440169689Skan occ_head = NULL; 441169689Skan} 442169689Skan 443169689Skan 444169689Skanstatic bool 445169689Skangate_cse_reciprocals (void) 446169689Skan{ 447169689Skan return optimize && !optimize_size && flag_unsafe_math_optimizations; 448169689Skan} 449169689Skan 450169689Skan 451169689Skan/* Go through all the floating-point SSA_NAMEs, and call 452169689Skan execute_cse_reciprocals_1 on each of them. */ 453169689Skanstatic unsigned int 454169689Skanexecute_cse_reciprocals (void) 455169689Skan{ 456169689Skan basic_block bb; 457169689Skan tree arg; 458169689Skan 459169689Skan occ_pool = create_alloc_pool ("dominators for recip", 460169689Skan sizeof (struct occurrence), 461169689Skan n_basic_blocks / 3 + 1); 462169689Skan 463169689Skan calculate_dominance_info (CDI_DOMINATORS); 464169689Skan calculate_dominance_info (CDI_POST_DOMINATORS); 465169689Skan 466169689Skan#ifdef ENABLE_CHECKING 467169689Skan FOR_EACH_BB (bb) 468169689Skan gcc_assert (!bb->aux); 469169689Skan#endif 470169689Skan 471169689Skan for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = TREE_CHAIN (arg)) 472169689Skan if (default_def (arg) 473169689Skan && FLOAT_TYPE_P (TREE_TYPE (arg)) 474169689Skan && is_gimple_reg (arg)) 475169689Skan execute_cse_reciprocals_1 (NULL, default_def (arg)); 476169689Skan 477169689Skan FOR_EACH_BB (bb) 478169689Skan { 479169689Skan block_stmt_iterator bsi; 480169689Skan tree phi, def; 481169689Skan 482169689Skan for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) 483169689Skan { 484169689Skan def = PHI_RESULT (phi); 485169689Skan if (FLOAT_TYPE_P (TREE_TYPE (def)) 486169689Skan && is_gimple_reg (def)) 487169689Skan execute_cse_reciprocals_1 (NULL, def); 488169689Skan } 489169689Skan 490169689Skan for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi)) 491169689Skan { 492169689Skan tree stmt = bsi_stmt (bsi); 493169689Skan if (TREE_CODE (stmt) == MODIFY_EXPR 494169689Skan && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL 495169689Skan && FLOAT_TYPE_P (TREE_TYPE (def)) 496169689Skan && TREE_CODE (def) == SSA_NAME) 497169689Skan execute_cse_reciprocals_1 (&bsi, def); 498169689Skan } 499169689Skan } 500169689Skan 501169689Skan free_dominance_info (CDI_DOMINATORS); 502169689Skan free_dominance_info (CDI_POST_DOMINATORS); 503169689Skan free_alloc_pool (occ_pool); 504169689Skan return 0; 505169689Skan} 506169689Skan 507169689Skanstruct tree_opt_pass pass_cse_reciprocals = 508169689Skan{ 509169689Skan "recip", /* name */ 510169689Skan gate_cse_reciprocals, /* gate */ 511169689Skan execute_cse_reciprocals, /* execute */ 512169689Skan NULL, /* sub */ 513169689Skan NULL, /* next */ 514169689Skan 0, /* static_pass_number */ 515169689Skan 0, /* tv_id */ 516169689Skan PROP_ssa, /* properties_required */ 517169689Skan 0, /* properties_provided */ 518169689Skan 0, /* properties_destroyed */ 519169689Skan 0, /* todo_flags_start */ 520169689Skan TODO_dump_func | TODO_update_ssa | TODO_verify_ssa 521169689Skan | TODO_verify_stmts, /* todo_flags_finish */ 522169689Skan 0 /* letter */ 523169689Skan}; 524