/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2017 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop-invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments showed that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this a param, and it shall be called N in the remainder of this
   comment.

   Second, if trapping math is active, we have less freedom on where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, instead, we can insert
   divisions elsewhere, which will be in blocks that are common
   dominators of those that have the division).

   We really don't want to compute the reciprocal unless a division
   will be found.  To do this, we won't insert the division in a basic
   block that has less than N divisions *post-dominating* it.

   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and the common dominators to them,
   and walks it twice.  The first walk is in post-order, and it
   annotates each block with the number of divisions that post-dominate
   it: this gives information on where divisions can be inserted
   profitably.  The second walk is in pre-order, and it inserts
   divisions as explained above, and replaces divisions by
   multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst-case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.
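
   For instance, in the example above all three divisions by modulus
   share one basic block, so that block is annotated with a division
   count of 3 and, provided the target's N is at most 3, a single
   reciprocal is inserted right before the first division.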
   If we did this using domwalk.c, an efficient implementation would
   have to work on all the variables in a single pass, because we could
   not work on just a subset of the dominator tree, as we do now, and
   the cost would also be something like
   O(n_statements * n_basic_blocks).  The data structures would be more
   complex in order to work on all the variables in a single pass.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "params.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"

/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence" in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a
     reciprocal.  */
  bool bb_has_division;
};

static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of hand-written 16-bit nop / bswaps found.  */
  int found_16bit;

  /* Number of hand-written 32-bit nop / bswaps found.  */
  int found_32bit;

  /* Number of hand-written 64-bit nop / bswaps found.  */
  int found_64bit;
} nop_stats, bswap_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;


/* Allocate and return a new struct occurrence for basic block BB, and
   whose children list is headed by CHILDREN.  */
static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = occ_pool->allocate ();
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}


/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points
   to a list of "struct occurrence"s, one per basic block, having IDOM
   as their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we
   also insert any other block that is a common dominator for BB and
   one block already in the tree.  */

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it may as well be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = occ_new (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}

/* Register that we found a division in BB.  */

static inline void
register_division_in (basic_block bb)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions++;
}


/* Compute the number of divisions that postdominate each block in OCC
   and its children.
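
   Only divisions that post-dominate a block (or its single non-complex
   successor, when -fexceptions is active) are added to its count: such
   a division executes on every path leaving the block, so a reciprocal
   inserted there can never become wasted work.  */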
static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}


/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as a valid division, as we would get
	    confused later by replacing all immediate uses of x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (use_stmt);
}

/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_GSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, int threshold)
{
  tree type;
  gassign *new_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      && occ->num_divisions >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == def_gsi->bb)
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
}


/* Replace the division at USE_P with a multiplication by the
   reciprocal, if possible.
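
   For example, if OCC->RECIP_DEF holds reciptmp_5 = 1.0 / b_2 (SSA
   names here are merely illustrative), then

     x_3 = a_1 / b_2;

   is rewritten into

     x_3 = a_1 * reciptmp_5;  */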
static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  occ->bb->aux = NULL;
  occ_pool->remove (occ);

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}


/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */

static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p;
  imm_use_iterator use_iter;
  struct occurrence *occ;
  int count = 0, threshold;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt));
	  count++;
	}
    }

  /* Do the expensive part only if we can hope to optimize something.  */
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
  if (count >= threshold)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	}
    }

  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}

/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}

/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals

unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    execute_cse_reciprocals_1 (&gsi, def);
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_MD)
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.
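
		     As an illustration (SSA names hypothetical), for

		       t_1 = sqrt (b_5);
		       x_2 = a_4 / t_1;
		       y_3 = c_6 / t_1;

		     the sqrt call becomes t_1 = .RSQRT (b_5) and both
		     divisions become multiplications by t_1; a use of
		     t_1 that is not a division would wrongly see the
		     reciprocal value instead.  */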
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      if (gimple_vdef (call))
			{
			  gimple_set_vdef (stmt2, gimple_vdef (call));
			  SSA_NAME_DEF_STMT (gimple_vdef (stmt2)) = stmt2;
			}
		      gimple_set_vuse (stmt2, gimple_vuse (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}

/* Records an occurrence at statement USE_STMT in the vector of
   statements STMTS if it is dominated by *TOP_BB or dominates it or
   this basic block is not yet initialized.  Returns true if the
   occurrence was pushed on the vector.  Adjusts *TOP_BB to be the
   basic block dominating all statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}

/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi, CSEing the result in this case.
   We first walk over all immediate uses of the argument, collecting
   in a vector the statements that we can CSE; in a second pass we
   replace each statement's rhs with a REALPART or IMAGPART expression
   on the result of the cexpi call we insert before the use statement
   that dominates all other candidates.
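
   For example (SSA names merely illustrative):

     s_1 = sin (x_0);
     c_2 = cos (x_0);

   becomes

     sincostmp_3 = cexpi (x_0);
     s_1 = IMAGPART_EXPR <sincostmp_3>;
     c_2 = REALPART_EXPR <sincostmp_3>;  */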
static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  type = TREE_TYPE (name);
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:;
	}
    }

  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}

/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
   so this default never requires calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.
   All exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits, of the window used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */

static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };


/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}

/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.
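
   For example, using powi_table above: powi_table[5] == 3, so x**5 is
   built as x**2 * x**3, x**3 as x * x**2 and x**2 as x * x.  The
   cached x**2 is computed only once, so powi_cost (5) == 3.  */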
static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = (n < 0) ? -n : n;

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}

/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}

/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.
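
   For instance, powi (x, -2) expands (when the cost is acceptable)
   to the sequence (SSA names merely illustrative)

     powmult_1 = x * x;
     powmult_2 = 1.0 / powmult_1;  */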
static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  /* Avoid largest negative number.  */
  if (n != -n
      && ((n >= -1 && n <= 2)
	  || (optimize_function_for_speed_p (cfun)
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}

/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name based on NAME.
   Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};

/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.
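
   For example, 0.625 == 0.5**1 + 0.5**3, so for N >= 3 the factors
   are a[1] == 1, a[2] == 0, a[3] == 1, DEEPEST is 3 and NUM_MULTS is
   1 (the one multiplication joining the two square-root chains:
   sqrt (x) * sqrt (sqrt (sqrt (x)))).  */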
bool
representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
				struct pow_synth_sqrt_info *info)
{
  REAL_VALUE_TYPE factor = dconsthalf;
  REAL_VALUE_TYPE remainder = c;

  info->deepest = 0;
  info->num_mults = 0;
  memset (info->factors, 0, n * sizeof (bool));

  for (unsigned i = 0; i < n; i++)
    {
      REAL_VALUE_TYPE res;

      /* If something inexact happened bail out now.  */
      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
	return false;

      /* We have hit zero.  The number is representable as a sum
	 of powers of 0.5.  */
      if (real_equal (&res, &dconst0))
	{
	  info->factors[i] = true;
	  info->deepest = i + 1;
	  return true;
	}
      else if (!REAL_VALUE_NEGATIVE (res))
	{
	  remainder = res;
	  info->factors[i] = true;
	  info->num_mults++;
	}
      else
	info->factors[i] = false;

      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
    }
  return false;
}

/* Return the tree corresponding to FN being applied
   to ARG N times at GSI and LOC.
   Look up previous results from CACHE if need be.
   cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */

static tree
get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
	      tree fn, location_t loc, tree *cache)
{
  tree res = cache[n];
  if (!res)
    {
      tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
      res = build_and_insert_call (gsi, loc, fn, prev);
      cache[n] = res;
    }

  return res;
}

/* Print to STREAM the repeated application of function FNAME to ARG
   N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
   "foo (foo (x))".  */

static void
print_nested_fn (FILE* stream, const char *fname, const char* arg,
		 unsigned int n)
{
  if (n == 0)
    fprintf (stream, "%s", arg);
  else
    {
      fprintf (stream, "%s (", fname);
      print_nested_fn (stream, fname, arg, n - 1);
      fprintf (stream, ")");
    }
}

/* Print to STREAM the fractional sequence of sqrt chains
   applied to ARG, described by INFO.  Used for the dump file.  */

static void
dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
			       struct pow_synth_sqrt_info *info)
{
  for (unsigned int i = 0; i < info->deepest; i++)
    {
      bool is_set = info->factors[i];
      if (is_set)
	{
	  print_nested_fn (stream, "sqrt", arg, i + 1);
	  if (i != info->deepest - 1)
	    fprintf (stream, " * ");
	}
    }
}

/* Print to STREAM a representation of raising ARG to an integer
   power N.  Used for the dump file.  */

static void
dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
{
  if (n > 1)
    fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
  else if (n == 1)
    fprintf (stream, "%s", arg);
}

/* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
   square roots.  Place at GSI and LOC.  Limit the maximum depth
   of the sqrt chains to MAX_DEPTH.  Return the tree holding the
   result of the expanded sequence or NULL_TREE if the expansion failed.

   This routine assumes that ARG1 is a real number with a fractional part
   (the integer exponent case will have been handled earlier in
   gimple_expand_builtin_pow).

   For ARG1 > 0.0:
   * For ARG1 composed of a whole part WHOLE_PART and a fractional part
     FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
     FRAC_PART == ARG1 - WHOLE_PART:
     Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
     POW (ARG0, FRAC_PART) is expanded as a product of square root chains
     if it can be expressed as such, that is if FRAC_PART satisfies:
     FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
     where integer a[i] is either 0 or 1.

     Example:
     POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
       --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))

   For ARG1 < 0.0 there are two approaches:
   * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
	 is calculated as above.

     Example:
     POW (x, -5.625) == 1.0 / POW (x, 5.625)
       --> 1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))

   * (B) : WHOLE_PART := - ceil (abs (ARG1))
	   FRAC_PART := ARG1 - WHOLE_PART
	   and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
     Example:
     POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
       --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))

   For ARG1 < 0.0 we choose between (A) and (B) depending on
   how many multiplications we'd have to do.
   So, for the example in (B): POW (x, -5.875), if we were to
   follow algorithm (A) we would produce:
   1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
   which contains more multiplications than approach (B).

   Hopefully, this approach will eliminate potentially expensive POW library
   calls when unsafe floating point math is enabled and allow the compiler to
   further optimise the multiplies, square roots and divides produced by this
   function.  */

static tree
expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
{
  tree type = TREE_TYPE (arg0);
  machine_mode mode = TYPE_MODE (type);
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  bool one_over = true;

  if (!sqrtfn)
    return NULL_TREE;

  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);

  gcc_assert (max_depth > 0);
  tree *cache = XALLOCAVEC (tree, max_depth + 1);

  struct pow_synth_sqrt_info synth_info;
  synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
  synth_info.deepest = 0;
  synth_info.num_mults = 0;

  bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
  REAL_VALUE_TYPE exp = real_value_abs (&exp_init);

  /* The whole and fractional parts of exp.  */
  REAL_VALUE_TYPE whole_part;
  REAL_VALUE_TYPE frac_part;

  real_floor (&whole_part, mode, &exp);
  real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);


  REAL_VALUE_TYPE ceil_whole = dconst0;
  REAL_VALUE_TYPE ceil_fract = dconst0;

  if (neg_exp)
    {
      real_ceil (&ceil_whole, mode, &exp);
      real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
    }

  if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
    return NULL_TREE;

  /* Check whether it's more profitable to not use 1.0 / ...  */
  if (neg_exp)
    {
      struct pow_synth_sqrt_info alt_synth_info;
      alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
      alt_synth_info.deepest = 0;
      alt_synth_info.num_mults = 0;

      if (representable_as_half_series_p (ceil_fract, max_depth,
					  &alt_synth_info)
	  && alt_synth_info.deepest <= synth_info.deepest
	  && alt_synth_info.num_mults < synth_info.num_mults)
	{
	  whole_part = ceil_whole;
	  frac_part = ceil_fract;
	  synth_info.deepest = alt_synth_info.deepest;
	  synth_info.num_mults = alt_synth_info.num_mults;
	  memcpy (synth_info.factors, alt_synth_info.factors,
		  (max_depth + 1) * sizeof (bool));
	  one_over = false;
	}
    }

  HOST_WIDE_INT n = real_to_integer (&whole_part);
  REAL_VALUE_TYPE cint;
  real_from_integer (&cint, VOIDmode, n, SIGNED);

  if (!real_identical (&whole_part, &cint))
    return NULL_TREE;

  if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
    return NULL_TREE;

  memset (cache, 0, (max_depth + 1) * sizeof (tree));

  tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;

  /* Calculate the integer part of the exponent.  */
  if (n > 1)
    {
      integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
      if (!integer_res)
	return NULL_TREE;
    }

  if (dump_file)
    {
      char string[64];

      real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
      fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);

      if (neg_exp)
	{
	  if (one_over)
	    {
	      fprintf (dump_file, "1.0 / (");
	      dump_integer_part (dump_file, "x", n);
	      if (n > 0)
		fprintf (dump_file, " * ");
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, ")");
	    }
	  else
	    {
	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	      fprintf (dump_file, " / (");
	      dump_integer_part (dump_file, "x", n);
	      fprintf (dump_file, ")");
	    }
	}
      else
	{
	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
	  if (n > 0)
	    fprintf (dump_file, " * ");
	  dump_integer_part (dump_file, "x", n);
	}

      fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
    }


  tree fract_res = NULL_TREE;
  cache[0] = arg0;

  /* Calculate the fractional part of the exponent.  */
  for (unsigned i = 0; i < synth_info.deepest; i++)
    {
      if (synth_info.factors[i])
	{
	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);

	  if (!fract_res)
	    fract_res = sqrt_chain;
	  else
	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
						fract_res, sqrt_chain);
	}
    }

  tree res = NULL_TREE;

  if (neg_exp)
    {
      if (one_over)
	{
	  if (n > 0)
	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					  fract_res, integer_res);
	  else
	    res = fract_res;

	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
					build_real (type, dconst1), res);
	}
      else
	{
	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
					fract_res, integer_res);
	}
    }
  else
    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
				  fract_res, integer_res);
  return res;
}

/* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
   with location info LOC.
   If possible, create an equivalent and less expensive sequence of
   statements prior to GSI, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
			   tree arg0, tree arg1)
{
  REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
  REAL_VALUE_TYPE c2, dconst3;
  HOST_WIDE_INT n;
  tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
  machine_mode mode;
  bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
  bool hw_sqrt_exists, c_is_int, c2_is_int;

  dconst1_4 = dconst1;
  SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);

  /* If the exponent isn't a constant, there's nothing of interest
     to be done.  */
  if (TREE_CODE (arg1) != REAL_CST)
    return NULL_TREE;

  /* Don't perform the operation if flag_signaling_nans is on
     and the operand is a signaling NaN.  */
  if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
      && ((TREE_CODE (arg0) == REAL_CST
	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
    return NULL_TREE;

  /* If the exponent is equivalent to an integer, expand to an optimal
     multiplication sequence when profitable.  */
  c = TREE_REAL_CST (arg1);
  n = real_to_integer (&c);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c_is_int = real_identical (&c, &cint);

  if (c_is_int
      && ((n >= -1 && n <= 2)
	  || (flag_unsafe_math_optimizations
	      && speed_p
	      && powi_cost (n) <= POWI_MAX_MULTS)))
    return gimple_expand_builtin_powi (gsi, loc, arg0, n);

  /* Attempt various optimizations using sqrt and cbrt.  */
  type = TREE_TYPE (arg0);
  mode = TYPE_MODE (type);
  sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);

  /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
     unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
     sqrt(-0) = -0.  */
  if (sqrtfn
      && real_equal (&c, &dconsthalf)
      && !HONOR_SIGNED_ZEROS (mode))
    return build_and_insert_call (gsi, loc, sqrtfn, arg0);

  hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;

  /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
     optimizations since 1./3. is not exactly representable.  If x
     is negative and finite, the correct value of pow(x,1./3.) is
     a NaN with the "invalid" exception raised, because the value
     of 1./3. actually has an even denominator.  The correct value
     of cbrt(x) is a negative real value.  */
  cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
  dconst1_3 = real_value_truncate (mode, dconst_third ());

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_equal (&c, &dconst1_3))
    return build_and_insert_call (gsi, loc, cbrtfn, arg0);

  /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
     if we don't have a hardware sqrt insn.  */
  dconst1_6 = dconst1_3;
  SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);

  if (flag_unsafe_math_optimizations
      && sqrtfn
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && speed_p
      && hw_sqrt_exists
      && real_equal (&c, &dconst1_6))
    {
      /* sqrt(x)  */
      sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);

      /* cbrt(sqrt(x))  */
      return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
    }


  /* Attempt to expand the POW as a product of square root chains.
     Expand the 0.25 case even when optimising for size.  */
  if (flag_unsafe_math_optimizations
      && sqrtfn
      && hw_sqrt_exists
      && (speed_p || real_equal (&c, &dconst1_4))
      && !HONOR_SIGNED_ZEROS (mode))
    {
      unsigned int max_depth = speed_p
			       ? PARAM_VALUE (PARAM_MAX_POW_SQRT_DEPTH)
			       : 2;

      tree expand_with_sqrts
	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);

      if (expand_with_sqrts)
	return expand_with_sqrts;
    }

  real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  c2_is_int = real_identical (&c2, &cint);

  /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into

     powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
     1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.

     Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
     different from pow(x, 1./3.) due to rounding and behavior with
     negative x, we need to constrain this transformation to unsafe
     math and positive x or finite math.  */
  real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
  real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
  real_round (&c2, mode, &c2);
  n = real_to_integer (&c2);
  real_from_integer (&cint, VOIDmode, n, SIGNED);
  real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
  real_convert (&c2, mode, &c2);

  if (flag_unsafe_math_optimizations
      && cbrtfn
      && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
      && real_identical (&c2, &c)
      && !c2_is_int
      && optimize_function_for_speed_p (cfun)
      && powi_cost (n / 3) <= POWI_MAX_MULTS)
    {
      tree powi_x_ndiv3 = NULL_TREE;

      /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
	 possible or profitable, give up.  Skip the degenerate case when
	 abs(n) < 3, where the result is always 1.  */
      if (absu_hwi (n) >= 3)
	{
	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
						     abs_hwi (n / 3));
	  if (!powi_x_ndiv3)
	    return NULL_TREE;
	}

      /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
	 as that creates an unnecessary variable.  Instead, just produce
	 either cbrt(x) or cbrt(x) * cbrt(x).  */
      cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);

      if (absu_hwi (n) % 3 == 1)
	powi_cbrt_x = cbrt_x;
      else
	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					      cbrt_x, cbrt_x);

      /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
      if (absu_hwi (n) < 3)
	result = powi_cbrt_x;
      else
	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
					 powi_x_ndiv3, powi_cbrt_x);

      /* If n is negative, reciprocate the result.  */
      if (n < 0)
	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
					 build_real (type, dconst1), result);

      return result;
    }

  /* No optimizations succeeded.  */
  return NULL_TREE;
}

/* ARG is the argument to a cabs builtin call in GSI with location info
   LOC.  Create a sequence of statements prior to GSI that calculates
   sqrt(R*R + I*I), where R and I are the real and imaginary components
   of ARG, respectively.  Return an expression holding the result.  */

static tree
gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
{
  tree real_part, imag_part, addend1, addend2, sum, result;
  tree type = TREE_TYPE (TREE_TYPE (arg));
  tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
  machine_mode mode = TYPE_MODE (type);

  if (!flag_unsafe_math_optimizations
      || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
      || !sqrtfn
      || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
    return NULL_TREE;

  real_part = build_and_insert_ref (gsi, loc, type, "cabs",
				    REALPART_EXPR, arg);
  addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
				    real_part, real_part);
  imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
				    IMAGPART_EXPR, arg);
  addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
				    imag_part, imag_part);
  sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
  result = build_and_insert_call (gsi, loc, sqrtfn, sum);

  return result;
}

/* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
   on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
   an optimal number of multiplies, when n is a constant.  */

namespace {

const pass_data pass_data_cse_sincos =
{
  GIMPLE_PASS, /* type */
  "sincos", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  PROP_gimple_opt_math, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_sincos : public gimple_opt_pass
{
public:
  pass_cse_sincos (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_sincos, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      /* We no longer require either sincos or cexp, since powi expansion
	 piggybacks on this pass.  */
      return optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_cse_sincos

unsigned int
pass_cse_sincos::execute (function *fun)
{
  basic_block bb;
  bool cfg_changed = false;

  calculate_dominance_info (CDI_DOMINATORS);
  memset (&sincos_stats, 0, sizeof (sincos_stats));

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      bool cleanup_eh = false;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Only the last stmt in a bb could throw, no need to call
	     gimple_purge_dead_eh_edges if we change something in the middle
	     of a basic block.  */
*/ 1802 cleanup_eh = false; 1803 1804 if (is_gimple_call (stmt) 1805 && gimple_call_lhs (stmt)) 1806 { 1807 tree arg, arg0, arg1, result; 1808 HOST_WIDE_INT n; 1809 location_t loc; 1810 1811 switch (gimple_call_combined_fn (stmt)) 1812 { 1813 CASE_CFN_COS: 1814 CASE_CFN_SIN: 1815 CASE_CFN_CEXPI: 1816 /* Make sure we have either sincos or cexp. */ 1817 if (!targetm.libc_has_function (function_c99_math_complex) 1818 && !targetm.libc_has_function (function_sincos)) 1819 break; 1820 1821 arg = gimple_call_arg (stmt, 0); 1822 if (TREE_CODE (arg) == SSA_NAME) 1823 cfg_changed |= execute_cse_sincos_1 (arg); 1824 break; 1825 1826 CASE_CFN_POW: 1827 arg0 = gimple_call_arg (stmt, 0); 1828 arg1 = gimple_call_arg (stmt, 1); 1829 1830 loc = gimple_location (stmt); 1831 result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1); 1832 1833 if (result) 1834 { 1835 tree lhs = gimple_get_lhs (stmt); 1836 gassign *new_stmt = gimple_build_assign (lhs, result); 1837 gimple_set_location (new_stmt, loc); 1838 unlink_stmt_vdef (stmt); 1839 gsi_replace (&gsi, new_stmt, true); 1840 cleanup_eh = true; 1841 if (gimple_vdef (stmt)) 1842 release_ssa_name (gimple_vdef (stmt)); 1843 } 1844 break; 1845 1846 CASE_CFN_POWI: 1847 arg0 = gimple_call_arg (stmt, 0); 1848 arg1 = gimple_call_arg (stmt, 1); 1849 loc = gimple_location (stmt); 1850 1851 if (real_minus_onep (arg0)) 1852 { 1853 tree t0, t1, cond, one, minus_one; 1854 gassign *stmt; 1855 1856 t0 = TREE_TYPE (arg0); 1857 t1 = TREE_TYPE (arg1); 1858 one = build_real (t0, dconst1); 1859 minus_one = build_real (t0, dconstm1); 1860 1861 cond = make_temp_ssa_name (t1, NULL, "powi_cond"); 1862 stmt = gimple_build_assign (cond, BIT_AND_EXPR, 1863 arg1, build_int_cst (t1, 1)); 1864 gimple_set_location (stmt, loc); 1865 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); 1866 1867 result = make_temp_ssa_name (t0, NULL, "powi"); 1868 stmt = gimple_build_assign (result, COND_EXPR, cond, 1869 minus_one, one); 1870 gimple_set_location (stmt, loc); 1871 gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); 1872 } 1873 else 1874 { 1875 if (!tree_fits_shwi_p (arg1)) 1876 break; 1877 1878 n = tree_to_shwi (arg1); 1879 result = gimple_expand_builtin_powi (&gsi, loc, arg0, n); 1880 } 1881 1882 if (result) 1883 { 1884 tree lhs = gimple_get_lhs (stmt); 1885 gassign *new_stmt = gimple_build_assign (lhs, result); 1886 gimple_set_location (new_stmt, loc); 1887 unlink_stmt_vdef (stmt); 1888 gsi_replace (&gsi, new_stmt, true); 1889 cleanup_eh = true; 1890 if (gimple_vdef (stmt)) 1891 release_ssa_name (gimple_vdef (stmt)); 1892 } 1893 break; 1894 1895 CASE_CFN_CABS: 1896 arg0 = gimple_call_arg (stmt, 0); 1897 loc = gimple_location (stmt); 1898 result = gimple_expand_builtin_cabs (&gsi, loc, arg0); 1899 1900 if (result) 1901 { 1902 tree lhs = gimple_get_lhs (stmt); 1903 gassign *new_stmt = gimple_build_assign (lhs, result); 1904 gimple_set_location (new_stmt, loc); 1905 unlink_stmt_vdef (stmt); 1906 gsi_replace (&gsi, new_stmt, true); 1907 cleanup_eh = true; 1908 if (gimple_vdef (stmt)) 1909 release_ssa_name (gimple_vdef (stmt)); 1910 } 1911 break; 1912 1913 default:; 1914 } 1915 } 1916 } 1917 if (cleanup_eh) 1918 cfg_changed |= gimple_purge_dead_eh_edges (bb); 1919 } 1920 1921 statistics_counter_event (fun, "sincos statements inserted", 1922 sincos_stats.inserted); 1923 1924 return cfg_changed ? 
TODO_cleanup_cfg : 0;
1925 }
1926
1927 } // anon namespace
1928
1929 gimple_opt_pass *
1930 make_pass_cse_sincos (gcc::context *ctxt)
1931 {
1932   return new pass_cse_sincos (ctxt);
1933 }
1934
1935 /* A symbolic number structure is used to detect byte permutation and selection
1936    patterns of a source.  To achieve that, its field N contains an artificial
1937    number consisting of BITS_PER_MARKER sized markers tracking where each
1938    byte comes from in the source:
1939
1940    0	    - target byte has the value 0
1941    FF	    - target byte has an unknown value (eg. due to sign extension)
1942    1..size - marker value is the byte index in the source (0 for lsb).
1943
1944    To detect permutations on memory sources (arrays and structures), a symbolic
1945    number is also associated with:
1946    - a base address BASE_ADDR and an OFFSET giving the address of the source;
1947    - a range which gives the difference between the highest and lowest accessed
1948      memory location to make such a symbolic number;
1949    - the address SRC of the source element of lowest address as a convenience
1950      to easily get BASE_ADDR + offset + lowest bytepos.
1951
1952    Note 1: the range is different from size as size reflects the size of the
1953    type of the current expression.  For instance, for an array char a[],
1954    (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1955    (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1956    time a range of 1.
1957
1958    Note 2: for non-memory sources, range holds the same value as size.
1959
1960    Note 3: SRC points to the SSA_NAME in case of non-memory source.  */
1961
1962 struct symbolic_number {
1963   uint64_t n;
1964   tree type;
1965   tree base_addr;
1966   tree offset;
1967   HOST_WIDE_INT bytepos;
1968   tree src;
1969   tree alias_set;
1970   tree vuse;
1971   unsigned HOST_WIDE_INT range;
1972 };
1973
1974 #define BITS_PER_MARKER 8
1975 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1976 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1977 #define HEAD_MARKER(n, size) \
1978   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1979
1980 /* The number which the find_bswap_or_nop_1 result should match in
1981    order to have a nop.  The number is masked according to the size of
1982    the symbolic number before using it.  */
1983 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1984   (uint64_t)0x08070605 << 32 | 0x04030201)
1985
1986 /* The number which the find_bswap_or_nop_1 result should match in
1987    order to have a byte swap.  The number is masked according to the
1988    size of the symbolic number before using it.  */
1989 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1990   (uint64_t)0x01020304 << 32 | 0x05060708)
1991
1992 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1993    number N.  Return false if the requested operation is not permitted
1994    on a symbolic number.  */
1995
1996 static inline bool
1997 do_shift_rotate (enum tree_code code,
1998		  struct symbolic_number *n,
1999		  int count)
2000 {
2001   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2002   unsigned head_marker;
2003
2004   if (count % BITS_PER_UNIT != 0)
2005     return false;
2006   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
2007
2008   /* Zero out the extra bits of N in order to avoid them being shifted
2009      into the significant bits.
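      For instance (illustration only): with a 4-byte symbolic number
      0x04030201, a logical right shift by one byte must yield 0x00040302;
      without this masking, stale marker bits sitting above the SIZE live
      bytes could be shifted down into the significant part.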
*/ 2010 if (size < 64 / BITS_PER_MARKER) 2011 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; 2012 2013 switch (code) 2014 { 2015 case LSHIFT_EXPR: 2016 n->n <<= count; 2017 break; 2018 case RSHIFT_EXPR: 2019 head_marker = HEAD_MARKER (n->n, size); 2020 n->n >>= count; 2021 /* Arithmetic shift of signed type: result is dependent on the value. */ 2022 if (!TYPE_UNSIGNED (n->type) && head_marker) 2023 for (i = 0; i < count / BITS_PER_MARKER; i++) 2024 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN 2025 << ((size - 1 - i) * BITS_PER_MARKER); 2026 break; 2027 case LROTATE_EXPR: 2028 n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count)); 2029 break; 2030 case RROTATE_EXPR: 2031 n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count)); 2032 break; 2033 default: 2034 return false; 2035 } 2036 /* Zero unused bits for size. */ 2037 if (size < 64 / BITS_PER_MARKER) 2038 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; 2039 return true; 2040} 2041 2042/* Perform sanity checking for the symbolic number N and the gimple 2043 statement STMT. */ 2044 2045static inline bool 2046verify_symbolic_number_p (struct symbolic_number *n, gimple *stmt) 2047{ 2048 tree lhs_type; 2049 2050 lhs_type = gimple_expr_type (stmt); 2051 2052 if (TREE_CODE (lhs_type) != INTEGER_TYPE) 2053 return false; 2054 2055 if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type)) 2056 return false; 2057 2058 return true; 2059} 2060 2061/* Initialize the symbolic number N for the bswap pass from the base element 2062 SRC manipulated by the bitwise OR expression. */ 2063 2064static bool 2065init_symbolic_number (struct symbolic_number *n, tree src) 2066{ 2067 int size; 2068 2069 if (! INTEGRAL_TYPE_P (TREE_TYPE (src))) 2070 return false; 2071 2072 n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE; 2073 n->src = src; 2074 2075 /* Set up the symbolic number N by setting each byte to a value between 1 and 2076 the byte size of rhs1. The highest order byte is set to n->size and the 2077 lowest order byte to 1. */ 2078 n->type = TREE_TYPE (src); 2079 size = TYPE_PRECISION (n->type); 2080 if (size % BITS_PER_UNIT != 0) 2081 return false; 2082 size /= BITS_PER_UNIT; 2083 if (size > 64 / BITS_PER_MARKER) 2084 return false; 2085 n->range = size; 2086 n->n = CMPNOP; 2087 2088 if (size < 64 / BITS_PER_MARKER) 2089 n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1; 2090 2091 return true; 2092} 2093 2094/* Check if STMT might be a byte swap or a nop from a memory source and returns 2095 the answer. If so, REF is that memory source and the base of the memory area 2096 accessed and the offset of the access from that base are recorded in N. */ 2097 2098bool 2099find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n) 2100{ 2101 /* Leaf node is an array or component ref. Memorize its base and 2102 offset from base to compare to other such leaf node. */ 2103 HOST_WIDE_INT bitsize, bitpos; 2104 machine_mode mode; 2105 int unsignedp, reversep, volatilep; 2106 tree offset, base_addr; 2107 2108 /* Not prepared to handle PDP endian. 
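      (A middle-endian target such as the PDP-11, which represented the
      32-bit value 0x01020304 as the byte sequence 02 01 04 03, would break
      the byte-position arithmetic used below.)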
*/
2109   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
2110     return false;
2111
2112   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
2113     return false;
2114
2115   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
2116				    &unsignedp, &reversep, &volatilep);
2117
2118   if (TREE_CODE (base_addr) == MEM_REF)
2119     {
2120       offset_int bit_offset = 0;
2121       tree off = TREE_OPERAND (base_addr, 1);
2122
2123       if (!integer_zerop (off))
2124	 {
2125	   offset_int boff, coff = mem_ref_offset (base_addr);
2126	   boff = coff << LOG2_BITS_PER_UNIT;
2127	   bit_offset += boff;
2128	 }
2129
2130       base_addr = TREE_OPERAND (base_addr, 0);
2131
2132       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
2133       if (wi::neg_p (bit_offset))
2134	 {
2135	   offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
2136	   offset_int tem = bit_offset.and_not (mask);
2137	   /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
2138	      Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
2139	   bit_offset -= tem;
2140	   tem >>= LOG2_BITS_PER_UNIT;
2141	   if (offset)
2142	     offset = size_binop (PLUS_EXPR, offset,
2143				  wide_int_to_tree (sizetype, tem));
2144	   else
2145	     offset = wide_int_to_tree (sizetype, tem);
2146	 }
2147
2148       bitpos += bit_offset.to_shwi ();
2149     }
2150
2151   if (bitpos % BITS_PER_UNIT)
2152     return false;
2153   if (bitsize % BITS_PER_UNIT)
2154     return false;
2155   if (reversep)
2156     return false;
2157
2158   if (!init_symbolic_number (n, ref))
2159     return false;
2160   n->base_addr = base_addr;
2161   n->offset = offset;
2162   n->bytepos = bitpos / BITS_PER_UNIT;
2163   n->alias_set = reference_alias_ptr_type (ref);
2164   n->vuse = gimple_vuse (stmt);
2165   return true;
2166 }
2167
2168 /* Compute the symbolic number N representing the result of a bitwise OR on
2169    the two symbolic numbers N1 and N2 whose source statements are respectively
2170    SOURCE_STMT1 and SOURCE_STMT2.  */
2171
2172 static gimple *
2173 perform_symbolic_merge (gimple *source_stmt1, struct symbolic_number *n1,
2174			 gimple *source_stmt2, struct symbolic_number *n2,
2175			 struct symbolic_number *n)
2176 {
2177   int i, size;
2178   uint64_t mask;
2179   gimple *source_stmt;
2180   struct symbolic_number *n_start;
2181
2182   tree rhs1 = gimple_assign_rhs1 (source_stmt1);
2183   if (TREE_CODE (rhs1) == BIT_FIELD_REF
2184       && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME)
2185     rhs1 = TREE_OPERAND (rhs1, 0);
2186   tree rhs2 = gimple_assign_rhs1 (source_stmt2);
2187   if (TREE_CODE (rhs2) == BIT_FIELD_REF
2188       && TREE_CODE (TREE_OPERAND (rhs2, 0)) == SSA_NAME)
2189     rhs2 = TREE_OPERAND (rhs2, 0);
2190
2191   /* Sources are different, cancel bswap if they are not memory locations with
2192      the same base (array, structure, ...).
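      For example, a[0] | (a[1] << 8) ORs two distinct loads that share the
      base A at byte positions 0 and 1, so the merge below proceeds; something
      like a[0] | b[0], with two different bases, gives up here.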
*/ 2193 if (rhs1 != rhs2) 2194 { 2195 uint64_t inc; 2196 HOST_WIDE_INT start_sub, end_sub, end1, end2, end; 2197 struct symbolic_number *toinc_n_ptr, *n_end; 2198 basic_block bb1, bb2; 2199 2200 if (!n1->base_addr || !n2->base_addr 2201 || !operand_equal_p (n1->base_addr, n2->base_addr, 0)) 2202 return NULL; 2203 2204 if (!n1->offset != !n2->offset 2205 || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0))) 2206 return NULL; 2207 2208 if (n1->bytepos < n2->bytepos) 2209 { 2210 n_start = n1; 2211 start_sub = n2->bytepos - n1->bytepos; 2212 } 2213 else 2214 { 2215 n_start = n2; 2216 start_sub = n1->bytepos - n2->bytepos; 2217 } 2218 2219 bb1 = gimple_bb (source_stmt1); 2220 bb2 = gimple_bb (source_stmt2); 2221 if (dominated_by_p (CDI_DOMINATORS, bb1, bb2)) 2222 source_stmt = source_stmt1; 2223 else 2224 source_stmt = source_stmt2; 2225 2226 /* Find the highest address at which a load is performed and 2227 compute related info. */ 2228 end1 = n1->bytepos + (n1->range - 1); 2229 end2 = n2->bytepos + (n2->range - 1); 2230 if (end1 < end2) 2231 { 2232 end = end2; 2233 end_sub = end2 - end1; 2234 } 2235 else 2236 { 2237 end = end1; 2238 end_sub = end1 - end2; 2239 } 2240 n_end = (end2 > end1) ? n2 : n1; 2241 2242 /* Find symbolic number whose lsb is the most significant. */ 2243 if (BYTES_BIG_ENDIAN) 2244 toinc_n_ptr = (n_end == n1) ? n2 : n1; 2245 else 2246 toinc_n_ptr = (n_start == n1) ? n2 : n1; 2247 2248 n->range = end - n_start->bytepos + 1; 2249 2250 /* Check that the range of memory covered can be represented by 2251 a symbolic number. */ 2252 if (n->range > 64 / BITS_PER_MARKER) 2253 return NULL; 2254 2255 /* Reinterpret byte marks in symbolic number holding the value of 2256 bigger weight according to target endianness. */ 2257 inc = BYTES_BIG_ENDIAN ? end_sub : start_sub; 2258 size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT; 2259 for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER) 2260 { 2261 unsigned marker 2262 = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK; 2263 if (marker && marker != MARKER_BYTE_UNKNOWN) 2264 toinc_n_ptr->n += inc; 2265 } 2266 } 2267 else 2268 { 2269 n->range = n1->range; 2270 n_start = n1; 2271 source_stmt = source_stmt1; 2272 } 2273 2274 if (!n1->alias_set 2275 || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set)) 2276 n->alias_set = n1->alias_set; 2277 else 2278 n->alias_set = ptr_type_node; 2279 n->vuse = n_start->vuse; 2280 n->base_addr = n_start->base_addr; 2281 n->offset = n_start->offset; 2282 n->src = n_start->src; 2283 n->bytepos = n_start->bytepos; 2284 n->type = n_start->type; 2285 size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; 2286 2287 for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER) 2288 { 2289 uint64_t masked1, masked2; 2290 2291 masked1 = n1->n & mask; 2292 masked2 = n2->n & mask; 2293 if (masked1 && masked2 && masked1 != masked2) 2294 return NULL; 2295 } 2296 n->n = n1->n | n2->n; 2297 2298 return source_stmt; 2299} 2300 2301/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform 2302 the operation given by the rhs of STMT on the result. If the operation 2303 could successfully be executed the function returns a gimple stmt whose 2304 rhs's first tree is the expression of the source operand and NULL 2305 otherwise. 
*/ 2306 2307static gimple * 2308find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit) 2309{ 2310 enum tree_code code; 2311 tree rhs1, rhs2 = NULL; 2312 gimple *rhs1_stmt, *rhs2_stmt, *source_stmt1; 2313 enum gimple_rhs_class rhs_class; 2314 2315 if (!limit || !is_gimple_assign (stmt)) 2316 return NULL; 2317 2318 rhs1 = gimple_assign_rhs1 (stmt); 2319 2320 if (find_bswap_or_nop_load (stmt, rhs1, n)) 2321 return stmt; 2322 2323 /* Handle BIT_FIELD_REF. */ 2324 if (TREE_CODE (rhs1) == BIT_FIELD_REF 2325 && TREE_CODE (TREE_OPERAND (rhs1, 0)) == SSA_NAME) 2326 { 2327 unsigned HOST_WIDE_INT bitsize = tree_to_uhwi (TREE_OPERAND (rhs1, 1)); 2328 unsigned HOST_WIDE_INT bitpos = tree_to_uhwi (TREE_OPERAND (rhs1, 2)); 2329 if (bitpos % BITS_PER_UNIT == 0 2330 && bitsize % BITS_PER_UNIT == 0 2331 && init_symbolic_number (n, TREE_OPERAND (rhs1, 0))) 2332 { 2333 /* Handle big-endian bit numbering in BIT_FIELD_REF. */ 2334 if (BYTES_BIG_ENDIAN) 2335 bitpos = TYPE_PRECISION (n->type) - bitpos - bitsize; 2336 2337 /* Shift. */ 2338 if (!do_shift_rotate (RSHIFT_EXPR, n, bitpos)) 2339 return NULL; 2340 2341 /* Mask. */ 2342 uint64_t mask = 0; 2343 uint64_t tmp = (1 << BITS_PER_UNIT) - 1; 2344 for (unsigned i = 0; i < bitsize / BITS_PER_UNIT; 2345 i++, tmp <<= BITS_PER_UNIT) 2346 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER); 2347 n->n &= mask; 2348 2349 /* Convert. */ 2350 n->type = TREE_TYPE (rhs1); 2351 if (!n->base_addr) 2352 n->range = TYPE_PRECISION (n->type) / BITS_PER_UNIT; 2353 2354 return verify_symbolic_number_p (n, stmt) ? stmt : NULL; 2355 } 2356 2357 return NULL; 2358 } 2359 2360 if (TREE_CODE (rhs1) != SSA_NAME) 2361 return NULL; 2362 2363 code = gimple_assign_rhs_code (stmt); 2364 rhs_class = gimple_assign_rhs_class (stmt); 2365 rhs1_stmt = SSA_NAME_DEF_STMT (rhs1); 2366 2367 if (rhs_class == GIMPLE_BINARY_RHS) 2368 rhs2 = gimple_assign_rhs2 (stmt); 2369 2370 /* Handle unary rhs and binary rhs with integer constants as second 2371 operand. */ 2372 2373 if (rhs_class == GIMPLE_UNARY_RHS 2374 || (rhs_class == GIMPLE_BINARY_RHS 2375 && TREE_CODE (rhs2) == INTEGER_CST)) 2376 { 2377 if (code != BIT_AND_EXPR 2378 && code != LSHIFT_EXPR 2379 && code != RSHIFT_EXPR 2380 && code != LROTATE_EXPR 2381 && code != RROTATE_EXPR 2382 && !CONVERT_EXPR_CODE_P (code)) 2383 return NULL; 2384 2385 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1); 2386 2387 /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and 2388 we have to initialize the symbolic number. */ 2389 if (!source_stmt1) 2390 { 2391 if (gimple_assign_load_p (stmt) 2392 || !init_symbolic_number (n, rhs1)) 2393 return NULL; 2394 source_stmt1 = stmt; 2395 } 2396 2397 switch (code) 2398 { 2399 case BIT_AND_EXPR: 2400 { 2401 int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; 2402 uint64_t val = int_cst_value (rhs2), mask = 0; 2403 uint64_t tmp = (1 << BITS_PER_UNIT) - 1; 2404 2405 /* Only constants masking full bytes are allowed. 
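	     For example, x & 0x00ff00ff is accepted and merely clears the
	     markers of the zeroed bytes, whereas x & 0x0ff0 covers two bytes
	     only partially and cancels the analysis.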
*/ 2406 for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT) 2407 if ((val & tmp) != 0 && (val & tmp) != tmp) 2408 return NULL; 2409 else if (val & tmp) 2410 mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER); 2411 2412 n->n &= mask; 2413 } 2414 break; 2415 case LSHIFT_EXPR: 2416 case RSHIFT_EXPR: 2417 case LROTATE_EXPR: 2418 case RROTATE_EXPR: 2419 if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2))) 2420 return NULL; 2421 break; 2422 CASE_CONVERT: 2423 { 2424 int i, type_size, old_type_size; 2425 tree type; 2426 2427 type = gimple_expr_type (stmt); 2428 type_size = TYPE_PRECISION (type); 2429 if (type_size % BITS_PER_UNIT != 0) 2430 return NULL; 2431 type_size /= BITS_PER_UNIT; 2432 if (type_size > 64 / BITS_PER_MARKER) 2433 return NULL; 2434 2435 /* Sign extension: result is dependent on the value. */ 2436 old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT; 2437 if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size 2438 && HEAD_MARKER (n->n, old_type_size)) 2439 for (i = 0; i < type_size - old_type_size; i++) 2440 n->n |= (uint64_t) MARKER_BYTE_UNKNOWN 2441 << ((type_size - 1 - i) * BITS_PER_MARKER); 2442 2443 if (type_size < 64 / BITS_PER_MARKER) 2444 { 2445 /* If STMT casts to a smaller type mask out the bits not 2446 belonging to the target type. */ 2447 n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1; 2448 } 2449 n->type = type; 2450 if (!n->base_addr) 2451 n->range = type_size; 2452 } 2453 break; 2454 default: 2455 return NULL; 2456 }; 2457 return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL; 2458 } 2459 2460 /* Handle binary rhs. */ 2461 2462 if (rhs_class == GIMPLE_BINARY_RHS) 2463 { 2464 struct symbolic_number n1, n2; 2465 gimple *source_stmt, *source_stmt2; 2466 2467 if (code != BIT_IOR_EXPR) 2468 return NULL; 2469 2470 if (TREE_CODE (rhs2) != SSA_NAME) 2471 return NULL; 2472 2473 rhs2_stmt = SSA_NAME_DEF_STMT (rhs2); 2474 2475 switch (code) 2476 { 2477 case BIT_IOR_EXPR: 2478 source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1); 2479 2480 if (!source_stmt1) 2481 return NULL; 2482 2483 source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1); 2484 2485 if (!source_stmt2) 2486 return NULL; 2487 2488 if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type)) 2489 return NULL; 2490 2491 if (!n1.vuse != !n2.vuse 2492 || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0))) 2493 return NULL; 2494 2495 source_stmt 2496 = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n); 2497 2498 if (!source_stmt) 2499 return NULL; 2500 2501 if (!verify_symbolic_number_p (n, stmt)) 2502 return NULL; 2503 2504 break; 2505 default: 2506 return NULL; 2507 } 2508 return source_stmt; 2509 } 2510 return NULL; 2511} 2512 2513/* Check if STMT completes a bswap implementation or a read in a given 2514 endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP 2515 accordingly. It also sets N to represent the kind of operations 2516 performed: size of the resulting expression and whether it works on 2517 a memory source, and if so alias-set and vuse. At last, the 2518 function returns a stmt whose rhs's first tree is the source 2519 expression. */ 2520 2521static gimple * 2522find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) 2523{ 2524 unsigned rsize; 2525 uint64_t tmpn, mask; 2526/* The number which the find_bswap_or_nop_1 result should match in order 2527 to have a full byte swap. The number is shifted to the right 2528 according to the size of the symbolic number before using it. 
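   For instance, for a 2-byte symbolic number the adjusted comparison values
   computed below are 0x0102 for a byte swap and 0x0201 for a nop.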
*/
2529   uint64_t cmpxchg = CMPXCHG;
2530   uint64_t cmpnop = CMPNOP;
2531
2532   gimple *ins_stmt;
2533   int limit;
2534
2535   /* The last parameter determines the search depth limit.  It usually
2536      correlates directly to the number n of bytes to be touched.  We
2537      increase that number by log2(n) + 1 here in order to also
2538      cover signed -> unsigned conversions of the src operand as can be seen
2539      in libgcc, and for initial shift/and operation of the src operand.  */
2540   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2541   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2542   ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2543
2544   if (!ins_stmt)
2545     return NULL;
2546
2547   /* Find real size of result (highest non-zero byte).  */
2548   if (n->base_addr)
2549     for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2550   else
2551     rsize = n->range;
2552
2553   /* Zero out the bits corresponding to untouched bytes in original gimple
2554      expression.  */
2555   if (n->range < (int) sizeof (int64_t))
2556     {
2557       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2558       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2559       cmpnop &= mask;
2560     }
2561
2562   /* Zero out the bits corresponding to unused bytes in the result of the
2563      gimple expression.  */
2564   if (rsize < n->range)
2565     {
2566       if (BYTES_BIG_ENDIAN)
2567	 {
2568	   mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2569	   cmpxchg &= mask;
2570	   cmpnop >>= (n->range - rsize) * BITS_PER_MARKER;
2571	 }
2572       else
2573	 {
2574	   mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1;
2575	   cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER;
2576	   cmpnop &= mask;
2577	 }
2578       n->range = rsize;
2579     }
2580
2581   /* A complete byte swap should make the symbolic number start with
2582      the largest digit in the highest order byte.  An unchanged symbolic
2583      number indicates a read with the same endianness as the target.  */
2584   if (n->n == cmpnop)
2585     *bswap = false;
2586   else if (n->n == cmpxchg)
2587     *bswap = true;
2588   else
2589     return NULL;
2590
2591   /* Useless bit manipulation performed by code.  */
2592   if (!n->base_addr && n->n == cmpnop)
2593     return NULL;
2594
2595   n->range *= BITS_PER_UNIT;
2596   return ins_stmt;
2597 }
2598
2599 namespace {
2600
2601 const pass_data pass_data_optimize_bswap =
2602 {
2603   GIMPLE_PASS, /* type */
2604   "bswap", /* name */
2605   OPTGROUP_NONE, /* optinfo_flags */
2606   TV_NONE, /* tv_id */
2607   PROP_ssa, /* properties_required */
2608   0, /* properties_provided */
2609   0, /* properties_destroyed */
2610   0, /* todo_flags_start */
2611   0, /* todo_flags_finish */
2612 };
2613
2614 class pass_optimize_bswap : public gimple_opt_pass
2615 {
2616 public:
2617   pass_optimize_bswap (gcc::context *ctxt)
2618     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2619   {}
2620
2621   /* opt_pass methods: */
2622   virtual bool gate (function *)
2623   {
2624     return flag_expensive_optimizations && optimize;
2625   }
2626
2627   virtual unsigned int execute (function *);
2628
2629 }; // class pass_optimize_bswap
2630
2631 /* Perform the bswap optimization: replace the expression computed in the rhs
2632    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2633    Which of these alternatives replaces the rhs is given by N->base_addr (non
2634    null if a load is needed) and BSWAP.  The type, VUSE and alias-set of the
2635    load to perform are also given in N while the builtin bswap invocation is
2636    given in FNDECL.
Finally, if a load is involved, SRC_STMT refers to one of the
2637    load statements involved in constructing the rhs in CUR_STMT and N->range
2638    gives the size of the rhs expression for maintaining some statistics.
2639
2640    Note that if the replacement involves a load, CUR_STMT is moved just after
2641    SRC_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2642    changing basic block.  */
2643
2644 static bool
2645 bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl,
2646		tree bswap_type, tree load_type, struct symbolic_number *n,
2647		bool bswap)
2648 {
2649   gimple_stmt_iterator gsi;
2650   tree src, tmp, tgt;
2651   gimple *bswap_stmt;
2652
2653   gsi = gsi_for_stmt (cur_stmt);
2654   src = n->src;
2655   tgt = gimple_assign_lhs (cur_stmt);
2656
2657   /* Need to load the value from memory first.  */
2658   if (n->base_addr)
2659     {
2660       gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2661       tree addr_expr, addr_tmp, val_expr, val_tmp;
2662       tree load_offset_ptr, aligned_load_type;
2663       gimple *addr_stmt, *load_stmt;
2664       unsigned align;
2665       HOST_WIDE_INT load_offset = 0;
2666       basic_block ins_bb, cur_bb;
2667
2668       ins_bb = gimple_bb (ins_stmt);
2669       cur_bb = gimple_bb (cur_stmt);
2670       if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2671	 return false;
2672
2673       align = get_object_alignment (src);
2674
2675       /* Move cur_stmt just before one of the loads of the original
2676	  to ensure it has the same VUSE.  See PR61517 for what could
2677	  go wrong.  */
2678       if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2679	 reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2680       gsi_move_before (&gsi, &gsi_ins);
2681       gsi = gsi_for_stmt (cur_stmt);
2682
2683       /* Compute address to load from and cast according to the size
2684	  of the load.  */
2685       addr_expr = build_fold_addr_expr (unshare_expr (src));
2686       if (is_gimple_mem_ref_addr (addr_expr))
2687	 addr_tmp = addr_expr;
2688       else
2689	 {
2690	   addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2691					  "load_src");
2692	   addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2693	   gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2694	 }
2695
2696       /* Perform the load.  */
2697       aligned_load_type = load_type;
2698       if (align < TYPE_ALIGN (load_type))
2699	 aligned_load_type = build_aligned_type (load_type, align);
2700       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2701       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2702			       load_offset_ptr);
2703
2704       if (!bswap)
2705	 {
2706	   if (n->range == 16)
2707	     nop_stats.found_16bit++;
2708	   else if (n->range == 32)
2709	     nop_stats.found_32bit++;
2710	   else
2711	     {
2712	       gcc_assert (n->range == 64);
2713	       nop_stats.found_64bit++;
2714	     }
2715
2716	   /* Convert the result of the load if necessary.
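	      That is, when the lhs type differs from LOAD_TYPE, the value is
	      loaded into a fresh temporary and the original statement is
	      turned into a NOP_EXPR conversion of it; otherwise the original
	      statement itself becomes the load.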
*/
2717	   if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2718	     {
2719	       val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2720					     "load_dst");
2721	       load_stmt = gimple_build_assign (val_tmp, val_expr);
2722	       gimple_set_vuse (load_stmt, n->vuse);
2723	       gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2724	       gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2725	     }
2726	   else
2727	     {
2728	       gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2729	       gimple_set_vuse (cur_stmt, n->vuse);
2730	     }
2731	   update_stmt (cur_stmt);
2732
2733	   if (dump_file)
2734	     {
2735	       fprintf (dump_file,
2736			"%d bit load in target endianness found at: ",
2737			(int) n->range);
2738	       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2739	     }
2740	   return true;
2741	 }
2742       else
2743	 {
2744	   val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2745	   load_stmt = gimple_build_assign (val_tmp, val_expr);
2746	   gimple_set_vuse (load_stmt, n->vuse);
2747	   gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2748	 }
2749       src = val_tmp;
2750     }
2751   else if (TREE_CODE (src) == BIT_FIELD_REF)
2752     src = TREE_OPERAND (src, 0);
2753
2754   if (n->range == 16)
2755     bswap_stats.found_16bit++;
2756   else if (n->range == 32)
2757     bswap_stats.found_32bit++;
2758   else
2759     {
2760       gcc_assert (n->range == 64);
2761       bswap_stats.found_64bit++;
2762     }
2763
2764   tmp = src;
2765
2766   /* Convert the src expression if necessary.  */
2767   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2768     {
2769       gimple *convert_stmt;
2770
2771       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2772       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2773       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2774     }
2775
2776   /* Canonical form for 16 bit bswap is a rotate expression.  Only 16bit
2777      values are considered, as rotation of 2N bit values by N bits is
2778      generally not equivalent to a bswap.  Consider for instance
2779      0x01020304 r>> 16 which gives 0x03040102 while a bswap for that value
2780      is 0x04030201.  */
2780   if (bswap && n->range == 16)
2781     {
2782       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2783       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2784       bswap_stmt = gimple_build_assign (NULL, src);
2785     }
2786   else
2787     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2788
2789   tmp = tgt;
2790
2791   /* Convert the result if necessary.  */
2792   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2793     {
2794       gimple *convert_stmt;
2795
2796       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2797       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2798       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2799     }
2800
2801   gimple_set_lhs (bswap_stmt, tmp);
2802
2803   if (dump_file)
2804     {
2805       fprintf (dump_file, "%d bit bswap implementation found at: ",
2806		(int) n->range);
2807       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2808     }
2809
2810   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2811   gsi_remove (&gsi, true);
2812   return true;
2813 }
2814
2815 /* Find manual byte swap implementations as well as loads in a given
2816    endianness.  Byte swaps are turned into a bswap builtin invocation
2817    while endian loads are converted to a bswap builtin invocation or a
2818    simple load according to the target endianness.
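    For illustration, a hand-written sequence such as

      uint32_t
      swap32 (uint32_t x)
      {
	return ((x & 0x000000ff) << 24)
	       | ((x & 0x0000ff00) << 8)
	       | ((x & 0x00ff0000) >> 8)
	       | ((x & 0xff000000) >> 24);
      }

    is the kind of code this pass rewrites into a single __builtin_bswap32
    call, while the analogous byte-per-byte load sequence that assembles a
    value in target order is turned into a single plain load.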
*/
2819
2820 unsigned int
2821 pass_optimize_bswap::execute (function *fun)
2822 {
2823   basic_block bb;
2824   bool bswap32_p, bswap64_p;
2825   bool changed = false;
2826   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2827
2828   if (BITS_PER_UNIT != 8)
2829     return 0;
2830
2831   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2832	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2833   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2834	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2835		   || (bswap32_p && word_mode == SImode)));
2836
2837   /* Determine the argument type of the builtins.  The code later on
2838      assumes that the return and argument type are the same.  */
2839   if (bswap32_p)
2840     {
2841       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2842       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2843     }
2844
2845   if (bswap64_p)
2846     {
2847       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2848       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2849     }
2850
2851   memset (&nop_stats, 0, sizeof (nop_stats));
2852   memset (&bswap_stats, 0, sizeof (bswap_stats));
2853   calculate_dominance_info (CDI_DOMINATORS);
2854
2855   FOR_EACH_BB_FN (bb, fun)
2856     {
2857       gimple_stmt_iterator gsi;
2858
2859       /* We do a reverse scan for bswap patterns to make sure we get the
2860	  widest match.  As bswap pattern matching doesn't handle previously
2861	  inserted smaller bswap replacements as sub-patterns, the wider
2862	  variant wouldn't be detected.  */
2863       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2864	 {
2865	   gimple *ins_stmt, *cur_stmt = gsi_stmt (gsi);
2866	   tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2867	   enum tree_code code;
2868	   struct symbolic_number n;
2869	   bool bswap;
2870
2871	   /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2872	      might be moved to a different basic block by bswap_replace and gsi
2873	      must not point to it if that's the case.  Moving the gsi_prev
2874	      there makes sure that gsi points to the statement previous to
2875	      cur_stmt while still making sure that all statements are
2876	      considered in this basic block.  */
2877	   gsi_prev (&gsi);
2878
2879	   if (!is_gimple_assign (cur_stmt))
2880	     continue;
2881
2882	   code = gimple_assign_rhs_code (cur_stmt);
2883	   switch (code)
2884	     {
2885	     case LROTATE_EXPR:
2886	     case RROTATE_EXPR:
2887	       if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2888		   || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2889		      % BITS_PER_UNIT)
2890		 continue;
2891	       /* Fall through.  */
2892	     case BIT_IOR_EXPR:
2893	       break;
2894	     default:
2895	       continue;
2896	     }
2897
2898	   ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2899
2900	   if (!ins_stmt)
2901	     continue;
2902
2903	   switch (n.range)
2904	     {
2905	     case 16:
2906	       /* Already in canonical form, nothing to do.
*/ 2907 if (code == LROTATE_EXPR || code == RROTATE_EXPR) 2908 continue; 2909 load_type = bswap_type = uint16_type_node; 2910 break; 2911 case 32: 2912 load_type = uint32_type_node; 2913 if (bswap32_p) 2914 { 2915 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); 2916 bswap_type = bswap32_type; 2917 } 2918 break; 2919 case 64: 2920 load_type = uint64_type_node; 2921 if (bswap64_p) 2922 { 2923 fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); 2924 bswap_type = bswap64_type; 2925 } 2926 break; 2927 default: 2928 continue; 2929 } 2930 2931 if (bswap && !fndecl && n.range != 16) 2932 continue; 2933 2934 if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type, 2935 &n, bswap)) 2936 changed = true; 2937 } 2938 } 2939 2940 statistics_counter_event (fun, "16-bit nop implementations found", 2941 nop_stats.found_16bit); 2942 statistics_counter_event (fun, "32-bit nop implementations found", 2943 nop_stats.found_32bit); 2944 statistics_counter_event (fun, "64-bit nop implementations found", 2945 nop_stats.found_64bit); 2946 statistics_counter_event (fun, "16-bit bswap implementations found", 2947 bswap_stats.found_16bit); 2948 statistics_counter_event (fun, "32-bit bswap implementations found", 2949 bswap_stats.found_32bit); 2950 statistics_counter_event (fun, "64-bit bswap implementations found", 2951 bswap_stats.found_64bit); 2952 2953 return (changed ? TODO_update_ssa : 0); 2954} 2955 2956} // anon namespace 2957 2958gimple_opt_pass * 2959make_pass_optimize_bswap (gcc::context *ctxt) 2960{ 2961 return new pass_optimize_bswap (ctxt); 2962} 2963 2964/* Return true if stmt is a type conversion operation that can be stripped 2965 when used in a widening multiply operation. */ 2966static bool 2967widening_mult_conversion_strippable_p (tree result_type, gimple *stmt) 2968{ 2969 enum tree_code rhs_code = gimple_assign_rhs_code (stmt); 2970 2971 if (TREE_CODE (result_type) == INTEGER_TYPE) 2972 { 2973 tree op_type; 2974 tree inner_op_type; 2975 2976 if (!CONVERT_EXPR_CODE_P (rhs_code)) 2977 return false; 2978 2979 op_type = TREE_TYPE (gimple_assign_lhs (stmt)); 2980 2981 /* If the type of OP has the same precision as the result, then 2982 we can strip this conversion. The multiply operation will be 2983 selected to create the correct extension as a by-product. */ 2984 if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type)) 2985 return true; 2986 2987 /* We can also strip a conversion if it preserves the signed-ness of 2988 the operation and doesn't narrow the range. */ 2989 inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt)); 2990 2991 /* If the inner-most type is unsigned, then we can strip any 2992 intermediate widening operation. If it's signed, then the 2993 intermediate widening operation must also be signed. */ 2994 if ((TYPE_UNSIGNED (inner_op_type) 2995 || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type)) 2996 && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type)) 2997 return true; 2998 2999 return false; 3000 } 3001 3002 return rhs_code == FIXED_CONVERT_EXPR; 3003} 3004 3005/* Return true if RHS is a suitable operand for a widening multiplication, 3006 assuming a target type of TYPE. 3007 There are two cases: 3008 3009 - RHS makes some value at least twice as wide. Store that value 3010 in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT. 3011 3012 - RHS is an integer constant. Store that value in *NEW_RHS_OUT if so, 3013 but leave *TYPE_OUT untouched. 
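   As a concrete (hypothetical) instance: with TYPE the 64-bit integer type
   and RHS defined by w_1 = (int64_t) i_2, where i_2 has a 32-bit type, i_2
   is stored in *NEW_RHS_OUT and the 32-bit type in *TYPE_OUT.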
*/ 3014 3015static bool 3016is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out, 3017 tree *new_rhs_out) 3018{ 3019 gimple *stmt; 3020 tree type1, rhs1; 3021 3022 if (TREE_CODE (rhs) == SSA_NAME) 3023 { 3024 stmt = SSA_NAME_DEF_STMT (rhs); 3025 if (is_gimple_assign (stmt)) 3026 { 3027 if (! widening_mult_conversion_strippable_p (type, stmt)) 3028 rhs1 = rhs; 3029 else 3030 { 3031 rhs1 = gimple_assign_rhs1 (stmt); 3032 3033 if (TREE_CODE (rhs1) == INTEGER_CST) 3034 { 3035 *new_rhs_out = rhs1; 3036 *type_out = NULL; 3037 return true; 3038 } 3039 } 3040 } 3041 else 3042 rhs1 = rhs; 3043 3044 type1 = TREE_TYPE (rhs1); 3045 3046 if (TREE_CODE (type1) != TREE_CODE (type) 3047 || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type)) 3048 return false; 3049 3050 *new_rhs_out = rhs1; 3051 *type_out = type1; 3052 return true; 3053 } 3054 3055 if (TREE_CODE (rhs) == INTEGER_CST) 3056 { 3057 *new_rhs_out = rhs; 3058 *type_out = NULL; 3059 return true; 3060 } 3061 3062 return false; 3063} 3064 3065/* Return true if STMT performs a widening multiplication, assuming the 3066 output type is TYPE. If so, store the unwidened types of the operands 3067 in *TYPE1_OUT and *TYPE2_OUT respectively. Also fill *RHS1_OUT and 3068 *RHS2_OUT such that converting those operands to types *TYPE1_OUT 3069 and *TYPE2_OUT would give the operands of the multiplication. */ 3070 3071static bool 3072is_widening_mult_p (gimple *stmt, 3073 tree *type1_out, tree *rhs1_out, 3074 tree *type2_out, tree *rhs2_out) 3075{ 3076 tree type = TREE_TYPE (gimple_assign_lhs (stmt)); 3077 3078 if (TREE_CODE (type) == INTEGER_TYPE) 3079 { 3080 if (TYPE_OVERFLOW_TRAPS (type)) 3081 return false; 3082 } 3083 else if (TREE_CODE (type) != FIXED_POINT_TYPE) 3084 return false; 3085 3086 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out, 3087 rhs1_out)) 3088 return false; 3089 3090 if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out, 3091 rhs2_out)) 3092 return false; 3093 3094 if (*type1_out == NULL) 3095 { 3096 if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out)) 3097 return false; 3098 *type1_out = *type2_out; 3099 } 3100 3101 if (*type2_out == NULL) 3102 { 3103 if (!int_fits_type_p (*rhs2_out, *type1_out)) 3104 return false; 3105 *type2_out = *type1_out; 3106 } 3107 3108 /* Ensure that the larger of the two operands comes first. */ 3109 if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out)) 3110 { 3111 std::swap (*type1_out, *type2_out); 3112 std::swap (*rhs1_out, *rhs2_out); 3113 } 3114 3115 return true; 3116} 3117 3118/* Process a single gimple statement STMT, which has a MULT_EXPR as 3119 its rhs, and try to convert it into a WIDEN_MULT_EXPR. The return 3120 value is true iff we converted the statement. 
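   For example (an illustrative function, not one from this file):

     int64_t
     mul64 (int32_t a, int32_t b)
     {
       return (int64_t) a * (int64_t) b;
     }

   contains a MULT_EXPR of two widened 32-bit operands that is rewritten here
   to WIDEN_MULT_EXPR <a, b>, which in turn can be expanded with a single
   widening multiply instruction on targets that provide one.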
*/
3121
3122 static bool
3123 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
3124 {
3125   tree lhs, rhs1, rhs2, type, type1, type2;
3126   enum insn_code handler;
3127   machine_mode to_mode, from_mode, actual_mode;
3128   optab op;
3129   int actual_precision;
3130   location_t loc = gimple_location (stmt);
3131   bool from_unsigned1, from_unsigned2;
3132
3133   lhs = gimple_assign_lhs (stmt);
3134   type = TREE_TYPE (lhs);
3135   if (TREE_CODE (type) != INTEGER_TYPE)
3136     return false;
3137
3138   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
3139     return false;
3140
3141   to_mode = TYPE_MODE (type);
3142   from_mode = TYPE_MODE (type1);
3143   from_unsigned1 = TYPE_UNSIGNED (type1);
3144   from_unsigned2 = TYPE_UNSIGNED (type2);
3145
3146   if (from_unsigned1 && from_unsigned2)
3147     op = umul_widen_optab;
3148   else if (!from_unsigned1 && !from_unsigned2)
3149     op = smul_widen_optab;
3150   else
3151     op = usmul_widen_optab;
3152
3153   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
3154						   0, &actual_mode);
3155
3156   if (handler == CODE_FOR_nothing)
3157     {
3158       if (op != smul_widen_optab)
3159	 {
3160	   /* We can use a signed multiply with unsigned types as long as
3161	      there is a wider mode to use, or it is the smaller of the two
3162	      types that is unsigned.  Note that type1 >= type2, always.  */
3163	   if ((TYPE_UNSIGNED (type1)
3164		&& TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3165	       || (TYPE_UNSIGNED (type2)
3166		   && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3167	     {
3168	       from_mode = GET_MODE_WIDER_MODE (from_mode);
3169	       if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
3170		 return false;
3171	     }
3172
3173	   op = smul_widen_optab;
3174	   handler = find_widening_optab_handler_and_mode (op, to_mode,
3175							   from_mode, 0,
3176							   &actual_mode);
3177
3178	   if (handler == CODE_FOR_nothing)
3179	     return false;
3180
3181	   from_unsigned1 = from_unsigned2 = false;
3182	 }
3183       else
3184	 return false;
3185     }
3186
3187   /* Ensure that the inputs to the handler are in the correct precision
3188      for the opcode.  This will be the full mode size.  */
3189   actual_precision = GET_MODE_PRECISION (actual_mode);
3190   if (2 * actual_precision > TYPE_PRECISION (type))
3191     return false;
3192   if (actual_precision != TYPE_PRECISION (type1)
3193       || from_unsigned1 != TYPE_UNSIGNED (type1))
3194     rhs1 = build_and_insert_cast (gsi, loc,
3195				   build_nonstandard_integer_type
3196				   (actual_precision, from_unsigned1), rhs1);
3197   if (actual_precision != TYPE_PRECISION (type2)
3198       || from_unsigned2 != TYPE_UNSIGNED (type2))
3199     rhs2 = build_and_insert_cast (gsi, loc,
3200				   build_nonstandard_integer_type
3201				   (actual_precision, from_unsigned2), rhs2);
3202
3203   /* Handle constants.  */
3204   if (TREE_CODE (rhs1) == INTEGER_CST)
3205     rhs1 = fold_convert (type1, rhs1);
3206   if (TREE_CODE (rhs2) == INTEGER_CST)
3207     rhs2 = fold_convert (type2, rhs2);
3208
3209   gimple_assign_set_rhs1 (stmt, rhs1);
3210   gimple_assign_set_rhs2 (stmt, rhs2);
3211   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
3212   update_stmt (stmt);
3213   widen_mul_stats.widen_mults_inserted++;
3214   return true;
3215 }
3216
3217 /* Process a single gimple statement STMT, which is found at the
3218    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
3219    rhs (given by CODE), and try to convert it into a
3220    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
3221    is true iff we converted the statement.
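    For example, an accumulation such as
    acc_3 = acc_2 + (int64_t) a_1 * (int64_t) b_1, with 32-bit a_1 and b_1,
    becomes acc_3 = WIDEN_MULT_PLUS_EXPR <a_1, b_1, acc_2>, which matches a
    multiply-and-accumulate instruction where the target has one (the SSA
    names are illustrative).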
*/
3222
3223 static bool
3224 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
3225			     enum tree_code code)
3226 {
3227   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
3228   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
3229   tree type, type1, type2, optype;
3230   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
3231   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
3232   optab this_optab;
3233   enum tree_code wmult_code;
3234   enum insn_code handler;
3235   machine_mode to_mode, from_mode, actual_mode;
3236   location_t loc = gimple_location (stmt);
3237   int actual_precision;
3238   bool from_unsigned1, from_unsigned2;
3239
3240   lhs = gimple_assign_lhs (stmt);
3241   type = TREE_TYPE (lhs);
3242   if (TREE_CODE (type) != INTEGER_TYPE
3243       && TREE_CODE (type) != FIXED_POINT_TYPE)
3244     return false;
3245
3246   if (code == MINUS_EXPR)
3247     wmult_code = WIDEN_MULT_MINUS_EXPR;
3248   else
3249     wmult_code = WIDEN_MULT_PLUS_EXPR;
3250
3251   rhs1 = gimple_assign_rhs1 (stmt);
3252   rhs2 = gimple_assign_rhs2 (stmt);
3253
3254   if (TREE_CODE (rhs1) == SSA_NAME)
3255     {
3256       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3257       if (is_gimple_assign (rhs1_stmt))
3258	 rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3259     }
3260
3261   if (TREE_CODE (rhs2) == SSA_NAME)
3262     {
3263       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3264       if (is_gimple_assign (rhs2_stmt))
3265	 rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3266     }
3267
3268   /* Allow for one conversion statement between the multiply
3269      and addition/subtraction statement.  If there is more than one
3270      conversion then we assume they would invalidate this
3271      transformation.  If that's not the case then they should have
3272      been folded before now.  */
3273   if (CONVERT_EXPR_CODE_P (rhs1_code))
3274     {
3275       conv1_stmt = rhs1_stmt;
3276       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
3277       if (TREE_CODE (rhs1) == SSA_NAME)
3278	 {
3279	   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
3280	   if (is_gimple_assign (rhs1_stmt))
3281	     rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
3282	 }
3283       else
3284	 return false;
3285     }
3286   if (CONVERT_EXPR_CODE_P (rhs2_code))
3287     {
3288       conv2_stmt = rhs2_stmt;
3289       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
3290       if (TREE_CODE (rhs2) == SSA_NAME)
3291	 {
3292	   rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
3293	   if (is_gimple_assign (rhs2_stmt))
3294	     rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
3295	 }
3296       else
3297	 return false;
3298     }
3299
3300   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
3301      is_widening_mult_p, but we still need the rhs returns.
3302
3303      It might also appear that it would be sufficient to use the existing
3304      operands of the widening multiply, but that would limit the choice of
3305      multiply-and-accumulate instructions.
3306
3307      If the widened-multiplication result has more than one use, it is
3308      probably wiser not to do the conversion.
*/
3309   if (code == PLUS_EXPR
3310       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
3311     {
3312       if (!has_single_use (rhs1)
3313	   || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
3314				   &type2, &mult_rhs2))
3315	 return false;
3316       add_rhs = rhs2;
3317       conv_stmt = conv1_stmt;
3318     }
3319   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
3320     {
3321       if (!has_single_use (rhs2)
3322	   || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
3323				   &type2, &mult_rhs2))
3324	 return false;
3325       add_rhs = rhs1;
3326       conv_stmt = conv2_stmt;
3327     }
3328   else
3329     return false;
3330
3331   to_mode = TYPE_MODE (type);
3332   from_mode = TYPE_MODE (type1);
3333   from_unsigned1 = TYPE_UNSIGNED (type1);
3334   from_unsigned2 = TYPE_UNSIGNED (type2);
3335   optype = type1;
3336
3337   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
3338   if (from_unsigned1 != from_unsigned2)
3339     {
3340       if (!INTEGRAL_TYPE_P (type))
3341	 return false;
3342       /* We can use a signed multiply with unsigned types as long as
3343	  there is a wider mode to use, or it is the smaller of the two
3344	  types that is unsigned.  Note that type1 >= type2, always.  */
3345       if ((from_unsigned1
3346	    && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
3347	   || (from_unsigned2
3348	       && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
3349	 {
3350	   from_mode = GET_MODE_WIDER_MODE (from_mode);
3351	   if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
3352	     return false;
3353	 }
3354
3355       from_unsigned1 = from_unsigned2 = false;
3356       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3357						false);
3358     }
3359
3360   /* If there was a conversion between the multiply and addition
3361      then we need to make sure it fits a multiply-and-accumulate.
3362      There should be a single mode change which does not change the
3363      value.  */
3364   if (conv_stmt)
3365     {
3366       /* We use the original, unmodified data types for this.  */
3367       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3368       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3369       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3370       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3371
3372       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3373	 {
3374	   /* Conversion is a truncate.  */
3375	   if (TYPE_PRECISION (to_type) < data_size)
3376	     return false;
3377	 }
3378       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3379	 {
3380	   /* Conversion is an extend.  Check it's the right sort.  */
3381	   if (TYPE_UNSIGNED (from_type) != is_unsigned
3382	       && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3383	     return false;
3384	 }
3385       /* else convert is a no-op for our purposes.  */
3386     }
3387
3388   /* Verify that the machine can perform a widening multiply
3389      accumulate in this mode/signedness combination, otherwise
3390      this transformation is likely to pessimize code.  */
3391   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3392   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3393						   from_mode, 0, &actual_mode);
3394
3395   if (handler == CODE_FOR_nothing)
3396     return false;
3397
3398   /* Ensure that the inputs to the handler are in the correct precision
3399      for the opcode.  This will be the full mode size.
*/
3400   actual_precision = GET_MODE_PRECISION (actual_mode);
3401   if (actual_precision != TYPE_PRECISION (type1)
3402       || from_unsigned1 != TYPE_UNSIGNED (type1))
3403     mult_rhs1 = build_and_insert_cast (gsi, loc,
3404					build_nonstandard_integer_type
3405					(actual_precision, from_unsigned1),
3406					mult_rhs1);
3407   if (actual_precision != TYPE_PRECISION (type2)
3408       || from_unsigned2 != TYPE_UNSIGNED (type2))
3409     mult_rhs2 = build_and_insert_cast (gsi, loc,
3410					build_nonstandard_integer_type
3411					(actual_precision, from_unsigned2),
3412					mult_rhs2);
3413
3414   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3415     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3416
3417   /* Handle constants.  */
3418   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3419     mult_rhs1 = fold_convert (type1, mult_rhs1);
3420   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3421     mult_rhs2 = fold_convert (type2, mult_rhs2);
3422
3423   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3424				   add_rhs);
3425   update_stmt (gsi_stmt (*gsi));
3426   widen_mul_stats.maccs_inserted++;
3427   return true;
3428 }
3429
3430 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3431    with uses in additions and subtractions to form fused multiply-add
3432    operations.  Returns true if successful and MUL_STMT should be removed.  */
3433
3434 static bool
3435 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2)
3436 {
3437   tree mul_result = gimple_get_lhs (mul_stmt);
3438   tree type = TREE_TYPE (mul_result);
3439   gimple *use_stmt, *neguse_stmt;
3440   gassign *fma_stmt;
3441   use_operand_p use_p;
3442   imm_use_iterator imm_iter;
3443
3444   if (FLOAT_TYPE_P (type)
3445       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3446     return false;
3447
3448   /* We don't want to do bitfield reduction ops.  */
3449   if (INTEGRAL_TYPE_P (type)
3450       && (TYPE_PRECISION (type)
3451	   != GET_MODE_PRECISION (TYPE_MODE (type))
3452	   || TYPE_OVERFLOW_TRAPS (type)))
3453     return false;
3454
3455   /* If the target doesn't support it, don't generate it.  We assume that
3456      if fma isn't available then fms, fnma or fnms are not either.  */
3457   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3458     return false;
3459
3460   /* If the multiplication has zero uses, it is kept around probably because
3461      of -fnon-call-exceptions.  Don't optimize it away in that case,
3462      it is DCE's job.  */
3463   if (has_zero_uses (mul_result))
3464     return false;
3465
3466   /* Make sure that the multiplication statement becomes dead after
3467      the transformation, thus that all uses are transformed to FMAs.
3468      This means we assume that an FMA operation has the same cost
3469      as an addition.  */
3470   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3471     {
3472       enum tree_code use_code;
3473       tree result = mul_result;
3474       bool negate_p = false;
3475
3476       use_stmt = USE_STMT (use_p);
3477
3478       if (is_gimple_debug (use_stmt))
3479	 continue;
3480
3481       /* For now restrict these operations to single basic blocks.  In theory
3482	  we would want to support sinking the multiplication in
3483	  m = a*b;
3484	  if ()
3485	    ma = m + c;
3486	  else
3487	    d = m;
3488	  to form an fma in the then block and sink the multiplication to the
3489	  else block.  */
3490       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3491	 return false;
3492
3493       if (!is_gimple_assign (use_stmt))
3494	 return false;
3495
3496       use_code = gimple_assign_rhs_code (use_stmt);
3497
3498       /* A negate on the multiplication leads to FNMA.
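	  That is, for t_1 = -(a * b); r_2 = t_1 + c; we can emit
	  r_2 = FMA_EXPR <-a, b, c>: the negation is moved onto the first
	  multiplicand further below, and both the multiplication and the
	  negate statement become dead.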
*/
3499       if (use_code == NEGATE_EXPR)
3500	 {
3501	   ssa_op_iter iter;
3502	   use_operand_p usep;
3503
3504	   result = gimple_assign_lhs (use_stmt);
3505
3506	   /* Make sure the negate statement becomes dead with this
3507	      single transformation.  */
3508	   if (!single_imm_use (gimple_assign_lhs (use_stmt),
3509				&use_p, &neguse_stmt))
3510	     return false;
3511
3512	   /* Make sure the multiplication isn't also used on that stmt.  */
3513	   FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3514	     if (USE_FROM_PTR (usep) == mul_result)
3515	       return false;
3516
3517	   /* Re-validate.  */
3518	   use_stmt = neguse_stmt;
3519	   if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3520	     return false;
3521	   if (!is_gimple_assign (use_stmt))
3522	     return false;
3523
3524	   use_code = gimple_assign_rhs_code (use_stmt);
3525	   negate_p = true;
3526	 }
3527
3528       switch (use_code)
3529	 {
3530	 case MINUS_EXPR:
3531	   if (gimple_assign_rhs2 (use_stmt) == result)
3532	     negate_p = !negate_p;
3533	   break;
3534	 case PLUS_EXPR:
3535	   break;
3536	 default:
3537	   /* FMA can only be formed from PLUS and MINUS.  */
3538	   return false;
3539	 }
3540
3541       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3542	  by a MULT_EXPR that we'll visit later, we might be able to
3543	  get a more profitable match with fnma.
3544	  OTOH, if we don't, a negate / fma pair has likely lower latency
3545	  than a mult / subtract pair.  */
3546       if (use_code == MINUS_EXPR && !negate_p
3547	   && gimple_assign_rhs1 (use_stmt) == result
3548	   && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3549	   && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3550	 {
3551	   tree rhs2 = gimple_assign_rhs2 (use_stmt);
3552
3553	   if (TREE_CODE (rhs2) == SSA_NAME)
3554	     {
3555	       gimple *stmt2 = SSA_NAME_DEF_STMT (rhs2);
3556	       if (has_single_use (rhs2)
3557		   && is_gimple_assign (stmt2)
3558		   && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3559		 return false;
3560	     }
3561	 }
3562
3563       /* We can't handle a * b + a * b.  */
3564       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3565	 return false;
3566
3567       /* While it is possible to validate whether or not the exact form
3568	  that we've recognized is available in the backend, the assumption
3569	  is that the transformation is never a loss.  For instance, suppose
3570	  the target only has the plain FMA pattern available.  Consider
3571	  a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3572	  is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3573	  still have 3 operations, but in the FMA form the two NEGs are
3574	  independent and could be run in parallel.
*/ 3575 } 3576 3577 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result) 3578 { 3579 gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt); 3580 enum tree_code use_code; 3581 tree addop, mulop1 = op1, result = mul_result; 3582 bool negate_p = false; 3583 3584 if (is_gimple_debug (use_stmt)) 3585 continue; 3586 3587 use_code = gimple_assign_rhs_code (use_stmt); 3588 if (use_code == NEGATE_EXPR) 3589 { 3590 result = gimple_assign_lhs (use_stmt); 3591 single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt); 3592 gsi_remove (&gsi, true); 3593 release_defs (use_stmt); 3594 3595 use_stmt = neguse_stmt; 3596 gsi = gsi_for_stmt (use_stmt); 3597 use_code = gimple_assign_rhs_code (use_stmt); 3598 negate_p = true; 3599 } 3600 3601 if (gimple_assign_rhs1 (use_stmt) == result) 3602 { 3603 addop = gimple_assign_rhs2 (use_stmt); 3604 /* a * b - c -> a * b + (-c) */ 3605 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 3606 addop = force_gimple_operand_gsi (&gsi, 3607 build1 (NEGATE_EXPR, 3608 type, addop), 3609 true, NULL_TREE, true, 3610 GSI_SAME_STMT); 3611 } 3612 else 3613 { 3614 addop = gimple_assign_rhs1 (use_stmt); 3615 /* a - b * c -> (-b) * c + a */ 3616 if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR) 3617 negate_p = !negate_p; 3618 } 3619 3620 if (negate_p) 3621 mulop1 = force_gimple_operand_gsi (&gsi, 3622 build1 (NEGATE_EXPR, 3623 type, mulop1), 3624 true, NULL_TREE, true, 3625 GSI_SAME_STMT); 3626 3627 fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt), 3628 FMA_EXPR, mulop1, op2, addop); 3629 gsi_replace (&gsi, fma_stmt, true); 3630 widen_mul_stats.fmas_inserted++; 3631 } 3632 3633 return true; 3634} 3635 3636 3637/* Helper function of match_uaddsub_overflow. Return 1 3638 if USE_STMT is unsigned overflow check ovf != 0 for 3639 STMT, -1 if USE_STMT is unsigned overflow check ovf == 0 3640 and 0 otherwise. */ 3641 3642static int 3643uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt) 3644{ 3645 enum tree_code ccode = ERROR_MARK; 3646 tree crhs1 = NULL_TREE, crhs2 = NULL_TREE; 3647 if (gimple_code (use_stmt) == GIMPLE_COND) 3648 { 3649 ccode = gimple_cond_code (use_stmt); 3650 crhs1 = gimple_cond_lhs (use_stmt); 3651 crhs2 = gimple_cond_rhs (use_stmt); 3652 } 3653 else if (is_gimple_assign (use_stmt)) 3654 { 3655 if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS) 3656 { 3657 ccode = gimple_assign_rhs_code (use_stmt); 3658 crhs1 = gimple_assign_rhs1 (use_stmt); 3659 crhs2 = gimple_assign_rhs2 (use_stmt); 3660 } 3661 else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR) 3662 { 3663 tree cond = gimple_assign_rhs1 (use_stmt); 3664 if (COMPARISON_CLASS_P (cond)) 3665 { 3666 ccode = TREE_CODE (cond); 3667 crhs1 = TREE_OPERAND (cond, 0); 3668 crhs2 = TREE_OPERAND (cond, 1); 3669 } 3670 else 3671 return 0; 3672 } 3673 else 3674 return 0; 3675 } 3676 else 3677 return 0; 3678 3679 if (TREE_CODE_CLASS (ccode) != tcc_comparison) 3680 return 0; 3681 3682 enum tree_code code = gimple_assign_rhs_code (stmt); 3683 tree lhs = gimple_assign_lhs (stmt); 3684 tree rhs1 = gimple_assign_rhs1 (stmt); 3685 tree rhs2 = gimple_assign_rhs2 (stmt); 3686 3687 switch (ccode) 3688 { 3689 case GT_EXPR: 3690 case LE_EXPR: 3691 /* r = a - b; r > a or r <= a 3692 r = a + b; a > r or a <= r or b > r or b <= r. */ 3693 if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1) 3694 || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2) 3695 && crhs2 == lhs)) 3696 return ccode == GT_EXPR ? 
/* Recognize for unsigned x
   x = y - z;
   if (x > y)
   where there are other uses of x and replace it with
   _7 = SUB_OVERFLOW (y, z);
   x = REALPART_EXPR <_7>;
   _8 = IMAGPART_EXPR <_7>;
   if (_8)
   and similarly for addition.  */

static bool
match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
                        enum tree_code code)
{
  tree lhs = gimple_assign_lhs (stmt);
  tree type = TREE_TYPE (lhs);
  use_operand_p use_p;
  imm_use_iterator iter;
  bool use_seen = false;
  bool ovf_use_seen = false;
  gimple *use_stmt;

  gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
  if (!INTEGRAL_TYPE_P (type)
      || !TYPE_UNSIGNED (type)
      || has_zero_uses (lhs)
      || has_single_use (lhs)
      || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
                        TYPE_MODE (type)) == CODE_FOR_nothing)
    return false;

  FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
    {
      use_stmt = USE_STMT (use_p);
      if (is_gimple_debug (use_stmt))
        continue;

      if (uaddsub_overflow_check_p (stmt, use_stmt))
        ovf_use_seen = true;
      else
        use_seen = true;
      if (ovf_use_seen && use_seen)
        break;
    }

  if (!ovf_use_seen || !use_seen)
    return false;

  tree ctype = build_complex_type (type);
  tree rhs1 = gimple_assign_rhs1 (stmt);
  tree rhs2 = gimple_assign_rhs2 (stmt);
  gcall *g = gimple_build_call_internal (code == PLUS_EXPR
                                         ? IFN_ADD_OVERFLOW
                                         : IFN_SUB_OVERFLOW,
                                         2, rhs1, rhs2);
  tree ctmp = make_ssa_name (ctype);
  gimple_call_set_lhs (g, ctmp);
  gsi_insert_before (gsi, g, GSI_SAME_STMT);
  gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
                                     build1 (REALPART_EXPR, type, ctmp));
  gsi_replace (gsi, g2, true);
  tree ovf = make_ssa_name (type);
  g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
                            build1 (IMAGPART_EXPR, type, ctmp));
  gsi_insert_after (gsi, g2, GSI_NEW_STMT);

  FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
    {
      if (is_gimple_debug (use_stmt))
        continue;

      int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
      if (ovf_use == 0)
        continue;
      if (gimple_code (use_stmt) == GIMPLE_COND)
        {
          gcond *cond_stmt = as_a <gcond *> (use_stmt);
          gimple_cond_set_lhs (cond_stmt, ovf);
          gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
          gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
        }
      else
        {
          gcc_checking_assert (is_gimple_assign (use_stmt));
          if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
            {
              gimple_assign_set_rhs1 (use_stmt, ovf);
              gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
              gimple_assign_set_rhs_code (use_stmt,
                                          ovf_use == 1 ? NE_EXPR : EQ_EXPR);
            }
          else
            {
              gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
                                   == COND_EXPR);
              tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
                                  boolean_type_node, ovf,
                                  build_int_cst (type, 0));
              gimple_assign_set_rhs1 (use_stmt, cond);
            }
        }
      update_stmt (use_stmt);
    }
  return true;
}
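
/* A target without a hardware divmod instruction can still satisfy the
   check below through a combined library routine such as ARM's
   __aeabi_idivmod, provided it also implements the
   targetm.expand_divmod_libfunc hook that extracts both results from
   such a call.  (An illustrative example, not an exhaustive list.)  */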
/* Return true if the target has support for divmod.  */

static bool
target_supports_divmod_p (optab divmod_optab, optab div_optab,
                          machine_mode mode)
{
  /* If the target supports a hardware divmod insn, use it for divmod.  */
  if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Check if a libfunc for divmod is available.  */
  rtx libfunc = optab_libfunc (divmod_optab, mode);
  if (libfunc != NULL_RTX)
    {
      /* If an optab_handler exists for div_optab, perhaps in a wider mode,
         we don't want to use the libfunc even if it exists for the given
         mode.  */
      for (machine_mode div_mode = mode;
           div_mode != VOIDmode;
           div_mode = GET_MODE_WIDER_MODE (div_mode))
        if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
          return false;

      return targetm.expand_divmod_libfunc != NULL;
    }

  return false;
}

/* Check if STMT is a candidate for the divmod transform.  */

static bool
divmod_candidate_p (gassign *stmt)
{
  tree type = TREE_TYPE (gimple_assign_lhs (stmt));
  machine_mode mode = TYPE_MODE (type);
  optab divmod_optab, div_optab;

  if (TYPE_UNSIGNED (type))
    {
      divmod_optab = udivmod_optab;
      div_optab = udiv_optab;
    }
  else
    {
      divmod_optab = sdivmod_optab;
      div_optab = sdiv_optab;
    }

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  /* Disable the transform if either operand is a constant, since
     division-by-constant may have a specialized expansion.  */
  if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
    return false;

  /* Exclude the case where TYPE_OVERFLOW_TRAPS (type), as that should
     expand using the [su]divv optabs.  */
  if (TYPE_OVERFLOW_TRAPS (type))
    return false;

  if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
    return false;

  return true;
}

/* This function looks for:
   t1 = a TRUNC_DIV_EXPR b;
   t2 = a TRUNC_MOD_EXPR b;
   and transforms it to the following sequence:
   complex_tmp = DIVMOD (a, b);
   t1 = REALPART_EXPR (complex_tmp);
   t2 = IMAGPART_EXPR (complex_tmp);
   For the conditions enabling the transform see divmod_candidate_p ().

   The function has three parts:
   1) Find top_stmt, the trunc_div or trunc_mod stmt that dominates all
      other trunc_div_expr and trunc_mod_expr stmts.
   2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by
      top_stmt to the stmts vector.
   3) Insert a DIVMOD call just before top_stmt and update the entries in
      the stmts vector to use the return value of DIVMOD (REALPART_EXPR
      for div, IMAGPART_EXPR for mod).  */

static bool
convert_to_divmod (gassign *stmt)
{
  if (stmt_can_throw_internal (stmt)
      || !divmod_candidate_p (stmt))
    return false;

  tree op1 = gimple_assign_rhs1 (stmt);
  tree op2 = gimple_assign_rhs2 (stmt);

  imm_use_iterator use_iter;
  gimple *use_stmt;
  auto_vec<gimple *> stmts;

  gimple *top_stmt = stmt;
  basic_block top_bb = gimple_bb (stmt);

  /* Part 1: Try to set top_stmt to the "topmost" stmt that dominates
     at least stmt and possibly other trunc_div/trunc_mod stmts having
     the same operands as stmt.  */
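
  /* For instance (an illustrative sketch), given

       bb2:  t2 = x / y;
       bb1:  t1 = x % y;

     where bb2 dominates bb1 and STMT is the modulus, the loop below
     moves top_stmt from t1 up to t2, so that the DIVMOD call created in
     part 3 is computed once at a point dominating both statements.  */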
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
    {
      if (is_gimple_assign (use_stmt)
          && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
              || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
          && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
          && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
        {
          if (stmt_can_throw_internal (use_stmt))
            continue;

          basic_block bb = gimple_bb (use_stmt);

          if (bb == top_bb)
            {
              if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
                top_stmt = use_stmt;
            }
          else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
            {
              top_bb = bb;
              top_stmt = use_stmt;
            }
        }
    }

  tree top_op1 = gimple_assign_rhs1 (top_stmt);
  tree top_op2 = gimple_assign_rhs2 (top_stmt);

  stmts.safe_push (top_stmt);
  bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);

  /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
     to the stmts vector.  This loop always adds stmt itself, since
     gimple_bb (top_stmt) dominates gimple_bb (stmt), so it ends up
     adding at least one trunc_mod_expr stmt.  */

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
    {
      if (is_gimple_assign (use_stmt)
          && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
              || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
          && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
          && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
        {
          if (use_stmt == top_stmt
              || stmt_can_throw_internal (use_stmt)
              || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt),
                                  top_bb))
            continue;

          stmts.safe_push (use_stmt);
          if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
            div_seen = true;
        }
    }

  if (!div_seen)
    return false;

  /* Part 3: Create a call to the internal fn DIVMOD:
     divmod_tmp = DIVMOD (op1, op2).  */

  gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
  tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
                                 call_stmt, "divmod_tmp");
  gimple_call_set_lhs (call_stmt, res);

  /* Insert the call before top_stmt.  */
  gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
  gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);

  widen_mul_stats.divmod_calls_inserted++;

  /* Update all statements in the stmts vector:
     lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
     lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */

  for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree new_rhs;

      switch (gimple_assign_rhs_code (use_stmt))
        {
        case TRUNC_DIV_EXPR:
          new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
          break;

        case TRUNC_MOD_EXPR:
          new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
          break;

        default:
          gcc_unreachable ();
        }

      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
      update_stmt (use_stmt);
    }

  return true;
}

/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */
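
/* For instance (a hypothetical fragment; assumes 16-bit short and
   32-bit int), the multiplication in

     int f (short a, short b) { return a * b; }

   is a MULT_EXPR on int whose operands are sign-extended from short;
   it is replaced by a WIDEN_MULT_EXPR taking the narrow operands
   directly, letting expansion use a widening multiply instruction
   where the target provides one.  */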
namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  basic_block bb;
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  renumber_gimple_stmt_uids ();

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
        {
          gimple *stmt = gsi_stmt (gsi);
          enum tree_code code;

          if (is_gimple_assign (stmt))
            {
              code = gimple_assign_rhs_code (stmt);
              switch (code)
                {
                case MULT_EXPR:
                  if (!convert_mult_to_widen (stmt, &gsi)
                      && convert_mult_to_fma (stmt,
                                              gimple_assign_rhs1 (stmt),
                                              gimple_assign_rhs2 (stmt)))
                    {
                      gsi_remove (&gsi, true);
                      release_defs (stmt);
                      continue;
                    }
                  break;

                case PLUS_EXPR:
                case MINUS_EXPR:
                  if (!convert_plusminus_to_widen (&gsi, stmt, code))
                    match_uaddsub_overflow (&gsi, stmt, code);
                  break;

                case TRUNC_MOD_EXPR:
                  convert_to_divmod (as_a <gassign *> (stmt));
                  break;

                default:;
                }
            }
          else if (is_gimple_call (stmt)
                   && gimple_call_lhs (stmt))
            {
              tree fndecl = gimple_call_fndecl (stmt);
              if (fndecl
                  && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
                {
                  switch (DECL_FUNCTION_CODE (fndecl))
                    {
                    case BUILT_IN_POWF:
                    case BUILT_IN_POW:
                    case BUILT_IN_POWL:
                      if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
                          && real_equal
                               (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
                                &dconst2)
                          && convert_mult_to_fma (stmt,
                                                  gimple_call_arg (stmt, 0),
                                                  gimple_call_arg (stmt, 0)))
                        {
                          unlink_stmt_vdef (stmt);
                          if (gsi_remove (&gsi, true)
                              && gimple_purge_dead_eh_edges (bb))
                            cfg_changed = true;
                          release_defs (stmt);
                          continue;
                        }
                      break;

                    default:;
                    }
                }
            }
          gsi_next (&gsi);
        }
    }

  statistics_counter_event (fun, "widening multiplications inserted",
                            widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
                            widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
                            widen_mul_stats.fmas_inserted);
  statistics_counter_event (fun, "divmod calls inserted",
                            widen_mul_stats.divmod_calls_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}