loop-unroll.c revision 146895
1/* Loop unrolling and peeling. 2 Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify it under 7the terms of the GNU General Public License as published by the Free 8Software Foundation; either version 2, or (at your option) any later 9version. 10 11GCC is distributed in the hope that it will be useful, but WITHOUT ANY 12WARRANTY; without even the implied warranty of MERCHANTABILITY or 13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14for more details. 15 16You should have received a copy of the GNU General Public License 17along with GCC; see the file COPYING. If not, write to the Free 18Software Foundation, 59 Temple Place - Suite 330, Boston, MA 1902111-1307, USA. */ 20 21#include "config.h" 22#include "system.h" 23#include "coretypes.h" 24#include "tm.h" 25#include "rtl.h" 26#include "hard-reg-set.h" 27#include "basic-block.h" 28#include "cfgloop.h" 29#include "cfglayout.h" 30#include "params.h" 31#include "output.h" 32#include "expr.h" 33/* We need to use the macro exact_log2. */ 34#include "toplev.h" 35 36/* This pass performs loop unrolling and peeling. We only perform these 37 optimizations on innermost loops (with single exception) because 38 the impact on performance is greatest here, and we want to avoid 39 unnecessary code size growth. The gain is caused by greater sequentiality 40 of code, better code to optimize for further passes and in some cases 41 by fewer testings of exit conditions. The main problem is code growth, 42 that impacts performance negatively due to effect of caches. 43 44 What we do: 45 46 -- complete peeling of once-rolling loops; this is the above mentioned 47 exception, as this causes loop to be cancelled completely and 48 does not cause code growth 49 -- complete peeling of loops that roll (small) constant times. 50 -- simple peeling of first iterations of loops that do not roll much 51 (according to profile feedback) 52 -- unrolling of loops that roll constant times; this is almost always 53 win, as we get rid of exit condition tests. 54 -- unrolling of loops that roll number of times that we can compute 55 in runtime; we also get rid of exit condition tests here, but there 56 is the extra expense for calculating the number of iterations 57 -- simple unrolling of remaining loops; this is performed only if we 58 are asked to, as the gain is questionable in this case and often 59 it may even slow down the code 60 For more detailed descriptions of each of those, see comments at 61 appropriate function below. 62 63 There is a lot of parameters (defined and described in params.def) that 64 control how much we unroll/peel. 65 66 ??? A great problem is that we don't have a good way how to determine 67 how many times we should unroll the loop; the experiments I have made 68 showed that this choice may affect performance in order of several %. 69 */ 70 71static void decide_unrolling_and_peeling (struct loops *, int); 72static void peel_loops_completely (struct loops *, int); 73static void decide_peel_simple (struct loop *, int); 74static void decide_peel_once_rolling (struct loop *, int); 75static void decide_peel_completely (struct loop *, int); 76static void decide_unroll_stupid (struct loop *, int); 77static void decide_unroll_constant_iterations (struct loop *, int); 78static void decide_unroll_runtime_iterations (struct loop *, int); 79static void peel_loop_simple (struct loops *, struct loop *); 80static void peel_loop_completely (struct loops *, struct loop *); 81static void unroll_loop_stupid (struct loops *, struct loop *); 82static void unroll_loop_constant_iterations (struct loops *, struct loop *); 83static void unroll_loop_runtime_iterations (struct loops *, struct loop *); 84static void expand_bct (edge, int); 85static bool discard_increment (struct loop *); 86 87/* Unroll and/or peel (depending on FLAGS) LOOPS. */ 88void 89unroll_and_peel_loops (struct loops *loops, int flags) 90{ 91 struct loop *loop, *next; 92 int check; 93 94 /* First perform complete loop peeling (it is almost surely a win, 95 and affects parameters for further decision a lot). */ 96 peel_loops_completely (loops, flags); 97 98 /* Now decide rest of unrolling and peeling. */ 99 decide_unrolling_and_peeling (loops, flags); 100 101 loop = loops->tree_root; 102 while (loop->inner) 103 loop = loop->inner; 104 105 /* Scan the loops, inner ones first. */ 106 while (loop != loops->tree_root) 107 { 108 if (loop->next) 109 { 110 next = loop->next; 111 while (next->inner) 112 next = next->inner; 113 } 114 else 115 next = loop->outer; 116 117 check = 1; 118 /* And perform the appropriate transformations. */ 119 switch (loop->lpt_decision.decision) 120 { 121 case LPT_PEEL_COMPLETELY: 122 /* Already done. */ 123 abort (); 124 case LPT_PEEL_SIMPLE: 125 peel_loop_simple (loops, loop); 126 break; 127 case LPT_UNROLL_CONSTANT: 128 unroll_loop_constant_iterations (loops, loop); 129 break; 130 case LPT_UNROLL_RUNTIME: 131 unroll_loop_runtime_iterations (loops, loop); 132 break; 133 case LPT_UNROLL_STUPID: 134 unroll_loop_stupid (loops, loop); 135 break; 136 case LPT_NONE: 137 check = 0; 138 break; 139 default: 140 abort (); 141 } 142 if (check) 143 { 144#ifdef ENABLE_CHECKING 145 verify_dominators (CDI_DOMINATORS); 146 verify_loop_structure (loops); 147#endif 148 } 149 loop = next; 150 } 151} 152 153/* Check whether to peel LOOPS (depending on FLAGS) completely and do so. */ 154static void 155peel_loops_completely (struct loops *loops, int flags) 156{ 157 struct loop *loop, *next; 158 159 loop = loops->tree_root; 160 while (loop->inner) 161 loop = loop->inner; 162 163 while (loop != loops->tree_root) 164 { 165 if (loop->next) 166 { 167 next = loop->next; 168 while (next->inner) 169 next = next->inner; 170 } 171 else 172 next = loop->outer; 173 174 loop->lpt_decision.decision = LPT_NONE; 175 loop->has_desc = 0; 176 177 if (rtl_dump_file) 178 fprintf (rtl_dump_file, ";; Considering loop %d for complete peeling\n", 179 loop->num); 180 181 loop->ninsns = num_loop_insns (loop); 182 183 decide_peel_once_rolling (loop, flags); 184 if (loop->lpt_decision.decision == LPT_NONE) 185 decide_peel_completely (loop, flags); 186 187 if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY) 188 { 189 peel_loop_completely (loops, loop); 190#ifdef ENABLE_CHECKING 191 verify_dominators (CDI_DOMINATORS); 192 verify_loop_structure (loops); 193#endif 194 } 195 loop = next; 196 } 197} 198 199/* Decide whether unroll or peel LOOPS (depending on FLAGS) and how much. */ 200static void 201decide_unrolling_and_peeling (struct loops *loops, int flags) 202{ 203 struct loop *loop = loops->tree_root, *next; 204 205 while (loop->inner) 206 loop = loop->inner; 207 208 /* Scan the loops, inner ones first. */ 209 while (loop != loops->tree_root) 210 { 211 if (loop->next) 212 { 213 next = loop->next; 214 while (next->inner) 215 next = next->inner; 216 } 217 else 218 next = loop->outer; 219 220 loop->lpt_decision.decision = LPT_NONE; 221 222 if (rtl_dump_file) 223 fprintf (rtl_dump_file, ";; Considering loop %d\n", loop->num); 224 225 /* Do not peel cold areas. */ 226 if (!maybe_hot_bb_p (loop->header)) 227 { 228 if (rtl_dump_file) 229 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n"); 230 loop = next; 231 continue; 232 } 233 234 /* Can the loop be manipulated? */ 235 if (!can_duplicate_loop_p (loop)) 236 { 237 if (rtl_dump_file) 238 fprintf (rtl_dump_file, 239 ";; Not considering loop, cannot duplicate\n"); 240 loop = next; 241 continue; 242 } 243 244 /* Skip non-innermost loops. */ 245 if (loop->inner) 246 { 247 if (rtl_dump_file) 248 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n"); 249 loop = next; 250 continue; 251 } 252 253 loop->ninsns = num_loop_insns (loop); 254 loop->av_ninsns = average_num_loop_insns (loop); 255 256 /* Try transformations one by one in decreasing order of 257 priority. */ 258 259 decide_unroll_constant_iterations (loop, flags); 260 if (loop->lpt_decision.decision == LPT_NONE) 261 decide_unroll_runtime_iterations (loop, flags); 262 if (loop->lpt_decision.decision == LPT_NONE) 263 decide_unroll_stupid (loop, flags); 264 if (loop->lpt_decision.decision == LPT_NONE) 265 decide_peel_simple (loop, flags); 266 267 loop = next; 268 } 269} 270 271/* Decide whether the LOOP is once rolling and suitable for complete 272 peeling. */ 273static void 274decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED) 275{ 276 if (rtl_dump_file) 277 fprintf (rtl_dump_file, ";; Considering peeling once rolling loop\n"); 278 279 /* Is the loop small enough? */ 280 if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns) 281 { 282 if (rtl_dump_file) 283 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 284 return; 285 } 286 287 /* Check for simple loops. */ 288 loop->simple = simple_loop_p (loop, &loop->desc); 289 loop->has_desc = 1; 290 291 /* Check number of iterations. */ 292 if (!loop->simple || !loop->desc.const_iter || loop->desc.niter != 0) 293 { 294 if (rtl_dump_file) 295 fprintf (rtl_dump_file, ";; Unable to prove that the loop rolls exactly once\n"); 296 return; 297 } 298 299 /* Success. */ 300 if (rtl_dump_file) 301 fprintf (rtl_dump_file, ";; Decided to peel exactly once rolling loop\n"); 302 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; 303} 304 305/* Decide whether the LOOP is suitable for complete peeling. */ 306static void 307decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED) 308{ 309 unsigned npeel; 310 311 if (rtl_dump_file) 312 fprintf (rtl_dump_file, ";; Considering peeling completely\n"); 313 314 /* Skip non-innermost loops. */ 315 if (loop->inner) 316 { 317 if (rtl_dump_file) 318 fprintf (rtl_dump_file, ";; Not considering loop, is not innermost\n"); 319 return; 320 } 321 322 /* Do not peel cold areas. */ 323 if (!maybe_hot_bb_p (loop->header)) 324 { 325 if (rtl_dump_file) 326 fprintf (rtl_dump_file, ";; Not considering loop, cold area\n"); 327 return; 328 } 329 330 /* Can the loop be manipulated? */ 331 if (!can_duplicate_loop_p (loop)) 332 { 333 if (rtl_dump_file) 334 fprintf (rtl_dump_file, 335 ";; Not considering loop, cannot duplicate\n"); 336 return; 337 } 338 339 /* npeel = number of iterations to peel. */ 340 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns; 341 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES)) 342 npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES); 343 344 /* Is the loop small enough? */ 345 if (!npeel) 346 { 347 if (rtl_dump_file) 348 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 349 return; 350 } 351 352 /* Check for simple loops. */ 353 if (!loop->has_desc) 354 { 355 loop->simple = simple_loop_p (loop, &loop->desc); 356 loop->has_desc = 1; 357 } 358 359 /* Check number of iterations. */ 360 if (!loop->simple || !loop->desc.const_iter) 361 { 362 if (rtl_dump_file) 363 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n"); 364 return; 365 } 366 367 if (loop->desc.niter > npeel - 1) 368 { 369 if (rtl_dump_file) 370 { 371 fprintf (rtl_dump_file, ";; Not peeling loop completely, rolls too much ("); 372 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC,(HOST_WIDEST_INT) loop->desc.niter); 373 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel); 374 } 375 return; 376 } 377 378 /* Success. */ 379 if (rtl_dump_file) 380 fprintf (rtl_dump_file, ";; Decided to peel loop completely\n"); 381 loop->lpt_decision.decision = LPT_PEEL_COMPLETELY; 382} 383 384/* Peel all iterations of LOOP, remove exit edges and cancel the loop 385 completely. The transformation done: 386 387 for (i = 0; i < 4; i++) 388 body; 389 390 ==> 391 392 i = 0; 393 body; i++; 394 body; i++; 395 body; i++; 396 body; i++; 397 */ 398static void 399peel_loop_completely (struct loops *loops, struct loop *loop) 400{ 401 sbitmap wont_exit; 402 unsigned HOST_WIDE_INT npeel; 403 unsigned n_remove_edges, i; 404 edge *remove_edges; 405 struct loop_desc *desc = &loop->desc; 406 bool discard_inc = false; 407 bool is_bct; 408 409 if ((is_bct = is_bct_cond (BB_END (loop->desc.out_edge->src)))) 410 discard_inc = discard_increment (loop); 411 412 npeel = desc->niter; 413 414 if (npeel) 415 { 416 wont_exit = sbitmap_alloc (npeel + 1); 417 sbitmap_ones (wont_exit); 418 RESET_BIT (wont_exit, 0); 419 if (desc->may_be_zero) 420 RESET_BIT (wont_exit, 1); 421 422 remove_edges = xcalloc (npeel, sizeof (edge)); 423 n_remove_edges = 0; 424 425 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 426 loops, npeel, 427 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 428 DLTHE_FLAG_UPDATE_FREQ)) 429 abort (); 430 431 free (wont_exit); 432 433 /* Expand the branch and count. */ 434 if (is_bct) 435 for (i = 0; i < n_remove_edges; i++) 436 expand_bct (remove_edges[i], discard_inc); 437 438 /* Remove the exit edges. */ 439 for (i = 0; i < n_remove_edges; i++) 440 remove_path (loops, remove_edges[i]); 441 free (remove_edges); 442 } 443 444 /* Expand the branch and count. */ 445 if (is_bct) 446 expand_bct (desc->in_edge, discard_inc); 447 448 /* Now remove the unreachable part of the last iteration and cancel 449 the loop. */ 450 remove_path (loops, desc->in_edge); 451 452 if (rtl_dump_file) 453 fprintf (rtl_dump_file, ";; Peeled loop completely, %d times\n", (int) npeel); 454} 455 456/* Decide whether to unroll LOOP iterating constant number of times and how much. */ 457static void 458decide_unroll_constant_iterations (struct loop *loop, int flags) 459{ 460 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = -1, n_copies, i; 461 462 if (!(flags & UAP_UNROLL)) 463 { 464 /* We were not asked to, just return back silently. */ 465 return; 466 } 467 468 if (rtl_dump_file) 469 fprintf (rtl_dump_file, ";; Considering unrolling loop with constant number of iterations\n"); 470 471 /* nunroll = total number of copies of the original loop body in 472 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ 473 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; 474 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; 475 if (nunroll > nunroll_by_av) 476 nunroll = nunroll_by_av; 477 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) 478 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); 479 480 /* Skip big loops. */ 481 if (nunroll <= 1) 482 { 483 if (rtl_dump_file) 484 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 485 return; 486 } 487 488 /* Check for simple loops. */ 489 if (!loop->has_desc) 490 { 491 loop->simple = simple_loop_p (loop, &loop->desc); 492 loop->has_desc = 1; 493 } 494 495 /* Check number of iterations. */ 496 if (!loop->simple || !loop->desc.const_iter) 497 { 498 if (rtl_dump_file) 499 fprintf (rtl_dump_file, ";; Unable to prove that the loop iterates constant times\n"); 500 return; 501 } 502 503 /* Check whether the loop rolls enough to consider. */ 504 if (loop->desc.niter < 2 * nunroll) 505 { 506 if (rtl_dump_file) 507 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); 508 return; 509 } 510 511 /* Success; now compute number of iterations to unroll. We alter 512 nunroll so that as few as possible copies of loop body are 513 necessary, while still not decreasing the number of unrollings 514 too much (at most by 1). */ 515 best_copies = 2 * nunroll + 10; 516 517 i = 2 * nunroll + 2; 518 if ((unsigned) i - 1 >= loop->desc.niter) 519 i = loop->desc.niter - 2; 520 521 for (; i >= nunroll - 1; i--) 522 { 523 unsigned exit_mod = loop->desc.niter % (i + 1); 524 525 if (loop->desc.postincr) 526 n_copies = exit_mod + i + 1; 527 else if (exit_mod != (unsigned) i || loop->desc.may_be_zero) 528 n_copies = exit_mod + i + 2; 529 else 530 n_copies = i + 1; 531 532 if (n_copies < best_copies) 533 { 534 best_copies = n_copies; 535 best_unroll = i; 536 } 537 } 538 539 if (rtl_dump_file) 540 fprintf (rtl_dump_file, ";; max_unroll %d (%d copies, initial %d).\n", 541 best_unroll + 1, best_copies, nunroll); 542 543 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT; 544 loop->lpt_decision.times = best_unroll; 545} 546 547/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES + 1 548 times. The transformation does this: 549 550 for (i = 0; i < 102; i++) 551 body; 552 553 ==> 554 555 i = 0; 556 body; i++; 557 body; i++; 558 while (i < 102) 559 { 560 body; i++; 561 body; i++; 562 body; i++; 563 body; i++; 564 } 565 */ 566static void 567unroll_loop_constant_iterations (struct loops *loops, struct loop *loop) 568{ 569 unsigned HOST_WIDE_INT niter; 570 unsigned exit_mod; 571 sbitmap wont_exit; 572 unsigned n_remove_edges, i; 573 edge *remove_edges; 574 unsigned max_unroll = loop->lpt_decision.times; 575 struct loop_desc *desc = &loop->desc; 576 bool discard_inc = false; 577 bool is_bct; 578 579 niter = desc->niter; 580 581 if (niter <= (unsigned) max_unroll + 1) 582 abort (); /* Should not get here (such loop should be peeled instead). */ 583 584 exit_mod = niter % (max_unroll + 1); 585 586 wont_exit = sbitmap_alloc (max_unroll + 1); 587 sbitmap_ones (wont_exit); 588 589 remove_edges = xcalloc (max_unroll + exit_mod + 1, sizeof (edge)); 590 n_remove_edges = 0; 591 592 /* For a loop ending with a branch and count for which the increment 593 of the count register will be discarded, adjust the initialization of 594 the count register. */ 595 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src))) 596 && (discard_inc = discard_increment (loop))) 597 { 598 rtx ini_var; 599 600 rtx init_code; 601 int n_peel, new_bct_value; 602 603 /* Get expression for number of iterations. */ 604 start_sequence (); 605 606 n_peel = (niter+1) % (max_unroll+1); 607 new_bct_value = (niter+1 - n_peel) / (max_unroll+1) ; 608 ini_var = GEN_INT (new_bct_value); 609 610 emit_move_insn (desc->var, ini_var); 611 init_code = get_insns (); 612 end_sequence (); 613 614 loop_split_edge_with (loop_preheader_edge (loop), init_code); 615 } 616 617 if (desc->postincr) 618 { 619 /* Counter is incremented after the exit test; leave exit test 620 in the first copy, so that the loops that start with test 621 of exit condition have continuous body after unrolling. */ 622 623 if (rtl_dump_file) 624 fprintf (rtl_dump_file, ";; Condition on beginning of loop.\n"); 625 626 /* Peel exit_mod iterations. */ 627 RESET_BIT (wont_exit, 0); 628 if (desc->may_be_zero) 629 RESET_BIT (wont_exit, 1); 630 631 if (exit_mod 632 && !duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 633 loops, exit_mod, 634 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 635 DLTHE_FLAG_UPDATE_FREQ)) 636 abort (); 637 638 SET_BIT (wont_exit, 1); 639 } 640 else 641 { 642 /* Leave exit test in last copy, for the same reason as above if 643 the loop tests the condition at the end of loop body. */ 644 645 if (rtl_dump_file) 646 fprintf (rtl_dump_file, ";; Condition on end of loop.\n"); 647 648 /* We know that niter >= max_unroll + 2; so we do not need to care of 649 case when we would exit before reaching the loop. So just peel 650 exit_mod + 1 iterations. 651 */ 652 if (exit_mod != (unsigned) max_unroll || desc->may_be_zero) 653 { 654 RESET_BIT (wont_exit, 0); 655 if (desc->may_be_zero) 656 RESET_BIT (wont_exit, 1); 657 658 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 659 loops, exit_mod + 1, 660 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 661 DLTHE_FLAG_UPDATE_FREQ)) 662 abort (); 663 664 SET_BIT (wont_exit, 0); 665 SET_BIT (wont_exit, 1); 666 } 667 668 RESET_BIT (wont_exit, max_unroll); 669 } 670 671 /* Now unroll the loop. */ 672 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), 673 loops, max_unroll, 674 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 675 DLTHE_FLAG_UPDATE_FREQ)) 676 abort (); 677 678 free (wont_exit); 679 680 /* Expand the branch and count. */ 681 if (is_bct) 682 for (i = 0; i < n_remove_edges; i++) 683 expand_bct (remove_edges[i], discard_inc); 684 685 /* Remove the edges. */ 686 for (i = 0; i < n_remove_edges; i++) 687 remove_path (loops, remove_edges[i]); 688 free (remove_edges); 689 690 if (rtl_dump_file) 691 fprintf (rtl_dump_file, ";; Unrolled loop %d times, constant # of iterations %i insns\n",max_unroll, num_loop_insns (loop)); 692} 693 694/* Decide whether to unroll LOOP iterating runtime computable number of times 695 and how much. */ 696static void 697decide_unroll_runtime_iterations (struct loop *loop, int flags) 698{ 699 unsigned nunroll, nunroll_by_av, i; 700 701 if (!(flags & UAP_UNROLL)) 702 { 703 /* We were not asked to, just return back silently. */ 704 return; 705 } 706 707 if (rtl_dump_file) 708 fprintf (rtl_dump_file, ";; Considering unrolling loop with runtime computable number of iterations\n"); 709 710 /* nunroll = total number of copies of the original loop body in 711 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ 712 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; 713 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; 714 if (nunroll > nunroll_by_av) 715 nunroll = nunroll_by_av; 716 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) 717 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); 718 719 /* Skip big loops. */ 720 if (nunroll <= 1) 721 { 722 if (rtl_dump_file) 723 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 724 return; 725 } 726 727 /* Check for simple loops. */ 728 if (!loop->has_desc) 729 { 730 loop->simple = simple_loop_p (loop, &loop->desc); 731 loop->has_desc = 1; 732 } 733 734 /* Check simpleness. */ 735 if (!loop->simple) 736 { 737 if (rtl_dump_file) 738 fprintf (rtl_dump_file, ";; Unable to prove that the number of iterations can be counted in runtime\n"); 739 return; 740 } 741 742 if (loop->desc.const_iter) 743 { 744 if (rtl_dump_file) 745 fprintf (rtl_dump_file, ";; Loop iterates constant times\n"); 746 return; 747 } 748 749 /* If we have profile feedback, check whether the loop rolls. */ 750 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll) 751 { 752 if (rtl_dump_file) 753 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); 754 return; 755 } 756 757 /* Success; now force nunroll to be power of 2, as we are unable to 758 cope with overflows in computation of number of iterations. */ 759 for (i = 1; 2 * i <= nunroll; i *= 2); 760 761 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME; 762 loop->lpt_decision.times = i - 1; 763} 764 765/* Unroll LOOP for that we are able to count number of iterations in runtime 766 LOOP->LPT_DECISION.TIMES + 1 times. The transformation does this (with some 767 extra care for case n < 0): 768 769 for (i = 0; i < n; i++) 770 body; 771 772 ==> 773 774 i = 0; 775 mod = n % 4; 776 777 switch (mod) 778 { 779 case 3: 780 body; i++; 781 case 2: 782 body; i++; 783 case 1: 784 body; i++; 785 case 0: ; 786 } 787 788 while (i < n) 789 { 790 body; i++; 791 body; i++; 792 body; i++; 793 body; i++; 794 } 795 */ 796static void 797unroll_loop_runtime_iterations (struct loops *loops, struct loop *loop) 798{ 799 rtx niter, init_code, branch_code, jump, label; 800 unsigned i, j, p; 801 basic_block preheader, *body, *dom_bbs, swtch, ezc_swtch; 802 unsigned n_dom_bbs; 803 sbitmap wont_exit; 804 int may_exit_copy; 805 unsigned n_peel, n_remove_edges; 806 edge *remove_edges, e; 807 bool extra_zero_check, last_may_exit; 808 unsigned max_unroll = loop->lpt_decision.times; 809 struct loop_desc *desc = &loop->desc; 810 bool discard_inc = false; 811 bool is_bct; 812 813 /* Remember blocks whose dominators will have to be updated. */ 814 dom_bbs = xcalloc (n_basic_blocks, sizeof (basic_block)); 815 n_dom_bbs = 0; 816 817 body = get_loop_body (loop); 818 for (i = 0; i < loop->num_nodes; i++) 819 { 820 unsigned nldom; 821 basic_block *ldom; 822 823 nldom = get_dominated_by (CDI_DOMINATORS, body[i], &ldom); 824 for (j = 0; j < nldom; j++) 825 if (!flow_bb_inside_loop_p (loop, ldom[j])) 826 dom_bbs[n_dom_bbs++] = ldom[j]; 827 828 free (ldom); 829 } 830 free (body); 831 832 if (desc->postincr) 833 { 834 /* Leave exit in first copy (for explanation why see comment in 835 unroll_loop_constant_iterations). */ 836 may_exit_copy = 0; 837 n_peel = max_unroll - 1; 838 extra_zero_check = true; 839 last_may_exit = false; 840 } 841 else 842 { 843 /* Leave exit in last copy (for explanation why see comment in 844 unroll_loop_constant_iterations). */ 845 may_exit_copy = max_unroll; 846 n_peel = max_unroll; 847 extra_zero_check = false; 848 last_may_exit = true; 849 } 850 851 /* Get expression for number of iterations. */ 852 start_sequence (); 853 niter = count_loop_iterations (desc, NULL, NULL); 854 if (!niter) 855 abort (); 856 niter = force_operand (niter, NULL); 857 858 /* Count modulo by ANDing it with max_unroll; we use the fact that 859 the number of unrollings is a power of two, and thus this is correct 860 even if there is overflow in the computation. */ 861 niter = expand_simple_binop (GET_MODE (desc->var), AND, 862 niter, 863 GEN_INT (max_unroll), 864 NULL_RTX, 0, OPTAB_LIB_WIDEN); 865 866 /* For a loop ending with a branch and count for which the increment 867 of the count register will be discarded, adjust the initialization of 868 the count register. */ 869 if ((is_bct = is_bct_cond (BB_END (desc->out_edge->src))) 870 && (discard_inc = discard_increment (loop))) 871 { 872 rtx count, count2, count_unroll_mod; 873 int count_unroll; 874 875 /* start_sequence (); */ 876 877 count = count_loop_iterations (desc, NULL, NULL); 878 879 count_unroll = loop->lpt_decision.times+1; 880 881 882 883 count_unroll_mod = GEN_INT (exact_log2 (count_unroll)); 884 count = expand_simple_binop (GET_MODE (desc->var), LSHIFTRT, 885 count, count_unroll_mod, 886 0, 0, OPTAB_LIB_WIDEN); 887 888 count2 = expand_simple_binop (GET_MODE (desc->var), PLUS, 889 count, GEN_INT (2), 890 0, 0, OPTAB_LIB_WIDEN); 891 892 emit_move_insn (desc->var, count2); 893 } 894 895 init_code = get_insns (); 896 end_sequence (); 897 898 /* Precondition the loop. */ 899 loop_split_edge_with (loop_preheader_edge (loop), init_code); 900 901 remove_edges = xcalloc (max_unroll + n_peel + 1, sizeof (edge)); 902 n_remove_edges = 0; 903 904 wont_exit = sbitmap_alloc (max_unroll + 2); 905 906 /* Peel the first copy of loop body (almost always we must leave exit test 907 here; the only exception is when we have extra zero check and the number 908 of iterations is reliable (i.e. comes out of NE condition). Also record 909 the place of (possible) extra zero check. */ 910 sbitmap_zero (wont_exit); 911 if (extra_zero_check && desc->cond == NE) 912 SET_BIT (wont_exit, 1); 913 ezc_swtch = loop_preheader_edge (loop)->src; 914 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 915 loops, 1, 916 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 917 DLTHE_FLAG_UPDATE_FREQ)) 918 abort (); 919 920 /* Record the place where switch will be built for preconditioning. */ 921 swtch = loop_split_edge_with (loop_preheader_edge (loop), 922 NULL_RTX); 923 924 for (i = 0; i < n_peel; i++) 925 { 926 /* Peel the copy. */ 927 sbitmap_zero (wont_exit); 928 if (i != n_peel - 1 || !last_may_exit) 929 SET_BIT (wont_exit, 1); 930 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 931 loops, 1, 932 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 933 DLTHE_FLAG_UPDATE_FREQ)) 934 abort (); 935 936 /* Create item for switch. */ 937 j = n_peel - i - (extra_zero_check ? 0 : 1); 938 p = REG_BR_PROB_BASE / (i + 2); 939 940 /* If modulo is zero do not jumo to the header of the unrolled loops. 941 Jump instead to the last branch and count that precedes it. */ 942 if (is_bct && discard_inc && (j == 0)) 943 { 944 basic_block lastbb = loop_preheader_edge(loop)->src; 945 rtx split_after; 946 947 /* Skip dummy basic blocks generated during the unrolling. */ 948 while (!is_bct_cond (BB_END (lastbb))) 949 lastbb = lastbb->pred->src; 950 951 split_after = PREV_INSN (BB_END (lastbb)); 952 953 preheader = split_loop_bb (lastbb , split_after)->dest; 954 } 955 else 956 preheader = loop_split_edge_with (loop_preheader_edge (loop), 957 NULL_RTX); 958 label = block_label (preheader); 959 start_sequence (); 960 do_compare_rtx_and_jump (copy_rtx (niter), GEN_INT (j), EQ, 0, 961 GET_MODE (desc->var), NULL_RTX, NULL_RTX, 962 label); 963 jump = get_last_insn (); 964 JUMP_LABEL (jump) = label; 965 REG_NOTES (jump) 966 = gen_rtx_EXPR_LIST (REG_BR_PROB, 967 GEN_INT (p), REG_NOTES (jump)); 968 969 LABEL_NUSES (label)++; 970 branch_code = get_insns (); 971 end_sequence (); 972 973 swtch = loop_split_edge_with (swtch->pred, branch_code); 974 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); 975 swtch->succ->probability = REG_BR_PROB_BASE - p; 976 e = make_edge (swtch, preheader, 977 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP); 978 e->probability = p; 979 } 980 981 if (extra_zero_check) 982 { 983 /* Add branch for zero iterations. */ 984 p = REG_BR_PROB_BASE / (max_unroll + 1); 985 swtch = ezc_swtch; 986 preheader = loop_split_edge_with (loop_preheader_edge (loop), NULL_RTX); 987 label = block_label (preheader); 988 start_sequence (); 989 do_compare_rtx_and_jump (copy_rtx (niter), const0_rtx, EQ, 0, 990 GET_MODE (desc->var), NULL_RTX, NULL_RTX, 991 label); 992 jump = get_last_insn (); 993 JUMP_LABEL (jump) = label; 994 REG_NOTES (jump) 995 = gen_rtx_EXPR_LIST (REG_BR_PROB, 996 GEN_INT (p), REG_NOTES (jump)); 997 998 LABEL_NUSES (label)++; 999 branch_code = get_insns (); 1000 end_sequence (); 1001 1002 swtch = loop_split_edge_with (swtch->succ, branch_code); 1003 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); 1004 swtch->succ->probability = REG_BR_PROB_BASE - p; 1005 e = make_edge (swtch, preheader, 1006 swtch->succ->flags & EDGE_IRREDUCIBLE_LOOP); 1007 e->probability = p; 1008 } 1009 1010 /* Recount dominators for outer blocks. */ 1011 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, n_dom_bbs); 1012 1013 /* And unroll loop. */ 1014 1015 sbitmap_ones (wont_exit); 1016 RESET_BIT (wont_exit, may_exit_copy); 1017 1018 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), 1019 loops, max_unroll, 1020 wont_exit, desc->out_edge, remove_edges, &n_remove_edges, 1021 DLTHE_FLAG_UPDATE_FREQ)) 1022 abort (); 1023 1024 free (wont_exit); 1025 1026 /* Expand the branch and count. */ 1027 if (is_bct) 1028 for (i = 0; i < n_remove_edges; i++) 1029 expand_bct (remove_edges[i], discard_inc); 1030 1031 /* Remove the edges. */ 1032 for (i = 0; i < n_remove_edges; i++) 1033 remove_path (loops, remove_edges[i]); 1034 free (remove_edges); 1035 1036 if (rtl_dump_file) 1037 fprintf (rtl_dump_file, 1038 ";; Unrolled loop %d times, counting # of iterations in runtime, %i insns\n", 1039 max_unroll, num_loop_insns (loop)); 1040} 1041 1042/* Decide whether to simply peel LOOP and how much. */ 1043static void 1044decide_peel_simple (struct loop *loop, int flags) 1045{ 1046 unsigned npeel; 1047 1048 if (!(flags & UAP_PEEL)) 1049 { 1050 /* We were not asked to, just return back silently. */ 1051 return; 1052 } 1053 1054 if (rtl_dump_file) 1055 fprintf (rtl_dump_file, ";; Considering simply peeling loop\n"); 1056 1057 /* npeel = number of iterations to peel. */ 1058 npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns; 1059 if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES)) 1060 npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES); 1061 1062 /* Skip big loops. */ 1063 if (!npeel) 1064 { 1065 if (rtl_dump_file) 1066 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 1067 return; 1068 } 1069 1070 /* Check for simple loops. */ 1071 if (!loop->has_desc) 1072 { 1073 loop->simple = simple_loop_p (loop, &loop->desc); 1074 loop->has_desc = 1; 1075 } 1076 1077 /* Check number of iterations. */ 1078 if (loop->simple && loop->desc.const_iter) 1079 { 1080 if (rtl_dump_file) 1081 fprintf (rtl_dump_file, ";; Loop iterates constant times\n"); 1082 return; 1083 } 1084 1085 /* Do not simply peel loops with branches inside -- it increases number 1086 of mispredicts. */ 1087 if (loop->desc.n_branches > 1) 1088 { 1089 if (rtl_dump_file) 1090 fprintf (rtl_dump_file, ";; Not peeling, contains branches\n"); 1091 return; 1092 } 1093 1094 if (loop->header->count) 1095 { 1096 unsigned niter = expected_loop_iterations (loop); 1097 if (niter + 1 > npeel) 1098 { 1099 if (rtl_dump_file) 1100 { 1101 fprintf (rtl_dump_file, ";; Not peeling loop, rolls too much ("); 1102 fprintf (rtl_dump_file, HOST_WIDEST_INT_PRINT_DEC, (HOST_WIDEST_INT) (niter + 1)); 1103 fprintf (rtl_dump_file, " iterations > %d [maximum peelings])\n", npeel); 1104 } 1105 return; 1106 } 1107 npeel = niter + 1; 1108 } 1109 else 1110 { 1111 /* For now we have no good heuristics to decide whether loop peeling 1112 will be effective, so disable it. */ 1113 if (rtl_dump_file) 1114 fprintf (rtl_dump_file, 1115 ";; Not peeling loop, no evidence it will be profitable\n"); 1116 return; 1117 } 1118 1119 /* Success. */ 1120 loop->lpt_decision.decision = LPT_PEEL_SIMPLE; 1121 loop->lpt_decision.times = npeel; 1122} 1123 1124/* Peel a LOOP LOOP->LPT_DECISION.TIMES times. The transformation: 1125 while (cond) 1126 body; 1127 1128 ==> 1129 1130 if (!cond) goto end; 1131 body; 1132 if (!cond) goto end; 1133 body; 1134 while (cond) 1135 body; 1136 end: ; 1137 */ 1138static void 1139peel_loop_simple (struct loops *loops, struct loop *loop) 1140{ 1141 sbitmap wont_exit; 1142 unsigned npeel = loop->lpt_decision.times; 1143 1144 wont_exit = sbitmap_alloc (npeel + 1); 1145 sbitmap_zero (wont_exit); 1146 1147 if (!duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), 1148 loops, npeel, wont_exit, NULL, NULL, NULL, 1149 DLTHE_FLAG_UPDATE_FREQ)) 1150 abort (); 1151 1152 free (wont_exit); 1153 1154 if (rtl_dump_file) 1155 fprintf (rtl_dump_file, ";; Peeling loop %d times\n", npeel); 1156} 1157 1158/* Decide whether to unroll LOOP stupidly and how much. */ 1159static void 1160decide_unroll_stupid (struct loop *loop, int flags) 1161{ 1162 unsigned nunroll, nunroll_by_av, i; 1163 1164 if (!(flags & UAP_UNROLL_ALL)) 1165 { 1166 /* We were not asked to, just return back silently. */ 1167 return; 1168 } 1169 1170 if (rtl_dump_file) 1171 fprintf (rtl_dump_file, ";; Considering unrolling loop stupidly\n"); 1172 1173 /* nunroll = total number of copies of the original loop body in 1174 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ 1175 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; 1176 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; 1177 if (nunroll > nunroll_by_av) 1178 nunroll = nunroll_by_av; 1179 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) 1180 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); 1181 1182 /* Skip big loops. */ 1183 if (nunroll <= 1) 1184 { 1185 if (rtl_dump_file) 1186 fprintf (rtl_dump_file, ";; Not considering loop, is too big\n"); 1187 return; 1188 } 1189 1190 /* Check for simple loops. */ 1191 if (!loop->has_desc) 1192 { 1193 loop->simple = simple_loop_p (loop, &loop->desc); 1194 loop->has_desc = 1; 1195 } 1196 1197 /* Check simpleness. */ 1198 if (loop->simple) 1199 { 1200 if (rtl_dump_file) 1201 fprintf (rtl_dump_file, ";; The loop is simple\n"); 1202 return; 1203 } 1204 1205 /* Do not unroll loops with branches inside -- it increases number 1206 of mispredicts. */ 1207 if (loop->desc.n_branches > 1) 1208 { 1209 if (rtl_dump_file) 1210 fprintf (rtl_dump_file, ";; Not unrolling, contains branches\n"); 1211 return; 1212 } 1213 1214 /* If we have profile feedback, check whether the loop rolls. */ 1215 if (loop->header->count && expected_loop_iterations (loop) < 2 * nunroll) 1216 { 1217 if (rtl_dump_file) 1218 fprintf (rtl_dump_file, ";; Not unrolling loop, doesn't roll\n"); 1219 return; 1220 } 1221 1222 /* Success. Now force nunroll to be power of 2, as it seems that this 1223 improves results (partially because of better alignments, partially 1224 because of some dark magic). */ 1225 for (i = 1; 2 * i <= nunroll; i *= 2); 1226 1227 loop->lpt_decision.decision = LPT_UNROLL_STUPID; 1228 loop->lpt_decision.times = i - 1; 1229} 1230 1231/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation: 1232 while (cond) 1233 body; 1234 1235 ==> 1236 1237 while (cond) 1238 { 1239 body; 1240 if (!cond) break; 1241 body; 1242 if (!cond) break; 1243 body; 1244 if (!cond) break; 1245 body; 1246 } 1247 */ 1248static void 1249unroll_loop_stupid (struct loops *loops, struct loop *loop) 1250{ 1251 sbitmap wont_exit; 1252 unsigned nunroll = loop->lpt_decision.times; 1253 1254 wont_exit = sbitmap_alloc (nunroll + 1); 1255 sbitmap_zero (wont_exit); 1256 1257 if (!duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), 1258 loops, nunroll, wont_exit, NULL, NULL, NULL, 1259 DLTHE_FLAG_UPDATE_FREQ)) 1260 abort (); 1261 1262 free (wont_exit); 1263 1264 if (rtl_dump_file) 1265 fprintf (rtl_dump_file, ";; Unrolled loop %d times, %i insns\n", 1266 nunroll, num_loop_insns (loop)); 1267} 1268 1269/* Expand a bct instruction in a branch and an increment. 1270 If flag_inc is set, the induction variable does not need to be 1271 incremented. */ 1272 1273static void 1274expand_bct (edge e, int flag_inc) 1275{ 1276 rtx bct_insn = BB_END (e->src); 1277 rtx cmp; 1278 rtx inc; 1279 rtx seq; 1280 1281 rtx tgt; 1282 rtx condition; 1283 rtx labelref; 1284 rtx reg; 1285 rtx pattern = PATTERN (bct_insn); 1286 1287 if (!is_bct_cond (bct_insn)) 1288 return; 1289 1290 inc = get_var_set_from_bct (bct_insn); 1291 cmp = XVECEXP (pattern, 0, 0); 1292 reg = SET_DEST (inc); 1293 1294 start_sequence (); 1295 if (!flag_inc) 1296 { 1297 tgt = force_operand (XEXP (inc, 1), XEXP (inc, 0)); 1298 if (tgt != XEXP (inc, 0)) 1299 emit_move_insn (XEXP (inc, 0), tgt); 1300 } 1301 1302 condition = XEXP (SET_SRC (cmp), 0); 1303 labelref = XEXP (SET_SRC (cmp), 1); 1304 1305 do_compare_rtx_and_jump (copy_rtx (reg), XEXP (condition, 1), 1306 GET_CODE (condition), 0, 1307 GET_MODE (reg), NULL_RTX, NULL_RTX, 1308 XEXP (labelref, 0)); 1309 seq = get_insns (); 1310 end_sequence (); 1311 emit_insn_after (seq, bct_insn); 1312 1313 delete_insn (bct_insn); 1314 1315 return; 1316} 1317 1318/* Check that the increment of the count register can be discarded. */ 1319bool 1320discard_increment (struct loop *loop) 1321{ 1322 struct loop_desc *desc = &loop->desc; 1323 rtx inc, set_src, reg; 1324 rtx bct_insn; 1325 unsigned int i; 1326 basic_block *body; 1327 1328 bct_insn = BB_END (desc->out_edge->src); 1329 if (!is_bct_cond (bct_insn)) 1330 abort(); 1331 1332 inc = get_var_set_from_bct (bct_insn); 1333 1334 /* Check that inc is of the form reg = reg - 1. */ 1335 reg = SET_DEST (inc); 1336 set_src = SET_SRC (inc); 1337 1338 if (GET_CODE (set_src) != PLUS) 1339 return false; 1340 1341 if (!rtx_equal_p (XEXP (set_src, 0), reg)) 1342 return false; 1343 1344 if (!CONSTANT_P (XEXP (set_src, 1))) 1345 return false; 1346 1347 if (INTVAL (XEXP (set_src, 1)) != -1) 1348 return false; 1349 1350 /* We need to check that the register has no other uses beside the branch and 1351 count. */ 1352 body = get_loop_body (loop); 1353 for(i=0; i < loop->num_nodes; i++) 1354 { 1355 if (reg_mentioned_p (desc->var, BB_HEAD (body[i]))) 1356 return false; 1357 1358 if (body[i] != desc->out_edge->src) 1359 if (reg_mentioned_p (desc->var, BB_END (body[i]))) 1360 return false; 1361 1362 if (reg_used_between_p (desc->var, BB_HEAD (body[i]), BB_END (body[i]))) 1363 return false; 1364 } 1365 1366 /* Check that the branch and count ends the latch. */ 1367 if (desc->out_edge->src != loop->latch) 1368 { 1369 rtx insn; 1370 1371 /* Latch is a dummy block generated by loop-init. */ 1372 if (BRANCH_EDGE(desc->out_edge->src)->dest != loop->latch) 1373 return false; 1374 1375 for (insn = BB_HEAD (loop->latch); insn != NEXT_INSN (BB_END (loop->latch)); 1376 insn = NEXT_INSN (insn)) 1377 if (INSN_P (insn)) return false; 1378 } 1379 1380 return true; 1381} 1382 1383