1/* Loop distribution. 2 Copyright (C) 2006, 2007, 2008, 2009, 2010 3 Free Software Foundation, Inc. 4 Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr> 5 and Sebastian Pop <sebastian.pop@amd.com>. 6 7This file is part of GCC. 8 9GCC is free software; you can redistribute it and/or modify it 10under the terms of the GNU General Public License as published by the 11Free Software Foundation; either version 3, or (at your option) any 12later version. 13 14GCC is distributed in the hope that it will be useful, but WITHOUT 15ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 16FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 17for more details. 18 19You should have received a copy of the GNU General Public License 20along with GCC; see the file COPYING3. If not see 21<http://www.gnu.org/licenses/>. */ 22 23/* This pass performs loop distribution: for example, the loop 24 25 |DO I = 2, N 26 | A(I) = B(I) + C 27 | D(I) = A(I-1)*E 28 |ENDDO 29 30 is transformed to 31 32 |DOALL I = 2, N 33 | A(I) = B(I) + C 34 |ENDDO 35 | 36 |DOALL I = 2, N 37 | D(I) = A(I-1)*E 38 |ENDDO 39 40 This pass uses an RDG, Reduced Dependence Graph built on top of the 41 data dependence relations. The RDG is then topologically sorted to 42 obtain a map of information producers/consumers based on which it 43 generates the new loops. 
*/

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"

#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"
#include "lambda.h"
#include "langhooks.h"
#include "tree-vectorizer.h"

/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm for avoiding to
   include several times the same stores in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;

/* Update the PHI nodes of NEW_LOOP.  NEW_LOOP is a duplicate of
   ORIG_LOOP.  The PHIs of NEW_LOOP were created without arguments by
   the loop duplication machinery; this walks the two loop headers in
   lockstep and fills in the entry and latch arguments of each new
   PHI from the corresponding original PHI.  */

static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
  tree new_ssa_name;
  gimple_stmt_iterator si_new, si_orig;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);

  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */
  for (si_new = gsi_start_phis (new_loop->header),
       si_orig = gsi_start_phis (orig_loop->header);
       !gsi_end_p (si_new) && !gsi_end_p (si_orig);
       gsi_next (&si_new), gsi_next (&si_orig))
    {
      tree def;
      source_location locus;
      gimple phi_new = gsi_stmt (si_new);
      gimple phi_orig = gsi_stmt (si_orig);

      /* Add the first phi argument for the phi in NEW_LOOP (the one
	 associated with the entry of NEW_LOOP)  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_entry_e);
      add_phi_arg (phi_new, def, new_loop_entry_e, locus);

      /* Add the second phi argument for the phi in NEW_LOOP (the one
	 associated with the latch of NEW_LOOP)  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_loop_latch);

      if (TREE_CODE (def) == SSA_NAME)
	{
	  /* Map the original SSA name to its copy in the new loop,
	     as recorded by the loop duplication tables.  */
	  new_ssa_name = get_current_def (def);

	  if (!new_ssa_name)
	    /* This only happens if there are no definitions inside the
	       loop.  Use the invariant in the new loop as is.  */
	    new_ssa_name = def;
	}
      else
	/* Could be an integer.  */
	new_ssa_name = def;

      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop), locus);
    }
}

/* Return a copy of LOOP placed before LOOP, or NULL when LOOP has
   more than one exit or when the duplication failed.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  if (!single_exit (loop))
    return NULL;

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  free_original_copy_tables ();

  if (!res)
    return NULL;

  /* The duplicated PHIs were created without arguments; fill them in
     and rename the SSA names defined in the copy.  */
  update_phis_for_loop_copy (loop, res);
  rename_variables_in_loop (res);

  return res;
}

/* Creates an empty basic block after LOOP.
*/

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  /* Nothing to split when the loop has no single exit.  */
  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  Returns true when
   the code gen succeeded.  */

static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      /* Work on a fresh copy placed before LOOP; LOOP itself keeps
	 the statements for the remaining partitions.  */
      loop = copy_loop_before (loop);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  /* copy_loop_before can fail (multiple exits, duplication failure).  */
  if (loop == NULL)
    return false;

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  The index X must advance in lockstep with that
     numbering so that bit X corresponds to the right statement.  */
  bbs = get_loop_body_in_dom_order (loop);

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	if (!bitmap_bit_p (partition, x++))
	  {
	    gimple phi = gsi_stmt (bsi);
	    /* Virtual PHI results need renaming before removal so
	       the virtual SSA web stays consistent.  */
	    if (!is_gimple_reg (gimple_phi_result (phi)))
	      mark_virtual_phi_result_for_renaming (phi);
	    remove_phi_node (&bsi, true);
	  }
	else
	  gsi_next (&bsi);

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple stmt = gsi_stmt (bsi);
	  /* Labels are not numbered by stmts_from_loop, hence X is
	     not incremented for them, and they are never removed.  */
	  if (gimple_code (gsi_stmt (bsi)) != GIMPLE_LABEL
	      && !bitmap_bit_p (partition, x++))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&bsi, true);
	      release_defs (stmt);
	    }
	  else
	    gsi_next (&bsi);
	}
    }

  free (bbs);
  return true;
}

/* Build the size argument for a memset call.
*/

static inline tree
build_size_arg_loc (location_t loc, tree nb_iter, tree op,
		    gimple_seq *stmt_list)
{
  gimple_seq stmts;
  tree x;

  /* Size in bytes = number of iterations * size of one element.  */
  x = fold_build2_loc (loc, MULT_EXPR, size_type_node,
		       fold_convert_loc (loc, size_type_node, nb_iter),
		       fold_convert_loc (loc, size_type_node,
					 TYPE_SIZE_UNIT (TREE_TYPE (op))));
  x = force_gimple_operand (x, &stmts, true, NULL);
  gimple_seq_add_seq (stmt_list, stmts);

  return x;
}

/* Generate a call to memset zeroing the memory written to by STMT:
   OP0 is the store reference, NB_ITER the number of iterations of the
   zeroing loop.  The call is inserted after BSI.  The caller must
   already have checked that the store pattern is suitable
   (stmt_with_adjacent_zero_store_dr_p).  */

static void
generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
		      gimple_stmt_iterator bsi)
{
  tree addr_base, nb_bytes;
  bool res = false;
  gimple_seq stmt_list = NULL, stmts;
  gimple fn_call;
  tree mem, fn;
  struct data_reference *dr = XCNEW (struct data_reference);
  location_t loc = gimple_location (stmt);

  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  res = dr_analyze_innermost (dr);
  /* The caller guarantees a unit-stride store, so the analysis
     cannot fail here.  */
  gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));

  nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  In
     that case the loop walks downwards, so the lowest address is the
     starting address minus the total size plus one element.  */
  if (integer_zerop (size_binop (PLUS_EXPR,
				 TYPE_SIZE_UNIT (TREE_TYPE (op0)),
				 fold_convert (sizetype, DR_STEP (dr)))))
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (op0)));
    }

  addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
			       TREE_TYPE (DR_BASE_ADDRESS (dr)),
			       DR_BASE_ADDRESS (dr), addr_base);
  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  gimple_seq_add_seq (&stmt_list, stmts);

  fn = build_fold_addr_expr (implicit_built_in_decls [BUILT_IN_MEMSET]);
  fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
  gimple_seq_add_stmt (&stmt_list, fn_call);
  gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

  free_data_ref (dr);
}

/* Tries to generate a builtin function for the instructions of LOOP
   pointed to by the bits set in PARTITION.  Returns true when the
   operation succeeded.
*/ 306 307static bool 308generate_builtin (struct loop *loop, bitmap partition, bool copy_p) 309{ 310 bool res = false; 311 unsigned i, x = 0; 312 basic_block *bbs; 313 gimple write = NULL; 314 gimple_stmt_iterator bsi; 315 tree nb_iter = number_of_exit_cond_executions (loop); 316 317 if (!nb_iter || nb_iter == chrec_dont_know) 318 return false; 319 320 bbs = get_loop_body_in_dom_order (loop); 321 322 for (i = 0; i < loop->num_nodes; i++) 323 { 324 basic_block bb = bbs[i]; 325 326 for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi)) 327 x++; 328 329 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) 330 { 331 gimple stmt = gsi_stmt (bsi); 332 333 if (bitmap_bit_p (partition, x++) 334 && is_gimple_assign (stmt) 335 && !is_gimple_reg (gimple_assign_lhs (stmt))) 336 { 337 /* Don't generate the builtins when there are more than 338 one memory write. */ 339 if (write != NULL) 340 goto end; 341 342 write = stmt; 343 if (bb == loop->latch) 344 nb_iter = number_of_latch_executions (loop); 345 } 346 } 347 } 348 349 if (!stmt_with_adjacent_zero_store_dr_p (write)) 350 goto end; 351 352 /* The new statements will be placed before LOOP. */ 353 bsi = gsi_last_bb (loop_preheader_edge (loop)->src); 354 generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi); 355 res = true; 356 357 /* If this is the last partition for which we generate code, we have 358 to destroy the loop. 
*/ 359 if (!copy_p) 360 { 361 unsigned nbbs = loop->num_nodes; 362 edge exit = single_exit (loop); 363 basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest; 364 redirect_edge_pred (exit, src); 365 exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE); 366 exit->flags |= EDGE_FALLTHRU; 367 cancel_loop_tree (loop); 368 rescan_loop_exit (exit, false, true); 369 370 for (i = 0; i < nbbs; i++) 371 delete_basic_block (bbs[i]); 372 373 set_immediate_dominator (CDI_DOMINATORS, dest, 374 recompute_dominator (CDI_DOMINATORS, dest)); 375 } 376 377 end: 378 free (bbs); 379 return res; 380} 381 382/* Generates code for PARTITION. For simple loops, this function can 383 generate a built-in. */ 384 385static bool 386generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p) 387{ 388 if (generate_builtin (loop, partition, copy_p)) 389 return true; 390 391 return generate_loops_for_partition (loop, partition, copy_p); 392} 393 394 395/* Returns true if the node V of RDG cannot be recomputed. */ 396 397static bool 398rdg_cannot_recompute_vertex_p (struct graph *rdg, int v) 399{ 400 if (RDG_MEM_WRITE_STMT (rdg, v)) 401 return true; 402 403 return false; 404} 405 406/* Returns true when the vertex V has already been generated in the 407 current partition (V is in PROCESSED), or when V belongs to another 408 partition and cannot be recomputed (V is not in REMAINING_STMTS). */ 409 410static inline bool 411already_processed_vertex_p (bitmap processed, int v) 412{ 413 return (bitmap_bit_p (processed, v) 414 || !bitmap_bit_p (remaining_stmts, v)); 415} 416 417/* Returns NULL when there is no anti-dependence among the successors 418 of vertex V, otherwise returns the edge with the anti-dep. 
*/ 419 420static struct graph_edge * 421has_anti_dependence (struct vertex *v) 422{ 423 struct graph_edge *e; 424 425 if (v->succ) 426 for (e = v->succ; e; e = e->succ_next) 427 if (RDGE_TYPE (e) == anti_dd) 428 return e; 429 430 return NULL; 431} 432 433/* Returns true when V has an anti-dependence edge among its successors. */ 434 435static bool 436predecessor_has_mem_write (struct graph *rdg, struct vertex *v) 437{ 438 struct graph_edge *e; 439 440 if (v->pred) 441 for (e = v->pred; e; e = e->pred_next) 442 if (bitmap_bit_p (upstream_mem_writes, e->src) 443 /* Don't consider flow channels: a write to memory followed 444 by a read from memory. These channels allow the split of 445 the RDG in different partitions. */ 446 && !RDG_MEM_WRITE_STMT (rdg, e->src)) 447 return true; 448 449 return false; 450} 451 452/* Initializes the upstream_mem_writes bitmap following the 453 information from RDG. */ 454 455static void 456mark_nodes_having_upstream_mem_writes (struct graph *rdg) 457{ 458 int v, x; 459 bitmap seen = BITMAP_ALLOC (NULL); 460 461 for (v = rdg->n_vertices - 1; v >= 0; v--) 462 if (!bitmap_bit_p (seen, v)) 463 { 464 unsigned i; 465 VEC (int, heap) *nodes = VEC_alloc (int, heap, 3); 466 467 graphds_dfs (rdg, &v, 1, &nodes, false, NULL); 468 469 for (i = 0; VEC_iterate (int, nodes, i, x); i++) 470 { 471 if (bitmap_bit_p (seen, x)) 472 continue; 473 474 bitmap_set_bit (seen, x); 475 476 if (RDG_MEM_WRITE_STMT (rdg, x) 477 || predecessor_has_mem_write (rdg, &(rdg->vertices[x])) 478 /* In anti dependences the read should occur before 479 the write, this is why both the read and the write 480 should be placed in the same partition. */ 481 || has_anti_dependence (&(rdg->vertices[x]))) 482 { 483 bitmap_set_bit (upstream_mem_writes, x); 484 } 485 } 486 487 VEC_free (int, heap, nodes); 488 } 489} 490 491/* Returns true when vertex u has a memory write node as a predecessor 492 in RDG. 
*/

static bool
has_upstream_mem_writes (int u)
{
  return bitmap_bit_p (upstream_mem_writes, u);
}

static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
					   bitmap, bool *);

/* Flag the uses of U stopping following the information from
   upstream_mem_writes.  Pulls into the current PARTITION: the
   destination of an anti-dependence of U, the definition of U's
   virtual use, and (when U has an upstream memory write) the users
   of U's scalar result.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
	       bitmap processed, bool *part_has_writes)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
				       processed, part_has_writes);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      /* Follow the virtual use back to its defining statement so the
	 memory state producer lands in the same partition.  */
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
	{
	  tree use = USE_FROM_PTR (use_p);

	  if (TREE_CODE (use) == SSA_NAME)
	    {
	      gimple def_stmt = SSA_NAME_DEF_STMT (use);
	      int v = rdg_vertex_for_stmt (rdg, def_stmt);

	      if (v >= 0
		  && !already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
	 between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
	{
	  use_operand_p use_p;
	  imm_use_iterator iter;

	  FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
	    {
	      int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

	      if (!already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }
}

/* Flag V from RDG as part of PARTITION, and also flag its loop number
   in LOOPS.  */

static void
rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
		 bool *part_has_writes)
{
  struct loop *loop;

  if (bitmap_bit_p (partition, v))
    return;

  loop = loop_containing_stmt (RDG_STMT (rdg, v));
  bitmap_set_bit (loops, loop->num);
  bitmap_set_bit (partition, v);

  /* A memory write cannot be recomputed in another partition: claim
     it for this one and drop it from the remaining statements.  */
  if (rdg_cannot_recompute_vertex_p (rdg, v))
    {
      *part_has_writes = true;
      bitmap_clear_bit (remaining_stmts, v);
    }
}

/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
			       bitmap loops, bitmap processed,
			       bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  /* Backward DFS restricted to REMAINING_STMTS collects the
     predecessors V depends on.  */
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  for (i = 0; VEC_iterate (int, nodes, i, x); i++)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
				     part_has_writes);

  VEC_free (int, heap, nodes);
}

/* Initialize CONDS with all the condition statements from the basic
   blocks of LOOP.
*/

static void
collect_condition_stmts (struct loop *loop, VEC (gimple, heap) **conds)
{
  unsigned i;
  edge e;
  VEC (edge, heap) *exits = get_loop_exit_edges (loop);

  /* The last statement of each exit's source block is the exit
     condition.  */
  for (i = 0; VEC_iterate (edge, exits, i, e); i++)
    {
      gimple cond = last_stmt (e->src);

      if (cond)
	VEC_safe_push (gimple, heap, *conds, cond);
    }

  VEC_free (edge, heap, exits);
}

/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  Flagging a condition's dependences may drag new loops into
   the partition, whose exit conditions must in turn be flagged: the
   worklist CONDS iterates until that closure is reached.  */

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
		     bitmap processed, bool *part_has_writes)
{
  unsigned i;
  bitmap_iterator bi;
  VEC (gimple, heap) *conds = VEC_alloc (gimple, heap, 3);

  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!VEC_empty (gimple, conds))
    {
      gimple cond = VEC_pop (gimple, conds);
      int v = rdg_vertex_for_stmt (rdg, cond);
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
				       part_has_writes);

      /* Any loop newly touched by the condition's dependences
	 contributes its own exit conditions to the worklist.  */
      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
	if (!bitmap_bit_p (loops, i))
	  {
	    bitmap_set_bit (loops, i);
	    collect_condition_stmts (get_loop (i), &conds);
	  }

      BITMAP_FREE (new_loops);
    }
}

/* Returns a bitmap in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.
*/ 677 678static bitmap 679build_rdg_partition_for_component (struct graph *rdg, rdgc c, 680 bool *part_has_writes) 681{ 682 int i, v; 683 bitmap partition = BITMAP_ALLOC (NULL); 684 bitmap loops = BITMAP_ALLOC (NULL); 685 bitmap processed = BITMAP_ALLOC (NULL); 686 687 for (i = 0; VEC_iterate (int, c->vertices, i, v); i++) 688 if (!already_processed_vertex_p (processed, v)) 689 rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed, 690 part_has_writes); 691 692 rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes); 693 694 BITMAP_FREE (processed); 695 BITMAP_FREE (loops); 696 return partition; 697} 698 699/* Free memory for COMPONENTS. */ 700 701static void 702free_rdg_components (VEC (rdgc, heap) *components) 703{ 704 int i; 705 rdgc x; 706 707 for (i = 0; VEC_iterate (rdgc, components, i, x); i++) 708 { 709 VEC_free (int, heap, x->vertices); 710 free (x); 711 } 712} 713 714/* Build the COMPONENTS vector with the strongly connected components 715 of RDG in which the STARTING_VERTICES occur. 
*/

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
		      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  /* Bucket every vertex into the SCC it belongs to.  */
  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  /* Keep only the components containing a starting vertex, each one
     exactly once (SAVED_COMPONENTS guards against duplicates).  */
  for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
    {
      int c = rdg->vertices[v].component;

      if (!bitmap_bit_p (saved_components, c))
	{
	  rdgc x = XCNEW (struct rdg_component);
	  x->num = c;
	  x->vertices = all_components[c];

	  VEC_safe_push (rdgc, heap, *components, x);
	  bitmap_set_bit (saved_components, c);
	}
    }

  /* Vertex vectors of kept components are now owned by *COMPONENTS;
     free only the unused ones.  */
  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}

/* Returns true when it is possible to generate a builtin pattern for
   the PARTITION of RDG.  For the moment we detect only the memset
   zero pattern.  */

static bool
can_generate_builtin (struct graph *rdg, bitmap partition)
{
  unsigned i;
  bitmap_iterator bi;
  int nb_reads = 0;
  int nb_writes = 0;
  int stores_zero = 0;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi)
    if (RDG_MEM_READS_STMT (rdg, i))
      nb_reads++;
    else if (RDG_MEM_WRITE_STMT (rdg, i))
      {
	nb_writes++;
	if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i)))
	  stores_zero++;
      }

  /* The memset-zero pattern: exactly one store, storing zero, and no
     memory reads at all.  */
  return stores_zero == 1 && nb_writes == 1 && nb_reads == 0;
}

/* Returns true when PARTITION1 and PARTITION2 have similar memory
   accesses in RDG.
*/

static bool
similar_memory_accesses (struct graph *rdg, bitmap partition1,
			 bitmap partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;

  /* Compare every memory access of PARTITION1 against every memory
     access of PARTITION2; one similar pair suffices.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj)
	if (RDG_MEM_WRITE_STMT (rdg, j)
	    || RDG_MEM_READS_STMT (rdg, j))
	  if (rdg_has_similar_memory_accesses (rdg, i, j))
	    return true;

  return false;
}

/* Fuse all the partitions from PARTITIONS that contain similar memory
   references, i.e., we're taking care of cache locality.  This
   function does not fuse those partitions that contain patterns that
   can be code generated with builtins.  */

static void
fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
					      VEC (bitmap, heap) **partitions)
{
  int p1, p2;
  bitmap partition1, partition2;

  for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++)
    if (!can_generate_builtin (rdg, partition1))
      for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++)
	if (p1 != p2
	    && !can_generate_builtin (rdg, partition2)
	    && similar_memory_accesses (rdg, partition1, partition2))
	  {
	    /* Merge PARTITION2 into PARTITION1 and drop it from the
	       vector; P2 is decremented so the element shifted into
	       slot P2 is not skipped.  NOTE(review): PARTITION2's
	       bitmap is removed from the vector but apparently not
	       freed here — presumably released by the caller's final
	       cleanup; verify.  */
	    bitmap_ior_into (partition1, partition2);
	    VEC_ordered_remove (bitmap, *partitions, p2);
	    p2--;
	  }
}

/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.
*/

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
		      VEC (int, heap) **other_stores,
		      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      bitmap np;
      bool part_has_writes = false;
      int v = VEC_index (int, x->vertices, 0);

      /* A component whose first vertex was already absorbed by an
	 earlier partition is skipped entirely.  */
      if (bitmap_bit_p (processed, v))
	continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      /* Components accumulate into PARTITION until one brings a
	 memory write; only then is the partition "useful" and
	 emitted.  */
      if (part_has_writes)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "ldist useful partition:\n");
	      dump_bitmap (dump_file, partition);
	    }

	  VEC_safe_push (bitmap, heap, *partitions, partition);
	  partition = BITMAP_ALLOC (NULL);
	}
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
	&& rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  Recurses once; the recursive call gets an
     empty OTHER_STORES (FOO), so the recursion terminates.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);

  fuse_partitions_with_similar_memory_accesses (rdg, partitions);
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
{
  int i;
  bitmap partition;

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    debug_bitmap_file (file, partition);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);

void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, bitmap partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.
*/

static bool
partition_contains_all_rw (struct graph *rdg, VEC (bitmap, heap) *partitions)
{
  int i;
  bitmap partition;
  int nrw = number_of_rw_in_rdg (rdg);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}

/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  Owns the lifetime of the file-scope bitmaps
   REMAINING_STMTS and UPSTREAM_MEM_WRITES.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
	   VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
	 STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	{
	  int v;
	  unsigned j;
	  bool found = false;

	  for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
	    if (i == v)
	      {
		found = true;
		break;
	      }

	  if (!found)
	    VEC_safe_push (int, heap, other_stores, i);
	}
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
			processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  /* Bail out when distribution would be pointless: a single partition,
     or one partition already covering all memory operations.  */
  if (nbp <= 1
      || partition_contains_all_rw (rdg, partitions))
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  /* COPY_P is true for all but the last partition, which reuses the
     original loop body in place.  */
  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}

/* Distributes the code from LOOP in such a way that producer
   statements are placed before consumer statements.  When STMTS is
   NULL, performs the maximal distribution, if STMTS is not NULL,
   tries to separate only these statements from the LOOP's body.
   Returns the number of distributed loops.
*/

static int
distribute_loop (struct loop *loop, VEC (gimple, heap) *stmts)
{
  int res = 0;
  struct graph *rdg;
  gimple s;
  unsigned i;
  VEC (int, heap) *vertices;

  /* Only single-block-body loops (header + latch) are handled for
     now.  */
  if (loop->num_nodes > 2)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
		 loop->num);

      return res;
    }

  rdg = build_rdg (loop);

  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
		 loop->num);

      return res;
    }

  vertices = VEC_alloc (int, heap, 3);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  /* Translate the requested statements into RDG vertex indices;
     statements without a vertex are silently ignored.  */
  for (i = 0; VEC_iterate (gimple, stmts, i, s); i++)
    {
      int v = rdg_vertex_for_stmt (rdg, s);

      if (v >= 0)
	{
	  VEC_safe_push (int, heap, vertices, v);

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "ldist asked to generate code for vertex %d\n", v);
	}
    }

  res = ldist_gen (loop, rdg, vertices);
  VEC_free (int, heap, vertices);
  free_rdg (rdg);

  return res;
}

/* Distribute all loops in the current function.  */

static unsigned int
tree_loop_distribution (void)
{
  struct loop *loop;
  loop_iterator li;
  int nb_generated_loops = 0;

  FOR_EACH_LOOP (li, loop, 0)
    {
      VEC (gimple, heap) *work_list = NULL;

      /* If the loop doesn't have a single exit we will fail anyway,
	 so do that early.  */
      if (!single_exit (loop))
	continue;

      /* With the following working list, we're asking distribute_loop
	 to separate the stores of the loop: when dependences allow,
	 it will end on having one store per loop.  */
      stores_from_loop (loop, &work_list);

      /* A simple heuristic for cache locality is to not split stores
	 to the same array.  Without this call, an unrolled loop would
	 be split into as many loops as unroll factor, each loop
	 storing in the same array.  */
      remove_similar_memory_refs (&work_list);

      nb_generated_loops = distribute_loop (loop, work_list);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  if (nb_generated_loops > 1)
	    fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
		     loop->num, nb_generated_loops);
	  else
	    fprintf (dump_file, "Loop %d is the same.\n", loop->num);
	}

      verify_loop_structure ();

      VEC_free (gimple, heap, work_list);
    }

  return 0;
}

/* Pass gate: enabled by -ftree-loop-distribution.  */

static bool
gate_tree_loop_distribution (void)
{
  return flag_tree_loop_distribution != 0;
}

struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",			/* name */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,	/* execute */
  NULL,				/* sub */
  NULL,				/* next */
  0,				/* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,	/* tv_id */
  PROP_cfg | PROP_ssa,		/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_dump_func | TODO_verify_loops /* todo_flags_finish */
 }
};