/* Array prefetching.
   Copyright (C) 2005 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 59 Temple Place - Suite 330, Boston, MA
02111-1307, USA.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "rtl.h"
#include "tm_p.h"
#include "hard-reg-set.h"
#include "basic-block.h"
#include "output.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "varray.h"
#include "expr.h"
#include "tree-pass.h"
#include "ggc.h"
#include "insn-config.h"
#include "recog.h"
#include "hashtab.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "toplev.h"
#include "params.h"
#include "langhooks.h"

/* This pass inserts prefetch instructions to optimize cache usage during
   accesses to arrays in loops.  It processes loops sequentially and:

   1) Gathers all memory references in the single loop.
   2) For each of the references it decides when it is profitable to prefetch
      it.  To do so, we evaluate the reuse among the accesses and determine
      two values: PREFETCH_BEFORE (meaning that it only makes sense to do
      prefetching in the first PREFETCH_BEFORE iterations of the loop) and
      PREFETCH_MOD (meaning that it only makes sense to prefetch in the
      iterations of the loop that are zero modulo PREFETCH_MOD).  For example
      (assuming cache line size is 64 bytes, char has size 1 byte and there
      is no hardware sequential prefetch):

      char *a;
      for (i = 0; i < max; i++)
	{
	  a[255] = ...;		(0)
	  a[i] = ...;		(1)
	  a[i + 64] = ...;	(2)
	  a[16*i] = ...;	(3)
	  a[187*i] = ...;	(4)
	  a[187*i + 50] = ...;	(5)
	}

      (0) obviously has PREFETCH_BEFORE 1
      (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
	  location 64 iterations before it, and PREFETCH_MOD 64 (since
	  it hits the same cache line otherwise).
      (2) has PREFETCH_MOD 64
      (3) has PREFETCH_MOD 4
      (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since
	  the cache line accessed by (4) is the same with probability only
	  7/32.
      (5) has PREFETCH_MOD 1 as well.

   3) We determine how much ahead we need to prefetch.  The number of
      iterations needed is time to fetch / time spent in one iteration of
      the loop.  The problem is that we do not know either of these values,
      so we just make a heuristic guess based on a magic (possibly)
      target-specific constant and the size of the loop.

   4) Determine which of the references we prefetch.  We take into account
      that there is a maximum number of simultaneous prefetches (provided
      by the machine description).  We issue as many prefetches as possible
      while still staying within this bound (starting with those with the
      lowest prefetch_mod, since they are responsible for most of the cache
      misses).

   5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
      and PREFETCH_BEFORE requirements (within some bounds), and to avoid
      prefetching nonaccessed memory.
      TODO -- actually implement peeling.

   6) We actually emit the prefetch instructions.  ??? Perhaps emit the
      prefetch instructions with guards in cases where 5) was not sufficient
      to satisfy the constraints?

   Some other TODOs:
      -- write and use a more general reuse analysis (that could also be used
	 in other cache-aimed loop optimizations)
      -- make it behave sanely together with the prefetches given by the user
	 (now we just ignore them; at the very least we should avoid
	 optimizing loops in which the user put his own prefetches)
      -- we assume cache line size alignment of arrays; this could be
	 improved.  */

/* Magic constants follow.  These should be replaced by machine specific
   numbers.  */

/* A number that should roughly correspond to the number of instructions
   executed before the prefetch is completed.  */

#ifndef PREFETCH_LATENCY
#define PREFETCH_LATENCY 200
#endif

/* Number of prefetches that can run at the same time.  */

#ifndef SIMULTANEOUS_PREFETCHES
#define SIMULTANEOUS_PREFETCHES 3
#endif

/* True if write can be prefetched by a read prefetch.  */

#ifndef WRITE_CAN_USE_READ_PREFETCH
#define WRITE_CAN_USE_READ_PREFETCH 1
#endif

/* True if read can be prefetched by a write prefetch.  */

#ifndef READ_CAN_USE_WRITE_PREFETCH
#define READ_CAN_USE_WRITE_PREFETCH 0
#endif

/* Cache line size.  Assumed to be a power of two.  */

#ifndef PREFETCH_BLOCK
#define PREFETCH_BLOCK 32
#endif

/* Do we have forward hardware sequential prefetching?  */

#ifndef HAVE_FORWARD_PREFETCH
#define HAVE_FORWARD_PREFETCH 0
#endif

/* Do we have backward hardware sequential prefetching?  */

#ifndef HAVE_BACKWARD_PREFETCH
#define HAVE_BACKWARD_PREFETCH 0
#endif

/* In some cases we are only able to determine that there is a certain
   probability that the two accesses hit the same cache line.  In this
   case, we issue the prefetches for both of them if this probability
   is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */

#ifndef ACCEPTABLE_MISS_RATE
#define ACCEPTABLE_MISS_RATE 50
#endif

#ifndef HAVE_prefetch
#define HAVE_prefetch 0
#endif
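
/* To get a feel for how these defaults interact (the numbers below are the
   fallback values above, not those of any particular target, and the stride
   and loop size are made up for illustration): with PREFETCH_BLOCK 32, a
   reference advancing by, say, 4 bytes per iteration only needs a prefetch
   every 32 / 4 = 8 iterations, and with PREFETCH_LATENCY 200 a loop body of,
   say, 50 insns needs to prefetch about 200 / 50 = 4 iterations ahead.  The
   code below computes both of these quantities.  */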

/* The group of references between which reuse may occur.  */

struct mem_ref_group
{
  tree base;			/* Base of the reference.  */
  HOST_WIDE_INT step;		/* Step of the reference.  */
  struct mem_ref *refs;		/* References in the group.  */
  struct mem_ref_group *next;	/* Next group of references.  */
};

/* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched.  */

#define PREFETCH_ALL		(~(unsigned HOST_WIDE_INT) 0)

/* The memory reference.  */

struct mem_ref
{
  tree stmt;			/* Statement in which the reference appears.  */
  tree mem;			/* The reference.  */
  HOST_WIDE_INT delta;		/* Constant offset of the reference.  */
  bool write_p;			/* Is it a write?  */
  struct mem_ref_group *group;	/* The group of references it belongs to.  */
  unsigned HOST_WIDE_INT prefetch_mod;
				/* Prefetch only each PREFETCH_MOD-th
				   iteration.  */
  unsigned HOST_WIDE_INT prefetch_before;
				/* Prefetch only first PREFETCH_BEFORE
				   iterations.  */
  bool issue_prefetch_p;	/* Should we really issue the prefetch?  */
  struct mem_ref *next;		/* The next reference in the group.  */
};

/* Dumps information about reference REF to FILE.  */

static void
dump_mem_ref (FILE *file, struct mem_ref *ref)
{
  fprintf (file, "Reference %p:\n", (void *) ref);

  fprintf (file, "  group %p (base ", (void *) ref->group);
  print_generic_expr (file, ref->group->base, TDF_SLIM);
  fprintf (file, ", step ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
  fprintf (file, ")\n");

  fprintf (file, "  delta ");
  fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
  fprintf (file, "\n");

  fprintf (file, "  %s\n", ref->write_p ? "write" : "read");

  fprintf (file, "\n");
}

/* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
   exist.  */

static struct mem_ref_group *
find_or_create_group (struct mem_ref_group **groups, tree base,
		      HOST_WIDE_INT step)
{
  struct mem_ref_group *group;

  for (; *groups; groups = &(*groups)->next)
    {
      if ((*groups)->step == step
	  && operand_equal_p ((*groups)->base, base, 0))
	return *groups;

      /* Keep the list of groups sorted by decreasing step.  */
      if ((*groups)->step < step)
	break;
    }

  group = xcalloc (1, sizeof (struct mem_ref_group));
  group->base = base;
  group->step = step;
  group->refs = NULL;
  group->next = *groups;
  *groups = group;

  return group;
}

/* Records a memory reference MEM in GROUP with offset DELTA and write status
   WRITE_P.  The reference occurs in statement STMT.  */

static void
record_ref (struct mem_ref_group *group, tree stmt, tree mem,
	    HOST_WIDE_INT delta, bool write_p)
{
  struct mem_ref **aref;

  /* Do not record the same address twice.  */
  for (aref = &group->refs; *aref; aref = &(*aref)->next)
    {
      /* It need not be possible for a write reference to reuse a read
	 prefetch, or vice versa.  */
      if (!WRITE_CAN_USE_READ_PREFETCH
	  && write_p
	  && !(*aref)->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !write_p
	  && (*aref)->write_p)
	continue;

      if ((*aref)->delta == delta)
	return;
    }

  (*aref) = xcalloc (1, sizeof (struct mem_ref));
  (*aref)->stmt = stmt;
  (*aref)->mem = mem;
  (*aref)->delta = delta;
  (*aref)->write_p = write_p;
  (*aref)->prefetch_before = PREFETCH_ALL;
  (*aref)->prefetch_mod = 1;
  (*aref)->issue_prefetch_p = false;
  (*aref)->group = group;
  (*aref)->next = NULL;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_mem_ref (dump_file, *aref);
}
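
/* As an illustration of the grouping (using the example loop from the head
   of this file, where the steps are in bytes because the array is a char
   array): a[i] and a[i + 64] end up, roughly speaking, in a single group
   with step 1, recorded as two mem_refs with deltas 0 and 64, which is what
   lets prune_ref_by_group_reuse below notice that (1) reuses (2); a[16*i]
   gets a group of its own with step 16.  */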

/* Release memory references in GROUPS.  */

static void
release_mem_refs (struct mem_ref_group *groups)
{
  struct mem_ref_group *next_g;
  struct mem_ref *ref, *next_r;

  for (; groups; groups = next_g)
    {
      next_g = groups->next;
      for (ref = groups->refs; ref; ref = next_r)
	{
	  next_r = ref->next;
	  free (ref);
	}
      free (groups);
    }
}

/* A structure used to pass arguments to idx_analyze_ref.  */

struct ar_data
{
  struct loop *loop;		/* Loop of the reference.  */
  tree stmt;			/* Statement of the reference.  */
  HOST_WIDE_INT *step;		/* Step of the memory reference.  */
  HOST_WIDE_INT *delta;		/* Offset of the memory reference.  */
};

/* Analyzes a single INDEX of a memory reference to obtain information
   described at analyze_ref.  Callback for for_each_index.  */

static bool
idx_analyze_ref (tree base, tree *index, void *data)
{
  struct ar_data *ar_data = data;
  tree ibase, step, stepsize;
  HOST_WIDE_INT istep, idelta = 0, imult = 1;
  affine_iv iv;

  if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
      || TREE_CODE (base) == ALIGN_INDIRECT_REF)
    return false;

  if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
    return false;
  ibase = iv.base;
  step = iv.step;

  if (zero_p (step))
    istep = 0;
  else
    {
      if (!cst_and_fits_in_hwi (step))
	return false;
      istep = int_cst_value (step);
    }

  if (TREE_CODE (ibase) == PLUS_EXPR
      && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
    {
      idelta = int_cst_value (TREE_OPERAND (ibase, 1));
      ibase = TREE_OPERAND (ibase, 0);
    }
  if (cst_and_fits_in_hwi (ibase))
    {
      idelta += int_cst_value (ibase);
      ibase = build_int_cst (TREE_TYPE (ibase), 0);
    }

  if (TREE_CODE (base) == ARRAY_REF)
    {
      stepsize = array_ref_element_size (base);
      if (!cst_and_fits_in_hwi (stepsize))
	return false;
      imult = int_cst_value (stepsize);

      istep *= imult;
      idelta *= imult;
    }

  *ar_data->step += istep;
  *ar_data->delta += idelta;
  *index = ibase;

  return true;
}

/* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA
   and STEP are integer constants and iter is number of iterations of LOOP.
   The reference occurs in statement STMT.  Strips nonaddressable component
   references from REF_P.  */

static bool
analyze_ref (struct loop *loop, tree *ref_p, tree *base,
	     HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
	     tree stmt)
{
  struct ar_data ar_data;
  tree off;
  HOST_WIDE_INT bit_offset;
  tree ref = *ref_p;

  *step = 0;
  *delta = 0;

  /* First strip off the component references.  Ignore bitfields.  */
  if (TREE_CODE (ref) == COMPONENT_REF
      && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
    ref = TREE_OPERAND (ref, 0);

  *ref_p = ref;

  for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
    {
      off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
      bit_offset = TREE_INT_CST_LOW (off);
      gcc_assert (bit_offset % BITS_PER_UNIT == 0);

      *delta += bit_offset / BITS_PER_UNIT;
    }

  *base = unshare_expr (ref);
  ar_data.loop = loop;
  ar_data.stmt = stmt;
  ar_data.step = step;
  ar_data.delta = delta;
  return for_each_index (base, idx_analyze_ref, &ar_data);
}

/* Record a memory reference REF to the list REFS.  The reference occurs in
   LOOP in statement STMT and it is write if WRITE_P.  */

static void
gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
			      tree ref, bool write_p, tree stmt)
{
  tree base;
  HOST_WIDE_INT step, delta;
  struct mem_ref_group *agrp;

  if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
    return;

  /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
     are integer constants.  */
  agrp = find_or_create_group (refs, base, step);
  record_ref (agrp, stmt, ref, delta, write_p);
}
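
/* For instance (a hypothetical int array "a", so the element size is 4
   bytes), the reference a[4 * i + 5] would be analyzed into roughly
   BASE = a[0], STEP = 16 and DELTA = 20, both scaled by the element size,
   and would therefore be recorded in the group with base a[0] and
   step 16.  */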

/* Record the suitable memory references in LOOP.  */

static struct mem_ref_group *
gather_memory_references (struct loop *loop)
{
  basic_block *body = get_loop_body_in_dom_order (loop);
  basic_block bb;
  unsigned i;
  block_stmt_iterator bsi;
  tree stmt, lhs, rhs;
  struct mem_ref_group *refs = NULL;

  /* Scan the loop body in order, so that the former references precede the
     later ones.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (bb->loop_father != loop)
	continue;

      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  stmt = bsi_stmt (bsi);
	  if (TREE_CODE (stmt) != MODIFY_EXPR)
	    continue;

	  lhs = TREE_OPERAND (stmt, 0);
	  rhs = TREE_OPERAND (stmt, 1);

	  if (REFERENCE_CLASS_P (rhs))
	    gather_memory_references_ref (loop, &refs, rhs, false, stmt);
	  if (REFERENCE_CLASS_P (lhs))
	    gather_memory_references_ref (loop, &refs, lhs, true, stmt);
	}
    }
  free (body);

  return refs;
}

/* Prune the prefetch candidate REF using the self-reuse.  */

static void
prune_ref_by_self_reuse (struct mem_ref *ref)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;

  if (step == 0)
    {
      /* Prefetch references to invariant address just once.  */
      ref->prefetch_before = 1;
      return;
    }

  if (backward)
    step = -step;

  if (step > PREFETCH_BLOCK)
    return;

  if ((backward && HAVE_BACKWARD_PREFETCH)
      || (!backward && HAVE_FORWARD_PREFETCH))
    {
      ref->prefetch_before = 1;
      return;
    }

  ref->prefetch_mod = PREFETCH_BLOCK / step;
}
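
/* For example, with the default PREFETCH_BLOCK of 32 (and no hardware
   sequential prefetch), a reference advancing by 4 bytes per iteration gets
   prefetch_mod = 32 / 4 = 8: the other seven iterations touch a cache line
   that has already been prefetched.  A reference whose step exceeds 32 bytes
   is not restricted here, since every iteration hits a new line.  */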

/* Divides X by BY, rounding down.  */

static HOST_WIDE_INT
ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
{
  gcc_assert (by > 0);

  if (x >= 0)
    return x / by;
  else
    return (x + by - 1) / by;
}

/* Prune the prefetch candidate REF using the reuse with BY.
   If BY_IS_BEFORE is true, BY is before REF in the loop.  */

static void
prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
			  bool by_is_before)
{
  HOST_WIDE_INT step = ref->group->step;
  bool backward = step < 0;
  HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
  HOST_WIDE_INT delta = delta_b - delta_r;
  HOST_WIDE_INT hit_from;
  unsigned HOST_WIDE_INT prefetch_before, prefetch_block;

  if (delta == 0)
    {
      /* If the references have the same address, only prefetch the
	 former.  */
      if (by_is_before)
	ref->prefetch_before = 0;

      return;
    }

  if (!step)
    {
      /* If the reference addresses are invariant and fall into the
	 same cache line, prefetch just the first one.  */
      if (!by_is_before)
	return;

      if (ddown (ref->delta, PREFETCH_BLOCK)
	  != ddown (by->delta, PREFETCH_BLOCK))
	return;

      ref->prefetch_before = 0;
      return;
    }

  /* Only prune the reference that is behind in the array.  */
  if (backward)
    {
      if (delta > 0)
	return;

      /* Transform the data so that we may assume that the accesses
	 are forward.  */
      delta = - delta;
      step = -step;
      delta_r = PREFETCH_BLOCK - 1 - delta_r;
      delta_b = PREFETCH_BLOCK - 1 - delta_b;
    }
  else
    {
      if (delta < 0)
	return;
    }

  /* Check whether the two references are likely to hit the same cache
     line, and how far apart the iterations in which they do are from
     each other.  */

  if (step <= PREFETCH_BLOCK)
    {
      /* The accesses are sure to meet.  Let us check when.  */
      hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
      prefetch_before = (hit_from - delta_r + step - 1) / step;

      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* A more complicated case.  First let us ensure that the size of the cache
     line and the step are coprime (here we assume that PREFETCH_BLOCK is a
     power of two).  */
  prefetch_block = PREFETCH_BLOCK;
  while ((step & 1) == 0
	 && prefetch_block > 1)
    {
      step >>= 1;
      prefetch_block >>= 1;
      delta >>= 1;
    }

  /* Now step > prefetch_block, and step and prefetch_block are coprime.
     Determine the probability that the accesses hit the same cache line.  */

  prefetch_before = delta / step;
  delta %= step;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* Try also the following iteration.  */
  prefetch_before++;
  delta = step - delta;
  if ((unsigned HOST_WIDE_INT) delta
      <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
    {
      if (prefetch_before < ref->prefetch_before)
	ref->prefetch_before = prefetch_before;

      return;
    }

  /* REF probably does not reuse BY.  */
  return;
}
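
/* A worked instance of the simple case above, using the default
   PREFETCH_BLOCK of 32 and the a[i] / a[i + 64] pair from the example at the
   top of the file (step 1, delta_r = 0, delta_b = 64): hit_from is
   ddown (64, 32) * 32 = 64, so prefetch_before becomes
   (64 - 0 + 1 - 1) / 1 = 64.  In other words, prefetching a[i] only pays off
   during the first 64 iterations; afterwards the line was already touched by
   a[i + 64] sixty-four iterations earlier.  */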

/* Prune the prefetch candidate REF using the reuses with other references
   in REFS.  */

static void
prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
{
  struct mem_ref *prune_by;
  bool before = true;

  prune_ref_by_self_reuse (ref);

  for (prune_by = refs; prune_by; prune_by = prune_by->next)
    {
      if (prune_by == ref)
	{
	  before = false;
	  continue;
	}

      if (!WRITE_CAN_USE_READ_PREFETCH
	  && ref->write_p
	  && !prune_by->write_p)
	continue;
      if (!READ_CAN_USE_WRITE_PREFETCH
	  && !ref->write_p
	  && prune_by->write_p)
	continue;

      prune_ref_by_group_reuse (ref, prune_by, before);
    }
}

/* Prune the prefetch candidates in GROUP using the reuse analysis.  */

static void
prune_group_by_reuse (struct mem_ref_group *group)
{
  struct mem_ref *ref_pruned;

  for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
    {
      prune_ref_by_reuse (ref_pruned, group->refs);

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  fprintf (dump_file, "Reference %p:", (void *) ref_pruned);

	  if (ref_pruned->prefetch_before == PREFETCH_ALL
	      && ref_pruned->prefetch_mod == 1)
	    fprintf (dump_file, " no restrictions");
	  else if (ref_pruned->prefetch_before == 0)
	    fprintf (dump_file, " do not prefetch");
	  else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
	    fprintf (dump_file, " prefetch once");
	  else
	    {
	      if (ref_pruned->prefetch_before != PREFETCH_ALL)
		{
		  fprintf (dump_file, " prefetch before ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_before);
		}
	      if (ref_pruned->prefetch_mod != 1)
		{
		  fprintf (dump_file, " prefetch mod ");
		  fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
			   ref_pruned->prefetch_mod);
		}
	    }
	  fprintf (dump_file, "\n");
	}
    }
}

/* Prune the list of prefetch candidates GROUPS using the reuse analysis.  */

static void
prune_by_reuse (struct mem_ref_group *groups)
{
  for (; groups; groups = groups->next)
    prune_group_by_reuse (groups);
}

/* Returns true if we should issue a prefetch for REF.  */

static bool
should_issue_prefetch_p (struct mem_ref *ref)
{
  /* For now, do not issue prefetches that would only be useful in the first
     few iterations of the loop.  */
  if (ref->prefetch_before != PREFETCH_ALL)
    return false;

  return true;
}

/* Decide which of the prefetch candidates in GROUPS to prefetch.
   AHEAD is the number of iterations to prefetch ahead (which corresponds
   to the number of simultaneous instances of one prefetch running at a
   time).  UNROLL_FACTOR is the factor by which the loop is going to be
   unrolled.  Returns true if there is anything to prefetch.  */

static bool
schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
		     unsigned ahead)
{
  unsigned max_prefetches, n_prefetches;
  struct mem_ref *ref;
  bool any = false;

  max_prefetches = (SIMULTANEOUS_PREFETCHES * unroll_factor) / ahead;
  if (max_prefetches > (unsigned) SIMULTANEOUS_PREFETCHES)
    max_prefetches = SIMULTANEOUS_PREFETCHES;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Max prefetches to issue: %d.\n", max_prefetches);

  if (!max_prefetches)
    return false;

  /* For now we just take memory references one by one and issue
     prefetches for as many as possible.  The groups are sorted
     starting with the largest step, since the references with
     large step are more likely to cause many cache misses.  */

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      {
	if (!should_issue_prefetch_p (ref))
	  continue;

	ref->issue_prefetch_p = true;

	/* If prefetch_mod is less than unroll_factor, we need to insert
	   several prefetches for the reference.  */
	n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
			/ ref->prefetch_mod);
	if (max_prefetches <= n_prefetches)
	  return true;

	max_prefetches -= n_prefetches;
	any = true;
      }

  return any;
}
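
/* To make the budget above concrete (the default SIMULTANEOUS_PREFETCHES of
   3, and, say, unroll_factor 4 and ahead 4): max_prefetches = 3 * 4 / 4 = 3,
   already at the cap, so there is a budget of three prefetches per unrolled
   iteration; a reference with prefetch_mod 2 then consumes
   (4 + 2 - 1) / 2 = 2 of them.  */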

/* Determine whether there is any reference suitable for prefetching
   in GROUPS.  */

static bool
anything_to_prefetch_p (struct mem_ref_group *groups)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref))
	return true;

  return false;
}

/* Issue prefetches for the reference REF into the loop as decided before.
   AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
   is the factor by which LOOP was unrolled.  */

static void
issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
{
  HOST_WIDE_INT delta;
  tree addr, addr_base, prefetch, params, write_p;
  block_stmt_iterator bsi;
  unsigned n_prefetches, ap;

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);

  bsi = bsi_for_stmt (ref->stmt);

  n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
		  / ref->prefetch_mod);
  addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
  addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base),
					true, NULL);

  for (ap = 0; ap < n_prefetches; ap++)
    {
      /* Determine the address to prefetch.  */
      delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
      addr = fold_build2 (PLUS_EXPR, ptr_type_node,
			  addr_base, build_int_cst (ptr_type_node, delta));
      addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);

      /* Create the prefetch instruction.  */
      write_p = ref->write_p ? integer_one_node : integer_zero_node;
      params = tree_cons (NULL_TREE, addr,
			  tree_cons (NULL_TREE, write_p, NULL_TREE));

      prefetch = build_function_call_expr (built_in_decls[BUILT_IN_PREFETCH],
					   params);
      bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
    }
}
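
/* So, for a read a[i] through a char array with group step 1, and with
   prefetch_mod 8, unroll_factor 8 and ahead 4 (illustrative numbers only),
   a single call

     __builtin_prefetch (&a[i] + 4, 0);

   is inserted before the statement; only one prefetch per unrolled iteration
   is needed because prefetch_mod equals the unroll factor.  */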

/* Issue prefetches for the references in GROUPS into the loop as decided
   before.  AHEAD is the number of iterations to prefetch ahead.
   UNROLL_FACTOR is the factor by which LOOP was unrolled.  */

static void
issue_prefetches (struct mem_ref_group *groups,
		  unsigned unroll_factor, unsigned ahead)
{
  struct mem_ref *ref;

  for (; groups; groups = groups->next)
    for (ref = groups->refs; ref; ref = ref->next)
      if (ref->issue_prefetch_p)
	issue_prefetch_ref (ref, unroll_factor, ahead);
}

/* Determines whether we can profitably unroll LOOP FACTOR times, and if
   this is the case, fill in DESC by the description of the number of
   iterations.  */

static bool
should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
		      unsigned factor)
{
  if (!can_unroll_loop_p (loop, factor, desc))
    return false;

  /* We only consider loops without control flow for unrolling.  This is not
     a hard restriction -- tree_unroll_loop works with arbitrary loops
     as well; but the unrolling/prefetching is usually more profitable for
     loops consisting of a single basic block, and we want to limit the
     code growth.  */
  if (loop->num_nodes > 2)
    return false;

  return true;
}

/* Determine the factor by which to unroll LOOP, from the information
   contained in the list of memory references REFS.  The description of the
   number of iterations of LOOP is stored to DESC.  AHEAD is the number of
   iterations ahead that we need to prefetch.  NINSNS is the number of
   insns of the LOOP.  */

static unsigned
determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
			 unsigned ahead, unsigned ninsns,
			 struct tree_niter_desc *desc)
{
  unsigned upper_bound, size_factor, constraint_factor;
  unsigned factor, max_mod_constraint, ahead_factor;
  struct mem_ref_group *agp;
  struct mem_ref *ref;

  upper_bound = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  /* First check whether the loop is not too large to unroll.  */
  size_factor = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
  if (size_factor <= 1)
    return 1;

  if (size_factor < upper_bound)
    upper_bound = size_factor;

  max_mod_constraint = 1;
  for (agp = refs; agp; agp = agp->next)
    for (ref = agp->refs; ref; ref = ref->next)
      if (should_issue_prefetch_p (ref)
	  && ref->prefetch_mod > max_mod_constraint)
	max_mod_constraint = ref->prefetch_mod;

  /* Set constraint_factor as large as needed to be able to satisfy the
     largest modulo constraint.  */
  constraint_factor = max_mod_constraint;

  /* If ahead is too large in comparison with the number of available
     prefetches, unroll the loop as much as needed to be able to prefetch
     at least partially some of the references in the loop.  */
  ahead_factor = ((ahead + SIMULTANEOUS_PREFETCHES - 1)
		  / SIMULTANEOUS_PREFETCHES);

  /* Unroll as much as useful, but bound the code size growth.  */
  if (constraint_factor < ahead_factor)
    factor = ahead_factor;
  else
    factor = constraint_factor;
  if (factor > upper_bound)
    factor = upper_bound;

  if (!should_unroll_loop_p (loop, desc, factor))
    return 1;

  return factor;
}
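
/* A rough numerical sketch of the decision above (all values assumed, with
   the default SIMULTANEOUS_PREFETCHES of 3): for a 20-insn loop whose largest
   prefetch_mod is 8 and whose ahead is 10, constraint_factor = 8 and
   ahead_factor = (10 + 3 - 1) / 3 = 4, so we would ask for factor 8, provided
   PARAM_MAX_UNROLL_TIMES and PARAM_MAX_UNROLLED_INSNS / 20 allow it and the
   loop body is a single basic block.  */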

/* Issue prefetch instructions for array references in LOOP.  Returns
   true if the LOOP was unrolled.  LOOPS is the array containing all
   loops.  */

static bool
loop_prefetch_arrays (struct loops *loops, struct loop *loop)
{
  struct mem_ref_group *refs;
  unsigned ahead, ninsns, unroll_factor;
  struct tree_niter_desc desc;
  bool unrolled = false;

  /* Step 1: gather the memory references.  */
  refs = gather_memory_references (loop);

  /* Step 2: estimate the reuse effects.  */
  prune_by_reuse (refs);

  if (!anything_to_prefetch_p (refs))
    goto fail;

  /* Step 3: determine the ahead and unroll factor.  */

  /* FIXME: We should use not the size of the loop, but the average number of
     instructions executed per iteration of the loop.  */
  ninsns = tree_num_loop_insns (loop);
  ahead = (PREFETCH_LATENCY + ninsns - 1) / ninsns;
  unroll_factor = determine_unroll_factor (loop, refs, ahead, ninsns,
					   &desc);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);

  /* If the loop rolls less than the required unroll factor, prefetching
     is useless.  */
  if (unroll_factor > 1
      && cst_and_fits_in_hwi (desc.niter)
      && (unsigned HOST_WIDE_INT) int_cst_value (desc.niter) < unroll_factor)
    goto fail;

  /* Step 4: what to prefetch?  */
  if (!schedule_prefetches (refs, unroll_factor, ahead))
    goto fail;

  /* Step 5: unroll the loop.  TODO -- peeling of first and last few
     iterations so that we do not issue superfluous prefetches.  */
  if (unroll_factor != 1)
    {
      tree_unroll_loop (loops, loop, unroll_factor,
			single_dom_exit (loop), &desc);
      unrolled = true;
    }

  /* Step 6: issue the prefetches.  */
  issue_prefetches (refs, unroll_factor, ahead);

fail:
  release_mem_refs (refs);
  return unrolled;
}

/* Issue prefetch instructions for array references in LOOPS.  */

unsigned int
tree_ssa_prefetch_arrays (struct loops *loops)
{
  unsigned i;
  struct loop *loop;
  bool unrolled = false;
  int todo_flags = 0;

  if (!HAVE_prefetch
      /* It is possible to ask the compiler for, say, -mtune=i486
	 -march=pentium4.  -mtune=i486 causes us to have PREFETCH_BLOCK 0,
	 since this is part of the processor costs and i486 does not have
	 prefetch, but -march=pentium4 causes HAVE_prefetch to be true.
	 Ugh.  */
      || PREFETCH_BLOCK == 0)
    return 0;

  initialize_original_copy_tables ();

  if (!built_in_decls[BUILT_IN_PREFETCH])
    {
      tree type = build_function_type (void_type_node,
				       tree_cons (NULL_TREE,
						  const_ptr_type_node,
						  NULL_TREE));
      tree decl = lang_hooks.builtin_function ("__builtin_prefetch", type,
					       BUILT_IN_PREFETCH,
					       BUILT_IN_NORMAL,
					       NULL, NULL_TREE);
      DECL_IS_NOVOPS (decl) = true;
      built_in_decls[BUILT_IN_PREFETCH] = decl;
    }

  /* We assume that the size of a cache line is a power of two, so verify
     this here.  */
  gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);

  for (i = loops->num - 1; i > 0; i--)
    {
      loop = loops->parray[i];

      /* The loop may have been removed; only process it if it is still
	 present.  */
      if (loop)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Processing loop %d:\n", loop->num);

	  unrolled |= loop_prefetch_arrays (loops, loop);
	}

      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file, "\n\n");
    }

  if (unrolled)
    {
      scev_reset ();
      todo_flags |= TODO_cleanup_cfg;
    }

  free_original_copy_tables ();
  return todo_flags;
}