gcm.cpp revision 605:98cb887364d3
1/*
2 * Copyright 1997-2008 Sun Microsystems, Inc.  All Rights Reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
20 * CA 95054 USA or visit www.sun.com if you need additional information or
21 * have any questions.
22 *
23 */
24
25// Portions of code courtesy of Clifford Click
26
27// Optimization - Graph Style
28
29#include "incls/_precompiled.incl"
30#include "incls/_gcm.cpp.incl"
31
32// To avoid float value underflow
33#define MIN_BLOCK_FREQUENCY 1.e-35f
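// (Estimate_Block_Frequency below asserts that every block frequency is at
// least this large; the register allocator relies on meaningful, non-zero
// frequencies.)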
34
35//----------------------------schedule_node_into_block-------------------------
36// Insert node n into block b. Look for projections of n and make sure they
37// are in b also.
38void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
39  // Set the basic block of n and add n to b.
40  _bbs.map(n->_idx, b);
41  b->add_inst(n);
42
43  // After Matching, nearly any old Node may have projections trailing it.
44  // These are usually machine-dependent flags.  In any case, they might
45  // float to another block below this one.  Move them up.
46  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
47    Node*  use  = n->fast_out(i);
48    if (use->is_Proj()) {
49      Block* buse = _bbs[use->_idx];
50      if (buse != b) {              // In wrong block?
51        if (buse != NULL)
52          buse->find_remove(use);   // Remove from wrong block
53        _bbs.map(use->_idx, b);     // Re-insert in this block
54        b->add_inst(use);
55      }
56    }
57  }
58}
59
60//----------------------------replace_block_proj_ctrl-------------------------
61// Nodes that have is_block_proj() nodes as their control need to use
62// the appropriate Region for their actual block as their control since
63// the projection will be in a predecessor block.
64void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
65  const Node *in0 = n->in(0);
66  assert(in0 != NULL, "Only control-dependent");
67  const Node *p = in0->is_block_proj();
68  if (p != NULL && p != n) {    // Control from a block projection?
69    assert(!n->pinned() || n->is_SafePointScalarObject(), "only SafePointScalarObject pinned node is expected here");
70    // Find trailing Region
71    Block *pb = _bbs[in0->_idx]; // Block-projection already has basic block
72    uint j = 0;
73    if (pb->_num_succs != 1) {  // More than 1 successor?
74      // Search for successor
75      uint max = pb->_nodes.size();
76      assert( max > 1, "" );
77      uint start = max - pb->_num_succs;
78      // Find which output path belongs to projection
79      for (j = start; j < max; j++) {
80        if( pb->_nodes[j] == in0 )
81          break;
82      }
83      assert( j < max, "must find" );
84      // Change control to match head of successor basic block
85      j -= start;
86    }
87    n->set_req(0, pb->_succs[j]->head());
88  }
89}
90
91
92//------------------------------schedule_pinned_nodes--------------------------
93// Set the basic block for Nodes pinned into blocks
94void PhaseCFG::schedule_pinned_nodes( VectorSet &visited ) {
95  // Allocate node stack of size C->unique()+8 to avoid frequent realloc
96  GrowableArray <Node *> spstack(C->unique()+8);
97  spstack.push(_root);
98  while ( spstack.is_nonempty() ) {
99    Node *n = spstack.pop();
100    if( !visited.test_set(n->_idx) ) { // Test node and flag it as visited
101      if( n->pinned() && !_bbs.lookup(n->_idx) ) {  // Pinned?  Nail it down!
102        assert( n->in(0), "pinned Node must have Control" );
103        // Before setting block replace block_proj control edge
104        replace_block_proj_ctrl(n);
105        Node *input = n->in(0);
106        while( !input->is_block_start() )
107          input = input->in(0);
108        Block *b = _bbs[input->_idx];  // Basic block of controlling input
109        schedule_node_into_block(n, b);
110      }
111      for( int i = n->req() - 1; i >= 0; --i ) {  // For all inputs
112        if( n->in(i) != NULL )
113          spstack.push(n->in(i));
114      }
115    }
116  }
117}
118
119#ifdef ASSERT
120// Assert that new input b2 is dominated by all previous inputs.
121// Check this by seeing that it is dominated by b1, the deepest
122// input observed before b2.
123static void assert_dom(Block* b1, Block* b2, Node* n, Block_Array &bbs) {
124  if (b1 == NULL)  return;
125  assert(b1->_dom_depth < b2->_dom_depth, "sanity");
126  Block* tmp = b2;
127  while (tmp != b1 && tmp != NULL) {
128    tmp = tmp->_idom;
129  }
130  if (tmp != b1) {
131    // Detected an unschedulable graph.  Print some nice stuff and die.
132    tty->print_cr("!!! Unschedulable graph !!!");
133    for (uint j=0; j<n->len(); j++) { // For all inputs
134      Node* inn = n->in(j); // Get input
135      if (inn == NULL)  continue;  // Ignore NULL, missing inputs
136      Block* inb = bbs[inn->_idx];
137      tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
138                 inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
139      inn->dump();
140    }
141    tty->print("Failing node: ");
142    n->dump();
143    assert(false, "unschedulable graph");
144  }
145}
146#endif
147
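// Helper for schedule_early: the earliest legal block for a node is the
// deepest block (in the dominator tree) among the blocks of its inputs.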
148static Block* find_deepest_input(Node* n, Block_Array &bbs) {
149  // Find the last input dominated by all other inputs.
150  Block* deepb           = NULL;        // Deepest block so far
151  int    deepb_dom_depth = 0;
152  for (uint k = 0; k < n->len(); k++) { // For all inputs
153    Node* inn = n->in(k);               // Get input
154    if (inn == NULL)  continue;         // Ignore NULL, missing inputs
155    Block* inb = bbs[inn->_idx];
156    assert(inb != NULL, "must already have scheduled this input");
157    if (deepb_dom_depth < (int) inb->_dom_depth) {
158      // The new inb must be dominated by the previous deepb.
159      // The various inputs must be linearly ordered in the dom
160      // tree, or else there will not be a unique deepest block.
161      DEBUG_ONLY(assert_dom(deepb, inb, n, bbs));
162      deepb = inb;                      // Save deepest block
163      deepb_dom_depth = deepb->_dom_depth;
164    }
165  }
166  assert(deepb != NULL, "must be at least one input to n");
167  return deepb;
168}
169
170
171//------------------------------schedule_early---------------------------------
172// Find the earliest Block any instruction can be placed in.  Some instructions
173// are pinned into Blocks.  Unpinned instructions can appear in the last block in
174// which all their inputs occur.
175bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
176  // Allocate stack with enough space to avoid frequent realloc
177  Node_Stack nstack(roots.Size() + 8); // (unique >> 1) + 24 from Java2D stats
178  // roots.push(_root); _root will be processed among C->top() inputs
179  roots.push(C->top());
180  visited.set(C->top()->_idx);
181
182  while (roots.size() != 0) {
183    // Use local variables nstack_top_n & nstack_top_i to cache values
184    // on stack's top.
185    Node *nstack_top_n = roots.pop();
186    uint  nstack_top_i = 0;
187//while_nstack_nonempty:
188    while (true) {
189      // Get parent node and next input's index from stack's top.
190      Node *n = nstack_top_n;
191      uint  i = nstack_top_i;
192
193      if (i == 0) {
194        // Fixup some control.  Constants without control get attached
195        // to root, and nodes that use is_block_proj() nodes should be attached
196        // to the region that starts their block.
197        const Node *in0 = n->in(0);
198        if (in0 != NULL) {              // Control-dependent?
199          replace_block_proj_ctrl(n);
200        } else {               // n->in(0) == NULL
201          if (n->req() == 1) { // This guy is a constant with NO inputs?
202            n->set_req(0, _root);
203          }
204        }
205      }
206
207      // First, visit all inputs and force them to get a block.  If an
208      // input is already in a block we quit following inputs (to avoid
209      // cycles). Instead we put that Node on a worklist to be handled
210      // later (since its inputs may not have a block yet).
211      bool done = true;              // Assume all n's inputs will be processed
212      while (i < n->len()) {         // For all inputs
213        Node *in = n->in(i);         // Get input
214        ++i;
215        if (in == NULL) continue;    // Ignore NULL, missing inputs
216        int is_visited = visited.test_set(in->_idx);
217        if (!_bbs.lookup(in->_idx)) { // Missing block selection?
218          if (is_visited) {
219            // assert( !visited.test(in->_idx), "did not schedule early" );
220            return false;
221          }
222          nstack.push(n, i);         // Save parent node and next input's index.
223          nstack_top_n = in;         // Process current input now.
224          nstack_top_i = 0;
225          done = false;              // Not all n's inputs processed.
226          break; // continue while_nstack_nonempty;
227        } else if (!is_visited) {    // Input not yet visited?
228          roots.push(in);            // Visit this guy later, using worklist
229        }
230      }
231      if (done) {
232        // All of n's inputs have been processed, complete post-processing.
233
234        // Some instructions are pinned into a block.  These include Region,
235        // Phi, Start, Return, and other control-dependent instructions and
236        // any projections which depend on them.
237        if (!n->pinned()) {
238          // Set earliest legal block.
239          _bbs.map(n->_idx, find_deepest_input(n, _bbs));
240        } else {
241          assert(_bbs[n->_idx] == _bbs[n->in(0)->_idx], "Pinned Node should be at the same block as its control edge");
242        }
243
244        if (nstack.is_empty()) {
245          // Finished all nodes on stack.
246          // Process next node on the worklist 'roots'.
247          break;
248        }
249        // Get saved parent node and next input's index.
250        nstack_top_n = nstack.node();
251        nstack_top_i = nstack.index();
252        nstack.pop();
253      } //    if (done)
254    }   // while (true)
255  }     // while (roots.size() != 0)
256  return true;
257}
258
259//------------------------------dom_lca----------------------------------------
260// Find least common ancestor in dominator tree
261// LCA is a current notion of LCA, to be raised above 'this'.
262// As a convenient boundary condition, return 'this' if LCA is NULL.
263// Find the LCA of those two nodes.
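// Illustrative example: if 'this' sits at dom_depth 7 and LCA at dom_depth 4,
// 'this' first climbs three _idom links; the two blocks then climb in lockstep
// until they meet at the least common ancestor.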
264Block* Block::dom_lca(Block* LCA) {
265  if (LCA == NULL || LCA == this)  return this;
266
267  Block* anc = this;
268  while (anc->_dom_depth > LCA->_dom_depth)
269    anc = anc->_idom;           // Walk up till anc is as high as LCA
270
271  while (LCA->_dom_depth > anc->_dom_depth)
272    LCA = LCA->_idom;           // Walk up till LCA is as high as anc
273
274  while (LCA != anc) {          // Walk both up till they are the same
275    LCA = LCA->_idom;
276    anc = anc->_idom;
277  }
278
279  return LCA;
280}
281
282//--------------------------raise_LCA_above_use--------------------------------
283// We are placing a definition, and have been given a def->use edge.
284// The definition must dominate the use, so move the LCA upward in the
285// dominator tree to dominate the use.  If the use is a phi, adjust
286// the LCA only with the phi input paths which actually use this def.
287static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, Block_Array &bbs) {
288  Block* buse = bbs[use->_idx];
289  if (buse == NULL)    return LCA;   // Unused killing Projs have no use block
290  if (!use->is_Phi())  return buse->dom_lca(LCA);
291  uint pmax = use->req();       // Number of Phi inputs
292  // Why doesn't this loop just break after finding the matching input to
293  // the Phi?  Well...it's like this.  I do not have true def-use/use-def
294  // chains.  Means I cannot distinguish, from the def-use direction, which
295  // of many use-defs lead from the same use to the same def.  That is, this
296  // Phi might have several uses of the same def.  Each use appears in a
297  // different predecessor block.  But when I enter here, I cannot distinguish
298  // which use-def edge I should find the predecessor block for.  So I find
299  // them all.  Means I do a little extra work if a Phi uses the same value
300  // more than once.
301  for (uint j=1; j<pmax; j++) { // For all inputs
302    if (use->in(j) == def) {    // Found matching input?
303      Block* pred = bbs[buse->pred(j)->_idx];
304      LCA = pred->dom_lca(LCA);
305    }
306  }
307  return LCA;
308}
309
310//----------------------------raise_LCA_above_marks----------------------------
311// Return a new LCA that dominates LCA and any of its marked predecessors.
312// Search all my parents up to 'early' (exclusive), looking for predecessors
313// which are marked with the given index.  Return the LCA (in the dom tree)
314// of all marked blocks.  If there are none marked, return the original
315// LCA.
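// Note: the visited and mark fields are tagged with the load's node index,
// so marks left over from a previous query cannot be confused with this one
// and no explicit clearing is needed between queries.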
316static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark,
317                                    Block* early, Block_Array &bbs) {
318  Block_List worklist;
319  worklist.push(LCA);
320  while (worklist.size() > 0) {
321    Block* mid = worklist.pop();
322    if (mid == early)  continue;  // stop searching here
323
324    // Test and set the visited bit.
325    if (mid->raise_LCA_visited() == mark)  continue;  // already visited
326
327    // Don't process the current LCA, otherwise the search may terminate early
328    if (mid != LCA && mid->raise_LCA_mark() == mark) {
329      // Raise the LCA.
330      LCA = mid->dom_lca(LCA);
331      if (LCA == early)  break;   // stop searching everywhere
332      assert(early->dominates(LCA), "early is high enough");
333      // Resume searching at that point, skipping intermediate levels.
334      worklist.push(LCA);
335      if (LCA == mid)
336        continue; // Don't mark as visited to avoid early termination.
337    } else {
338      // Keep searching through this block's predecessors.
339      for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
340        Block* mid_parent = bbs[ mid->pred(j)->_idx ];
341        worklist.push(mid_parent);
342      }
343    }
344    mid->set_raise_LCA_visited(mark);
345  }
346  return LCA;
347}
348
349//--------------------------memory_early_block--------------------------------
350// This is a variation of find_deepest_input, the heart of schedule_early.
351// Find the "early" block for a load, if we considered only memory and
352// address inputs, that is, if other data inputs were ignored.
353//
354// Because a subset of edges are considered, the resulting block will
355// be earlier (at a shallower dom_depth) than the true schedule_early
356// point of the node. We compute this earlier block as a more permissive
357// site for anti-dependency insertion, but only if subsume_loads is enabled.
358static Block* memory_early_block(Node* load, Block* early, Block_Array &bbs) {
359  Node* base;
360  Node* index;
361  Node* store = load->in(MemNode::Memory);
362  load->as_Mach()->memory_inputs(base, index);
363
364  assert(base != NodeSentinel && index != NodeSentinel,
365         "unexpected base/index inputs");
366
367  Node* mem_inputs[4];
368  int mem_inputs_length = 0;
369  if (base != NULL)  mem_inputs[mem_inputs_length++] = base;
370  if (index != NULL) mem_inputs[mem_inputs_length++] = index;
371  if (store != NULL) mem_inputs[mem_inputs_length++] = store;
372
373  // In the comparison below, add one to account for the control input,
374  // which may be null, but always takes up a spot in the in array.
375  if (mem_inputs_length + 1 < (int) load->req()) {
376    // This "load" has more inputs than just the memory, base and index inputs.
377    // For purposes of checking anti-dependences, we need to start
378    // from the early block of only the address portion of the instruction,
379    // and ignore other blocks that may have factored into the wider
380    // schedule_early calculation.
381    if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
382
383    Block* deepb           = NULL;        // Deepest block so far
384    int    deepb_dom_depth = 0;
385    for (int i = 0; i < mem_inputs_length; i++) {
386      Block* inb = bbs[mem_inputs[i]->_idx];
387      if (deepb_dom_depth < (int) inb->_dom_depth) {
388        // The new inb must be dominated by the previous deepb.
389        // The various inputs must be linearly ordered in the dom
390        // tree, or else there will not be a unique deepest block.
391        DEBUG_ONLY(assert_dom(deepb, inb, load, bbs));
392        deepb = inb;                      // Save deepest block
393        deepb_dom_depth = deepb->_dom_depth;
394      }
395    }
396    early = deepb;
397  }
398
399  return early;
400}
401
402//--------------------------insert_anti_dependences---------------------------
403// A load may need to witness memory that nearby stores can overwrite.
404// For each nearby store, either insert an "anti-dependence" edge
405// from the load to the store, or else move LCA upward to force the
406// load to (eventually) be scheduled in a block above the store.
407//
408// Do not add edges to stores on distinct control-flow paths;
409// only add edges to stores which might interfere.
410//
411// Return the (updated) LCA.  There will not be any possibly interfering
412// store between the load's "early block" and the updated LCA.
413// Any stores in the updated LCA will have new precedence edges
414// back to the load.  The caller is expected to schedule the load
415// in the LCA, in which case the precedence edges will make LCM
416// preserve anti-dependences.  The caller may also hoist the load
417// above the LCA, if it is not the early block.
418Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
419  assert(load->needs_anti_dependence_check(), "must be a load of some sort");
420  assert(LCA != NULL, "");
421  DEBUG_ONLY(Block* LCA_orig = LCA);
422
423  // Compute the alias index.  Loads and stores with different alias indices
424  // do not need anti-dependence edges.
425  uint load_alias_idx = C->get_alias_index(load->adr_type());
426#ifdef ASSERT
427  if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
428      (PrintOpto || VerifyAliases ||
429       PrintMiscellaneous && (WizardMode || Verbose))) {
430    // Load nodes should not consume all of memory.
431    // Reporting a bottom type indicates a bug in adlc.
432    // If some particular type of node validly consumes all of memory,
433    // sharpen the preceding "if" to exclude it, so we can catch bugs here.
434    tty->print_cr("*** Possible Anti-Dependence Bug:  Load consumes all of memory.");
435    load->dump(2);
436    if (VerifyAliases)  assert(load_alias_idx != Compile::AliasIdxBot, "");
437  }
438#endif
439  assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
440         "String compare is only known 'load' that does not conflict with any stores");
441
442  if (!C->alias_type(load_alias_idx)->is_rewritable()) {
443    // It is impossible to spoil this load by putting stores before it,
444    // because we know that the stores will never update the value
445    // which 'load' must witness.
446    return LCA;
447  }
448
449  node_idx_t load_index = load->_idx;
450
451  // Note the earliest legal placement of 'load', as determined by
452  // the unique point in the dom tree where all memory effects
453  // and other inputs are first available.  (Computed by schedule_early.)
454  // For normal loads, 'early' is the shallowest place (dom graph wise)
455  // to look for anti-deps between this load and any store.
456  Block* early = _bbs[load_index];
457
458  // If we are subsuming loads, compute an "early" block that only considers
459  // memory or address inputs. This block may be different than the
460  // schedule_early block in that it could be at an even shallower depth in the
461  // dominator tree, and allow for a broader discovery of anti-dependences.
462  if (C->subsume_loads()) {
463    early = memory_early_block(load, early, _bbs);
464  }
465
466  ResourceArea *area = Thread::current()->resource_area();
467  Node_List worklist_mem(area);     // prior memory state to store
468  Node_List worklist_store(area);   // possible-def to explore
469  Node_List worklist_visited(area); // visited mergemem nodes
470  Node_List non_early_stores(area); // all relevant stores outside of early
471  bool must_raise_LCA = false;
472
473#ifdef TRACK_PHI_INPUTS
474  // %%% This extra checking fails because MergeMem nodes are not GVNed.
475  // Provide "phi_inputs" to check if every input to a PhiNode is from the
476  // original memory state.  This indicates a PhiNode which should not
477  // prevent the load from sinking.  For such a block, set_raise_LCA_mark
478  // may be overly conservative.
479  // Mechanism: count inputs seen for each Phi encountered in worklist_store.
480  DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
481#endif
482
483  // 'load' uses some memory state; look for users of the same state.
484  // Recurse through MergeMem nodes to the stores that use them.
485
486  // Each of these stores is a possible definition of memory
487  // that 'load' needs to use.  We need to force 'load'
488  // to occur before each such store.  When the store is in
489  // the same block as 'load', we insert an anti-dependence
490  // edge load->store.
491
492  // The relevant stores "nearby" the load consist of a tree rooted
493  // at initial_mem, with internal nodes of type MergeMem.
494  // Therefore, the branches visited by the worklist are of this form:
495  //    initial_mem -> (MergeMem ->)* store
496  // The anti-dependence constraints apply only to the fringe of this tree.
497
498  Node* initial_mem = load->in(MemNode::Memory);
499  worklist_store.push(initial_mem);
500  worklist_visited.push(initial_mem);
501  worklist_mem.push(NULL);
502  while (worklist_store.size() > 0) {
503    // Examine a nearby store to see if it might interfere with our load.
504    Node* mem   = worklist_mem.pop();
505    Node* store = worklist_store.pop();
506    uint op = store->Opcode();
507
508    // MergeMems do not directly have anti-deps.
509    // Treat them as internal nodes in a forward tree of memory states,
510    // the leaves of which are each a 'possible-def'.
511    if (store == initial_mem    // root (exclusive) of tree we are searching
512        || op == Op_MergeMem    // internal node of tree we are searching
513        ) {
514      mem = store;   // It's not a possibly interfering store.
515      if (store == initial_mem)
516        initial_mem = NULL;  // only process initial memory once
517
518      for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
519        store = mem->fast_out(i);
520        if (store->is_MergeMem()) {
521          // Be sure we don't get into combinatorial problems.
522          // (Allow phis to be repeated; they can merge two relevant states.)
523          uint j = worklist_visited.size();
524          for (; j > 0; j--) {
525            if (worklist_visited.at(j-1) == store)  break;
526          }
527          if (j > 0)  continue; // already on work list; do not repeat
528          worklist_visited.push(store);
529        }
530        worklist_mem.push(mem);
531        worklist_store.push(store);
532      }
533      continue;
534    }
535
536    if (op == Op_MachProj || op == Op_Catch)   continue;
537    if (store->needs_anti_dependence_check())  continue;  // not really a store
538
539    // Compute the alias index.  Loads and stores with different alias
540    // indices do not need anti-dependence edges.  Wide MemBar's are
541    // anti-dependent on everything (except immutable memories).
542    const TypePtr* adr_type = store->adr_type();
543    if (!C->can_alias(adr_type, load_alias_idx))  continue;
544
545    // Most slow-path runtime calls do NOT modify Java memory, but
546    // they can block and so write Raw memory.
547    if (store->is_Mach()) {
548      MachNode* mstore = store->as_Mach();
549      if (load_alias_idx != Compile::AliasIdxRaw) {
550        // Check for call into the runtime using the Java calling
551        // convention (and from there into a wrapper); it has no
552        // _method.  Can't do this optimization for Native calls because
553        // they CAN write to Java memory.
554        if (mstore->ideal_Opcode() == Op_CallStaticJava) {
555          assert(mstore->is_MachSafePoint(), "");
556          MachSafePointNode* ms = (MachSafePointNode*) mstore;
557          assert(ms->is_MachCallJava(), "");
558          MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
559          if (mcj->_method == NULL) {
560            // These runtime calls do not write to Java visible memory
561            // (other than Raw) and so do not require anti-dependence edges.
562            continue;
563          }
564        }
565        // Same for SafePoints: they read/write Raw but only read otherwise.
566        // This is basically a workaround for SafePoints only defining control
567        // instead of control + memory.
568        if (mstore->ideal_Opcode() == Op_SafePoint)
569          continue;
570      } else {
571        // Some raw memory, such as the load of "top" at an allocation,
572        // can be control dependent on the previous safepoint. See
573        // comments in GraphKit::allocate_heap() about control input.
574        // Inserting an anti-dep between such a safepoint and a use
575        // creates a cycle, and will cause a subsequent failure in
576        // local scheduling.  (BugId 4919904)
577        // (%%% How can a control input be a safepoint and not a projection??)
578        if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
579          continue;
580      }
581    }
582
583    // Identify a block that the current load must be above,
584    // or else observe that 'store' is all the way up in the
585    // earliest legal block for 'load'.  In the latter case,
586    // immediately insert an anti-dependence edge.
587    Block* store_block = _bbs[store->_idx];
588    assert(store_block != NULL, "unused killing projections skipped above");
589
590    if (store->is_Phi()) {
591      // 'load' uses memory which is one (or more) of the Phi's inputs.
592      // It must be scheduled not before the Phi, but rather before
593      // each of the relevant Phi inputs.
594      //
595      // Instead of finding the LCA of all inputs to a Phi that match 'mem',
596      // we mark each corresponding predecessor block and do a combined
597      // hoisting operation later (raise_LCA_above_marks).
598      //
599      // Do not assert(store_block != early, "Phi merging memory after access")
600      // PhiNode may be at start of block 'early' with backedge to 'early'
601      DEBUG_ONLY(bool found_match = false);
602      for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
603        if (store->in(j) == mem) {   // Found matching input?
604          DEBUG_ONLY(found_match = true);
605          Block* pred_block = _bbs[store_block->pred(j)->_idx];
606          if (pred_block != early) {
607            // If any predecessor of the Phi matches the load's "early block",
608            // we do not need a precedence edge between the Phi and 'load'
609            // since the load will be forced into a block preceding the Phi.
610            pred_block->set_raise_LCA_mark(load_index);
611            assert(!LCA_orig->dominates(pred_block) ||
612                   early->dominates(pred_block), "early is high enough");
613            must_raise_LCA = true;
614          }
615        }
616      }
617      assert(found_match, "no worklist bug");
618#ifdef TRACK_PHI_INPUTS
619#ifdef ASSERT
620      // This assert asks about correct handling of PhiNodes, which may not
621      // have all input edges directly from 'mem'. See BugId 4621264
622      int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
623      // Increment by exactly one even if there are multiple copies of 'mem'
624      // coming into the phi, because we will run this block several times
625      // if there are several copies of 'mem'.  (That's how DU iterators work.)
626      phi_inputs.at_put(store->_idx, num_mem_inputs);
627      assert(PhiNode::Input + num_mem_inputs < store->req(),
628             "Expect at least one phi input will not be from original memory state");
629#endif //ASSERT
630#endif //TRACK_PHI_INPUTS
631    } else if (store_block != early) {
632      // 'store' is between the current LCA and earliest possible block.
633      // Label its block, and decide later on how to raise the LCA
634      // to include the effect on LCA of this store.
635      // If this store's block gets chosen as the raised LCA, we
636      // will find him on the non_early_stores list and stick him
637      // with a precedence edge.
638      // (But, don't bother if LCA is already raised all the way.)
639      if (LCA != early) {
640        store_block->set_raise_LCA_mark(load_index);
641        must_raise_LCA = true;
642        non_early_stores.push(store);
643      }
644    } else {
645      // Found a possibly-interfering store in the load's 'early' block.
646      // This means 'load' cannot sink at all in the dominator tree.
647      // Add an anti-dep edge, and squeeze 'load' into the highest block.
648      assert(store != load->in(0), "dependence cycle found");
649      if (verify) {
650        assert(store->find_edge(load) != -1, "missing precedence edge");
651      } else {
652        store->add_prec(load);
653      }
654      LCA = early;
655      // This turns off the process of gathering non_early_stores.
656    }
657  }
658  // (Worklist is now empty; all nearby stores have been visited.)
659
660  // Finished if 'load' must be scheduled in its 'early' block.
661  // If we found any stores there, they have already been given
662  // precedence edges.
663  if (LCA == early)  return LCA;
664
665  // We get here only if there are no possibly-interfering stores
666  // in the load's 'early' block.  Move LCA up above all predecessors
667  // which contain stores we have noted.
668  //
669  // The raised LCA block can be a home to such interfering stores,
670  // but its predecessors must not contain any such stores.
671  //
672  // The raised LCA will be a lower bound for placing the load,
673  // preventing the load from sinking past any block containing
674  // a store that may invalidate the memory state required by 'load'.
675  if (must_raise_LCA)
676    LCA = raise_LCA_above_marks(LCA, load->_idx, early, _bbs);
677  if (LCA == early)  return LCA;
678
679  // Insert anti-dependence edges from 'load' to each store
680  // in the non-early LCA block.
681  // Mine the non_early_stores list for such stores.
682  if (LCA->raise_LCA_mark() == load_index) {
683    while (non_early_stores.size() > 0) {
684      Node* store = non_early_stores.pop();
685      Block* store_block = _bbs[store->_idx];
686      if (store_block == LCA) {
687        // add anti_dependence from store to load in its own block
688        assert(store != load->in(0), "dependence cycle found");
689        if (verify) {
690          assert(store->find_edge(load) != -1, "missing precedence edge");
691        } else {
692          store->add_prec(load);
693        }
694      } else {
695        assert(store_block->raise_LCA_mark() == load_index, "block was marked");
696        // Any other stores we found must be either inside the new LCA
697        // or else outside the original LCA.  In the latter case, they
698        // did not interfere with any use of 'load'.
699        assert(LCA->dominates(store_block)
700               || !LCA_orig->dominates(store_block), "no stray stores");
701      }
702    }
703  }
704
705  // Return the highest block containing stores; any stores
706  // within that block have been given anti-dependence edges.
707  return LCA;
708}
709
710// This class is used to iterate backwards over the nodes in the graph.
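// Roughly, nodes come out in a post-order over def->use edges (uses before
// their defs), which is what the backward latency computation and late
// scheduling passes below rely on.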
711
712class Node_Backward_Iterator {
713
714private:
715  Node_Backward_Iterator();
716
717public:
718  // Constructor for the iterator
719  Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs);
720
721  // Return the next node in the backward iteration
722  Node *next();
723
724private:
725  VectorSet   &_visited;
726  Node_List   &_stack;
727  Block_Array &_bbs;
728};
729
730// Constructor for the Node_Backward_Iterator
731Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, Block_Array &bbs )
732  : _visited(visited), _stack(stack), _bbs(bbs) {
733  // The stack should contain exactly the root
734  stack.clear();
735  stack.push(root);
736
737  // Clear the visited bits
738  visited.Clear();
739}
740
741// Return the next node in the backward walk over def->use edges
742Node *Node_Backward_Iterator::next() {
743
744  // If the _stack is empty, then just return NULL: finished.
745  if ( !_stack.size() )
746    return NULL;
747
748  // '_stack' is emulating a real _stack.  The 'visit-all-users' loop has been
749  // made stateless, so I do not need to record the index 'i' on my _stack.
750  // Instead I visit all users each time, scanning for unvisited users.
751  // I visit unvisited not-anti-dependence users first, then anti-dependent
752  // children next.
753  Node *self = _stack.pop();
754
755  // I cycle here when I am entering a deeper level of recursion.
756  // The key variable 'self' was set prior to jumping here.
757  while( 1 ) {
758
759    _visited.set(self->_idx);
760
761    // Now schedule all uses as late as possible.
762    uint src     = self->is_Proj() ? self->in(0)->_idx : self->_idx;
763    uint src_rpo = _bbs[src]->_rpo;
764
765    // Schedule all nodes in a post-order visit
766    Node *unvisited = NULL;  // Unvisited anti-dependent Node, if any
767
768    // Scan for unvisited nodes
769    for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
770      // For all uses, schedule late
771      Node* n = self->fast_out(i); // Use
772
773      // Skip already visited children
774      if ( _visited.test(n->_idx) )
775        continue;
776
777      // do not traverse backward control edges
778      Node *use = n->is_Proj() ? n->in(0) : n;
779      uint use_rpo = _bbs[use->_idx]->_rpo;
780
781      if ( use_rpo < src_rpo )
782        continue;
783
784      // Phi nodes always precede uses in a basic block
785      if ( use_rpo == src_rpo && use->is_Phi() )
786        continue;
787
788      unvisited = n;      // Found unvisited
789
790      // Check for possible-anti-dependent
791      if( !n->needs_anti_dependence_check() )
792        break;            // Not visited, not anti-dep; schedule it NOW
793    }
794
795    // Did I find an unvisited not-anti-dependent Node?
796    if ( !unvisited )
797      break;                  // All done with children; post-visit 'self'
798
799    // Visit the unvisited Node.  Contains the obvious push to
800    // indicate I'm entering a deeper level of recursion.  I push the
801    // old state onto the _stack and set a new state and loop (recurse).
802    _stack.push(self);
803    self = unvisited;
804  } // End recursion loop
805
806  return self;
807}
808
809//------------------------------ComputeLatenciesBackwards----------------------
810// Compute the latency of all the instructions.
811void PhaseCFG::ComputeLatenciesBackwards(VectorSet &visited, Node_List &stack) {
812#ifndef PRODUCT
813  if (trace_opto_pipelining())
814    tty->print("\n#---- ComputeLatenciesBackwards ----\n");
815#endif
816
817  Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
818  Node *n;
819
820  // Walk over all the nodes from last to first
821  while ((n = iter.next()) != NULL) {
822    // Set the latency for the definitions of this instruction
823    partial_latency_of_defs(n);
824  }
825} // end ComputeLatenciesBackwards
826
827//------------------------------partial_latency_of_defs------------------------
828// Compute the latency impact of this node on all defs.  This computes
829// a number that increases as we approach the beginning of the routine.
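// For each input j of n, the def feeding that input is raised to at least
// node_latency[n] + n->latency(j), i.e. latencies accumulate from uses back
// toward defs.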
830void PhaseCFG::partial_latency_of_defs(Node *n) {
831  // Set the latency for this instruction
832#ifndef PRODUCT
833  if (trace_opto_pipelining()) {
834    tty->print("# latency_to_inputs: node_latency[%d] = %d for node",
835               n->_idx, _node_latency.at_grow(n->_idx));
836    dump();
837  }
838#endif
839
840  if (n->is_Proj())
841    n = n->in(0);
842
843  if (n->is_Root())
844    return;
845
846  uint nlen = n->len();
847  uint use_latency = _node_latency.at_grow(n->_idx);
848  uint use_pre_order = _bbs[n->_idx]->_pre_order;
849
850  for ( uint j=0; j<nlen; j++ ) {
851    Node *def = n->in(j);
852
853    if (!def || def == n)
854      continue;
855
856    // Walk backwards thru projections
857    if (def->is_Proj())
858      def = def->in(0);
859
860#ifndef PRODUCT
861    if (trace_opto_pipelining()) {
862      tty->print("#    in(%2d): ", j);
863      def->dump();
864    }
865#endif
866
867    // If the defining block is not known, assume it is ok
868    Block *def_block = _bbs[def->_idx];
869    uint def_pre_order = def_block ? def_block->_pre_order : 0;
870
871    if ( (use_pre_order <  def_pre_order) ||
872         (use_pre_order == def_pre_order && n->is_Phi()) )
873      continue;
874
875    uint delta_latency = n->latency(j);
876    uint current_latency = delta_latency + use_latency;
877
878    if (_node_latency.at_grow(def->_idx) < current_latency) {
879      _node_latency.at_put_grow(def->_idx, current_latency);
880    }
881
882#ifndef PRODUCT
883    if (trace_opto_pipelining()) {
884      tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d",
885                    use_latency, j, delta_latency, current_latency, def->_idx,
886                    _node_latency.at_grow(def->_idx));
887    }
888#endif
889  }
890}
891
892//------------------------------latency_from_use-------------------------------
893// Compute the latency of a specific use
894int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
895  // If self-reference, return no latency
896  if (use == n || use->is_Root())
897    return 0;
898
899  uint def_pre_order = _bbs[def->_idx]->_pre_order;
900  uint latency = 0;
901
902  // If the use is not a projection, then it is simple...
903  if (!use->is_Proj()) {
904#ifndef PRODUCT
905    if (trace_opto_pipelining()) {
906      tty->print("#    out(): ");
907      use->dump();
908    }
909#endif
910
911    uint use_pre_order = _bbs[use->_idx]->_pre_order;
912
913    if (use_pre_order < def_pre_order)
914      return 0;
915
916    if (use_pre_order == def_pre_order && use->is_Phi())
917      return 0;
918
919    uint nlen = use->len();
920    uint nl = _node_latency.at_grow(use->_idx);
921
922    for ( uint j=0; j<nlen; j++ ) {
923      if (use->in(j) == n) {
924        // Change this if we want local latencies
925        uint ul = use->latency(j);
926        uint  l = ul + nl;
927        if (latency < l) latency = l;
928#ifndef PRODUCT
929        if (trace_opto_pipelining()) {
930          tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, latency = %d",
931                        nl, j, ul, l, latency);
932        }
933#endif
934      }
935    }
936  } else {
937    // This is a projection, just grab the latency of the use(s)
938    for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
939      uint l = latency_from_use(use, def, use->fast_out(j));
940      if (latency < l) latency = l;
941    }
942  }
943
944  return latency;
945}
946
947//------------------------------latency_from_uses------------------------------
948// Compute the latency of this instruction relative to all of its uses.
949// This computes a number that increases as we approach the beginning of the
950// routine.
951void PhaseCFG::latency_from_uses(Node *n) {
952  // Set the latency for this instruction
953#ifndef PRODUCT
954  if (trace_opto_pipelining()) {
955    tty->print("# latency_from_outputs: node_latency[%d] = %d for node",
956               n->_idx, _node_latency.at_grow(n->_idx));
957    dump();
958  }
959#endif
960  uint latency=0;
961  const Node *def = n->is_Proj() ? n->in(0): n;
962
963  for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
964    uint l = latency_from_use(n, def, n->fast_out(i));
965
966    if (latency < l) latency = l;
967  }
968
969  _node_latency.at_put_grow(n->_idx, latency);
970}
971
972//------------------------------hoist_to_cheaper_block-------------------------
973// Pick a block for node self, between early and LCA, that is a cheaper
974// alternative to LCA.
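// The 'delta' factor below gives a small tolerance: a hoist made purely to
// cover latency is only taken when the candidate block's frequency is not
// meaningfully worse than the best frequency seen so far.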
975Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
976  const double delta = 1+PROB_UNLIKELY_MAG(4);
977  Block* least       = LCA;
978  double least_freq  = least->_freq;
979  uint target        = _node_latency.at_grow(self->_idx);
980  uint start_latency = _node_latency.at_grow(LCA->_nodes[0]->_idx);
981  uint end_latency   = _node_latency.at_grow(LCA->_nodes[LCA->end_idx()]->_idx);
982  bool in_latency    = (target <= start_latency);
983  const Block* root_block = _bbs[_root->_idx];
984
985  // Turn off latency scheduling if scheduling is just plain off
986  if (!C->do_scheduling())
987    in_latency = true;
988
989  // Do not hoist (to cover latency) instructions which target a
990  // single register.  Hoisting stretches the live range of the
991  // single register and may force spilling.
992  MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
993  if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
994    in_latency = true;
995
996#ifndef PRODUCT
997  if (trace_opto_pipelining()) {
998    tty->print("# Find cheaper block for latency %d: ",
999      _node_latency.at_grow(self->_idx));
1000    self->dump();
1001    tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1002      LCA->_pre_order,
1003      LCA->_nodes[0]->_idx,
1004      start_latency,
1005      LCA->_nodes[LCA->end_idx()]->_idx,
1006      end_latency,
1007      least_freq);
1008  }
1009#endif
1010
1011  // Walk up the dominator tree from LCA (Lowest common ancestor) to
1012  // the earliest legal location.  Capture the least execution frequency.
1013  while (LCA != early) {
1014    LCA = LCA->_idom;         // Follow up the dominator tree
1015
1016    if (LCA == NULL) {
1017      // Bailout without retry
1018      C->record_method_not_compilable("late schedule failed: LCA == NULL");
1019      return least;
1020    }
1021
1022    // Don't hoist machine instructions to the root basic block
1023    if (mach && LCA == root_block)
1024      break;
1025
1026    uint start_lat = _node_latency.at_grow(LCA->_nodes[0]->_idx);
1027    uint end_idx   = LCA->end_idx();
1028    uint end_lat   = _node_latency.at_grow(LCA->_nodes[end_idx]->_idx);
1029    double LCA_freq = LCA->_freq;
1030#ifndef PRODUCT
1031    if (trace_opto_pipelining()) {
1032      tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1033        LCA->_pre_order, LCA->_nodes[0]->_idx, start_lat, end_idx, end_lat, LCA_freq);
1034    }
1035#endif
1036    if (LCA_freq < least_freq              || // Better Frequency
1037        ( !in_latency                   &&    // No block containing latency
1038          LCA_freq < least_freq * delta &&    // No worse frequency
1039          target >= end_lat             &&    // within latency range
1040          !self->is_iteratively_computed() )  // But don't hoist IV increments
1041             // because they may end up above other uses of their phi forcing
1042             // their result register to be different from their input.
1043       ) {
1044      least = LCA;            // Found cheaper block
1045      least_freq = LCA_freq;
1046      start_latency = start_lat;
1047      end_latency = end_lat;
1048      if (target <= start_lat)
1049        in_latency = true;
1050    }
1051  }
1052
1053#ifndef PRODUCT
1054  if (trace_opto_pipelining()) {
1055    tty->print_cr("#  Choose block B%d with start latency=%d and freq=%g",
1056      least->_pre_order, start_latency, least_freq);
1057  }
1058#endif
1059
1060  // See if the latency needs to be updated
1061  if (target < end_latency) {
1062#ifndef PRODUCT
1063    if (trace_opto_pipelining()) {
1064      tty->print_cr("#  Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1065    }
1066#endif
1067    _node_latency.at_put_grow(self->_idx, end_latency);
1068    partial_latency_of_defs(self);
1069  }
1070
1071  return least;
1072}
1073
1074
1075//------------------------------schedule_late-----------------------------------
1076// Now schedule all codes as LATE as possible.  This is the LCA in the
1077// dominator tree of all USES of a value.  Pick the block with the least
1078// loop nesting depth that is lowest in the dominator tree.
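// In practice the final choice between the LCA and the earliest legal block
// is driven by execution frequency; see hoist_to_cheaper_block above.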
1079extern const char must_clone[];
1080void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1081#ifndef PRODUCT
1082  if (trace_opto_pipelining())
1083    tty->print("\n#---- schedule_late ----\n");
1084#endif
1085
1086  Node_Backward_Iterator iter((Node *)_root, visited, stack, _bbs);
1087  Node *self;
1088
1089  // Walk over all the nodes from last to first
1090  while ((self = iter.next()) != NULL) {
1091    Block* early = _bbs[self->_idx];   // Earliest legal placement
1092
1093    if (self->is_top()) {
1094      // Top node goes in bb #2 with other constants.
1095      // It must be special-cased, because it has no out edges.
1096      early->add_inst(self);
1097      continue;
1098    }
1099
1100    // No uses, just terminate
1101    if (self->outcnt() == 0) {
1102      assert(self->Opcode() == Op_MachProj, "sanity");
1103      continue;                   // Must be a dead machine projection
1104    }
1105
1106    // If node is pinned in the block, then no scheduling can be done.
1107    if( self->pinned() )          // Pinned in block?
1108      continue;
1109
1110    MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1111    if (mach) {
1112      switch (mach->ideal_Opcode()) {
1113      case Op_CreateEx:
1114        // Don't move exception creation
1115        early->add_inst(self);
1116        continue;
1117        break;
1118      case Op_CheckCastPP:
1119        // Don't move CheckCastPP nodes away from their input, if the input
1120        // is a rawptr (5071820).
1121        Node *def = self->in(1);
1122        if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1123          early->add_inst(self);
1124          continue;
1125        }
1126        break;
1127      }
1128    }
1129
1130    // Gather LCA of all uses
1131    Block *LCA = NULL;
1132    {
1133      for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1134        // For all uses, find LCA
1135        Node* use = self->fast_out(i);
1136        LCA = raise_LCA_above_use(LCA, use, self, _bbs);
1137      }
1138    }  // (Hide defs of imax, i from rest of block.)
1139
1140    // Place temps in the block of their use.  This isn't a
1141    // requirement for correctness but it reduces useless
1142    // interference between temps and other nodes.
1143    if (mach != NULL && mach->is_MachTemp()) {
1144      _bbs.map(self->_idx, LCA);
1145      LCA->add_inst(self);
1146      continue;
1147    }
1148
1149    // Check if 'self' could be anti-dependent on memory
1150    if (self->needs_anti_dependence_check()) {
1151      // Hoist LCA above possible-defs and insert anti-dependences to
1152      // defs in new LCA block.
1153      LCA = insert_anti_dependences(LCA, self);
1154    }
1155
1156    if (early->_dom_depth > LCA->_dom_depth) {
1157      // Somehow the LCA has moved above the earliest legal point.
1158      // (One way this can happen is via memory_early_block.)
1159      if (C->subsume_loads() == true && !C->failing()) {
1160        // Retry with subsume_loads == false
1161        // If this is the first failure, the sentinel string will "stick"
1162        // to the Compile object, and the C2Compiler will see it and retry.
1163        C->record_failure(C2Compiler::retry_no_subsuming_loads());
1164      } else {
1165        // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
1166        C->record_method_not_compilable("late schedule failed: incorrect graph");
1167      }
1168      return;
1169    }
1170
1171    // If there is no opportunity to hoist, then we're done.
1172    bool try_to_hoist = (LCA != early);
1173
1174    // Must clone guys stay next to use; no hoisting allowed.
1175    // Also cannot hoist guys that alter memory or are otherwise not
1176    // allocatable (hoisting can make a value live longer, leading to
1177    // anti and output dependency problems which are normally resolved
1178    // by the register allocator giving everyone a different register).
1179    if (mach != NULL && must_clone[mach->ideal_Opcode()])
1180      try_to_hoist = false;
1181
1182    Block* late = NULL;
1183    if (try_to_hoist) {
1184      // Now find the block with the least execution frequency.
1185      // Start at the latest schedule and work up to the earliest schedule
1186      // in the dominator tree.  Thus the Node will dominate all its uses.
1187      late = hoist_to_cheaper_block(LCA, early, self);
1188    } else {
1189      // Just use the LCA of the uses.
1190      late = LCA;
1191    }
1192
1193    // Put the node into target block
1194    schedule_node_into_block(self, late);
1195
1196#ifdef ASSERT
1197    if (self->needs_anti_dependence_check()) {
1198      // Since precedence edges are only inserted when we're sure they
1199      // are needed, make sure that after placement in a block we don't
1200      // need any new precedence edges.
1201      verify_anti_dependences(late, self);
1202    }
1203#endif
1204  } // Loop until all nodes have been visited
1205
1206} // end schedule_late
1207
1208//------------------------------GlobalCodeMotion-------------------------------
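// Overall flow: schedule pinned nodes, find the earliest legal block for
// every node (schedule_early), compute latencies backwards, schedule nodes
// as late as is profitable (schedule_late), fold suitable NULL checks into
// nearby memory ops, then schedule locally within each block and clean up
// instructions inserted between calls and their Catch nodes.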
1209void PhaseCFG::GlobalCodeMotion( Matcher &matcher, uint unique, Node_List &proj_list ) {
1210  ResourceMark rm;
1211
1212#ifndef PRODUCT
1213  if (trace_opto_pipelining()) {
1214    tty->print("\n---- Start GlobalCodeMotion ----\n");
1215  }
1216#endif
1217
1218  // Initialize the bbs.map for things on the proj_list
1219  uint i;
1220  for( i=0; i < proj_list.size(); i++ )
1221    _bbs.map(proj_list[i]->_idx, NULL);
1222
1223  // Set the basic block for Nodes pinned into blocks
1224  Arena *a = Thread::current()->resource_area();
1225  VectorSet visited(a);
1226  schedule_pinned_nodes( visited );
1227
1228  // Find the earliest Block any instruction can be placed in.  Some
1229  // instructions are pinned into Blocks.  Unpinned instructions can
1230  // appear in the last block in which all their inputs occur.
1231  visited.Clear();
1232  Node_List stack(a);
1233  stack.map( (unique >> 1) + 16, NULL); // Pre-grow the list
1234  if (!schedule_early(visited, stack)) {
1235    // Bailout without retry
1236    C->record_method_not_compilable("early schedule failed");
1237    return;
1238  }
1239
1240  // Build Def-Use edges.
1241  proj_list.push(_root);        // Add real root as another root
1242  proj_list.pop();
1243
1244  // Compute the latency information (via backwards walk) for all the
1245  // instructions in the graph
1246  GrowableArray<uint> node_latency;
1247  _node_latency = node_latency;
1248
1249  if( C->do_scheduling() )
1250    ComputeLatenciesBackwards(visited, stack);
1251
1252  // Now schedule all codes as LATE as possible.  This is the LCA in the
1253  // dominator tree of all USES of a value.  Pick the block with the least
1254  // loop nesting depth that is lowest in the dominator tree.
1255  // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1256  schedule_late(visited, stack);
1257  if( C->failing() ) {
1258    // schedule_late fails only when graph is incorrect.
1259    assert(!VerifyGraphEdges, "verification should have failed");
1260    return;
1261  }
1262
1263  unique = C->unique();
1264
1265#ifndef PRODUCT
1266  if (trace_opto_pipelining()) {
1267    tty->print("\n---- Detect implicit null checks ----\n");
1268  }
1269#endif
1270
1271  // Detect implicit-null-check opportunities.  Basically, find NULL checks
1272  // with suitable memory ops nearby.  Use the memory op to do the NULL check.
1273  // I can generate a memory op if there is not one nearby.
1274  if (C->is_method_compilation()) {
1275    // Don't do it for natives, adapters, or runtime stubs
1276    int allowed_reasons = 0;
1277    // ...and don't do it when there have been too many traps, globally.
1278    for (int reason = (int)Deoptimization::Reason_none+1;
1279         reason < Compile::trapHistLength; reason++) {
1280      assert(reason < BitsPerInt, "recode bit map");
1281      if (!C->too_many_traps((Deoptimization::DeoptReason) reason))
1282        allowed_reasons |= nth_bit(reason);
1283    }
1284    // By reversing the loop direction we get a very minor gain on mpegaudio.
1285    // Feel free to revert to a forward loop for clarity.
1286    // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1287    for( int i= matcher._null_check_tests.size()-2; i>=0; i-=2 ) {
1288      Node *proj = matcher._null_check_tests[i  ];
1289      Node *val  = matcher._null_check_tests[i+1];
1290      _bbs[proj->_idx]->implicit_null_check(this, proj, val, allowed_reasons);
1291      // The implicit_null_check will only perform the transformation
1292      // if the null branch is truly uncommon, *and* it leads to an
1293      // uncommon trap.  Combined with the too_many_traps guards
1294      // above, this prevents SEGV storms reported in 6366351,
1295      // by recompiling offending methods without this optimization.
1296    }
1297  }
1298
1299#ifndef PRODUCT
1300  if (trace_opto_pipelining()) {
1301    tty->print("\n---- Start Local Scheduling ----\n");
1302  }
1303#endif
1304
1305  // Schedule locally.  Right now a simple topological sort.
1306  // Later, do a real latency aware scheduler.
1307  int *ready_cnt = NEW_RESOURCE_ARRAY(int,C->unique());
1308  memset( ready_cnt, -1, C->unique() * sizeof(int) );
1309  visited.Clear();
1310  for (i = 0; i < _num_blocks; i++) {
1311    if (!_blocks[i]->schedule_local(this, matcher, ready_cnt, visited)) {
1312      if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1313        C->record_method_not_compilable("local schedule failed");
1314      }
1315      return;
1316    }
1317  }
1318
1319  // If we inserted any instructions between a Call and its CatchNode,
1320  // clone the instructions on all paths below the Catch.
1321  for( i=0; i < _num_blocks; i++ )
1322    _blocks[i]->call_catch_cleanup(_bbs);
1323
1324#ifndef PRODUCT
1325  if (trace_opto_pipelining()) {
1326    tty->print("\n---- After GlobalCodeMotion ----\n");
1327    for (uint i = 0; i < _num_blocks; i++) {
1328      _blocks[i]->dump();
1329    }
1330  }
1331#endif
1332}
1333
1334
1335//------------------------------Estimate_Block_Frequency-----------------------
1336// Estimate block frequencies based on IfNode probabilities.
1337void PhaseCFG::Estimate_Block_Frequency() {
1338
1339  // Force conditional branches leading to uncommon traps to be unlikely,
1340  // not because we get to the uncommon_trap with less relative frequency,
1341  // but because an uncommon_trap typically causes a deopt, so we only get
1342  // there once.
1343  if (C->do_freq_based_layout()) {
1344    Block_List worklist;
1345    Block* root_blk = _blocks[0];
1346    for (uint i = 1; i < root_blk->num_preds(); i++) {
1347      Block *pb = _bbs[root_blk->pred(i)->_idx];
1348      if (pb->has_uncommon_code()) {
1349        worklist.push(pb);
1350      }
1351    }
1352    while (worklist.size() > 0) {
1353      Block* uct = worklist.pop();
1354      if (uct == _broot) continue;
1355      for (uint i = 1; i < uct->num_preds(); i++) {
1356        Block *pb = _bbs[uct->pred(i)->_idx];
1357        if (pb->_num_succs == 1) {
1358          worklist.push(pb);
1359        } else if (pb->num_fall_throughs() == 2) {
1360          pb->update_uncommon_branch(uct);
1361        }
1362      }
1363    }
1364  }
1365
1366  // Create the loop tree and calculate loop depth.
1367  _root_loop = create_loop_tree();
1368  _root_loop->compute_loop_depth(0);
1369
1370  // Compute block frequency of each block, relative to a single loop entry.
1371  _root_loop->compute_freq();
1372
1373  // Adjust all frequencies to be relative to a single method entry
1374  _root_loop->_freq = 1.0;
1375  _root_loop->scale_freq();
1376
1377  // Force paths ending at uncommon traps to be infrequent.
1378  if (!C->do_freq_based_layout()) {
1379    Block_List worklist;
1380    Block* root_blk = _blocks[0];
1381    for (uint i = 1; i < root_blk->num_preds(); i++) {
1382      Block *pb = _bbs[root_blk->pred(i)->_idx];
1383      if (pb->has_uncommon_code()) {
1384        worklist.push(pb);
1385      }
1386    }
1387    while (worklist.size() > 0) {
1388      Block* uct = worklist.pop();
1389      uct->_freq = PROB_MIN;
1390      for (uint i = 1; i < uct->num_preds(); i++) {
1391        Block *pb = _bbs[uct->pred(i)->_idx];
1392        if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1393          worklist.push(pb);
1394        }
1395      }
1396    }
1397  }
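  // Worked example (hypothetical CFG): if block B ends in an uncommon trap,
  // B->_freq is forced down to PROB_MIN above; any predecessor whose only
  // successor is B is pushed on the worklist and forced down as well, so the
  // whole straight-line path into the trap is treated as essentially never
  // executed by later phases that consume _freq.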
1398
1399#ifdef ASSERT
1400  for (uint i = 0; i < _num_blocks; i++ ) {
1401    Block *b = _blocks[i];
1402    assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1403  }
1404#endif
1405
1406#ifndef PRODUCT
1407  if (PrintCFGBlockFreq) {
1408    tty->print_cr("CFG Block Frequencies");
1409    _root_loop->dump_tree();
1410    if (Verbose) {
1411      tty->print_cr("PhaseCFG dump");
1412      dump();
1413      tty->print_cr("Node dump");
1414      _root->dump(99999);
1415    }
1416  }
1417#endif
1418}
1419
1420//----------------------------create_loop_tree--------------------------------
1421// Create a loop tree from the CFG
1422CFGLoop* PhaseCFG::create_loop_tree() {
1423
1424#ifdef ASSERT
1425  assert( _blocks[0] == _broot, "" );
1426  for (uint i = 0; i < _num_blocks; i++ ) {
1427    Block *b = _blocks[i];
1428    // Check that the _loop fields are clear...we could clear them if not.
1429    assert(b->_loop == NULL, "clear _loop expected");
1430    // Sanity check that the RPO numbering is reflected in the _blocks array.
1431    // It doesn't have to be for the loop tree to be built, but if it is not,
1432    // then the blocks have been reordered since dominator graph building...which
1433    // may invalidate the RPO numbering.
1434    assert(b->_rpo == i, "unexpected reverse post order number");
1435  }
1436#endif
1437
1438  int idct = 0;
1439  CFGLoop* root_loop = new CFGLoop(idct++);
1440
1441  Block_List worklist;
1442
1443  // Assign blocks to loops
1444  for(uint i = _num_blocks - 1; i > 0; i-- ) { // skip Root block
1445    Block *b = _blocks[i];
1446
1447    if (b->head()->is_Loop()) {
1448      Block* loop_head = b;
1449      assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1450      Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1451      Block* tail = _bbs[tail_n->_idx];
1452
1453      // Defensively filter out Loop nodes for non-single-entry loops.
1454      // For all reasonable loops, the head occurs before the tail in RPO.
1455      if (i <= tail->_rpo) {
1456
1457        // The tail and (recursive) predecessors of the tail
1458        // are made members of a new loop.
1459
1460        assert(worklist.size() == 0, "nonempty worklist");
1461        CFGLoop* nloop = new CFGLoop(idct++);
1462        assert(loop_head->_loop == NULL, "just checking");
1463        loop_head->_loop = nloop;
1464        // Add to nloop so push_pred() will skip over inner loops
1465        nloop->add_member(loop_head);
1466        nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, _bbs);
1467
1468        while (worklist.size() > 0) {
1469          Block* member = worklist.pop();
1470          if (member != loop_head) {
1471            for (uint j = 1; j < member->num_preds(); j++) {
1472              nloop->push_pred(member, j, worklist, _bbs);
1473            }
1474          }
1475        }
1476      }
1477    }
1478  }
1479
1480  // Create a member list for each loop consisting
1481  // of both blocks and (immediate child) loops.
1482  for (uint i = 0; i < _num_blocks; i++) {
1483    Block *b = _blocks[i];
1484    CFGLoop* lp = b->_loop;
1485    if (lp == NULL) {
1486      // Not assigned to a loop. Add it to the method's pseudo loop.
1487      b->_loop = root_loop;
1488      lp = root_loop;
1489    }
1490    if (lp == root_loop || b != lp->head()) { // loop heads are already members
1491      lp->add_member(b);
1492    }
1493    if (lp != root_loop) {
1494      if (lp->parent() == NULL) {
1495        // Not a nested loop. Make it a child of the method's pseudo loop.
1496        root_loop->add_nested_loop(lp);
1497      }
1498      if (b == lp->head()) {
1499        // Add nested loop to member list of parent loop.
1500        lp->parent()->add_member(lp);
1501      }
1502    }
1503  }
1504
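  // Shape of the result (illustrative): for a method containing an outer loop
  // L1 with an inner loop L2, root_loop has child L1, L1 has child L2, and
  // each loop's member list holds its own blocks plus its immediate child
  // loops (the child standing in for all of the blocks it contains).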
1505  return root_loop;
1506}
1507
1508//------------------------------push_pred--------------------------------------
1509void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, Block_Array& node_to_blk) {
1510  Node* pred_n = blk->pred(i);
1511  Block* pred = node_to_blk[pred_n->_idx];
1512  CFGLoop *pred_loop = pred->_loop;
1513  if (pred_loop == NULL) {
1514    // Filter out blocks for non-single-entry loops.
1515    // For all reasonable loops, the head occurs before the tail in RPO.
1516    if (pred->_rpo > head()->_rpo) {
1517      pred->_loop = this;
1518      worklist.push(pred);
1519    }
1520  } else if (pred_loop != this) {
1521    // Nested loop.
1522    while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1523      pred_loop = pred_loop->_parent;
1524    }
1525    // Make pred's loop be a child
1526    if (pred_loop->_parent == NULL) {
1527      add_nested_loop(pred_loop);
1528      // Continue with loop entry predecessor.
1529      Block* pred_head = pred_loop->head();
1530      assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1531      assert(pred_head != head(), "loop head in only one loop");
1532      push_pred(pred_head, LoopNode::EntryControl, worklist, node_to_blk);
1533    } else {
1534      assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1535    }
1536  }
1537}
1538
1539//------------------------------add_nested_loop--------------------------------
1540// Make cl a child of the current loop in the loop tree.
1541void CFGLoop::add_nested_loop(CFGLoop* cl) {
1542  assert(_parent == NULL, "no parent yet");
1543  assert(cl != this, "not my own parent");
1544  cl->_parent = this;
1545  CFGLoop* ch = _child;
1546  if (ch == NULL) {
1547    _child = cl;
1548  } else {
1549    while (ch->_sibling != NULL) { ch = ch->_sibling; }
1550    ch->_sibling = cl;
1551  }
1552}
1553
1554//------------------------------compute_loop_depth-----------------------------
1555// Store the loop depth in each CFGLoop object.
1556// Recursively walk the children to do the same for them.
1557void CFGLoop::compute_loop_depth(int depth) {
1558  _depth = depth;
1559  CFGLoop* ch = _child;
1560  while (ch != NULL) {
1561    ch->compute_loop_depth(depth + 1);
1562    ch = ch->_sibling;
1563  }
1564}
1565
1566//------------------------------compute_freq-----------------------------------
1567// Compute the frequency of each block and loop, relative to a single entry
1568// into the dominating loop head.
1569void CFGLoop::compute_freq() {
1570  // Bottom up traversal of loop tree (visit inner loops first.)
1571  // Set loop head frequency to 1.0, then transitively
1572  // compute frequency for all successors in the loop,
1573  // as well as for each exit edge.  Inner loops are
1574  // treated as single blocks with loop exit targets
1575  // as the successor blocks.
1576
1577  // Nested loops first
1578  CFGLoop* ch = _child;
1579  while (ch != NULL) {
1580    ch->compute_freq();
1581    ch = ch->_sibling;
1582  }
1583  assert (_members.length() > 0, "no empty loops");
1584  Block* hd = head();
1585  hd->_freq = 1.0f;
1586  for (int i = 0; i < _members.length(); i++) {
1587    CFGElement* s = _members.at(i);
1588    float freq = s->_freq;
1589    if (s->is_block()) {
1590      Block* b = s->as_Block();
1591      for (uint j = 0; j < b->_num_succs; j++) {
1592        Block* sb = b->_succs[j];
1593        update_succ_freq(sb, freq * b->succ_prob(j));
1594      }
1595    } else {
1596      CFGLoop* lp = s->as_CFGLoop();
1597      assert(lp->_parent == this, "immediate child");
1598      for (int k = 0; k < lp->_exits.length(); k++) {
1599        Block* eb = lp->_exits.at(k).get_target();
1600        float prob = lp->_exits.at(k).get_prob();
1601        update_succ_freq(eb, freq * prob);
1602      }
1603    }
1604  }
1605
1606  // For all loops other than the outer, "method" loop,
1607  // sum and normalize the exit probability. The "method" loop
1608  // should keep the initial exit probability of 1, so that
1609  // inner blocks do not get erroneously scaled.
1610  if (_depth != 0) {
1611    // Total the exit probabilities for this loop.
1612    float exits_sum = 0.0f;
1613    for (int i = 0; i < _exits.length(); i++) {
1614      exits_sum += _exits.at(i).get_prob();
1615    }
1616
1617    // Normalize the exit probabilities. Until now, each
1618    // probability estimated the chance of exit on a
1619    // single loop iteration; afterward, it estimates
1620    // the probability of exit per loop entry.
1621    for (int i = 0; i < _exits.length(); i++) {
1622      Block* et = _exits.at(i).get_target();
1623      float new_prob = 0.0f;
1624      if (_exits.at(i).get_prob() > 0.0f) {
1625        new_prob = _exits.at(i).get_prob() / exits_sum;
1626      }
1627      BlockProbPair bpp(et, new_prob);
1628      _exits.at_put(i, bpp);
1629    }
1630
1631    // Save the total, but guard against unreasonable probability,
1632    // as the value is used to estimate the loop trip count.
1633    // An infinite trip count would blur relative block
1634    // frequencies.
1635    if (exits_sum > 1.0f) exits_sum = 1.0f;
1636    if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
1637    _exit_prob = exits_sum;
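    // Worked example (made-up numbers): two exits with per-iteration
    // probabilities 0.01 and 0.04 give exits_sum == 0.05; they normalize to
    // 0.2 and 0.8, and _exit_prob == 0.05 corresponds to an estimated trip
    // count of roughly 1/0.05 == 20 (assuming trip_count() is 1/_exit_prob).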
1638  }
1639}
1640
1641//------------------------------succ_prob-------------------------------------
1642// Determine the probability of reaching successor 'i' from the receiver block.
1643float Block::succ_prob(uint i) {
1644  int eidx = end_idx();
1645  Node *n = _nodes[eidx];  // Get ending Node
1646
1647  int op = n->Opcode();
1648  if (n->is_Mach()) {
1649    if (n->is_MachNullCheck()) {
1650      // Can only reach here if called after LCM. The original Op_If is gone,
1651      // so we attempt to infer the probability from one or both of the
1652      // successor blocks.
1653      assert(_num_succs == 2, "expecting 2 successors of a null check");
1654      // If either successor has only one predecessor, then the
1655      // probability estimate can be derived using the
1656      // relative frequency of the successor and this block.
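      // Made-up example: if this block's _freq is 1.0 and successor i is the
      // single-predecessor non-trapping path with _freq 0.9999, its estimated
      // probability is 0.9999/1.0, leaving only a small remainder for the
      // null path.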
1657      if (_succs[i]->num_preds() == 2) {
1658        return _succs[i]->_freq / _freq;
1659      } else if (_succs[1-i]->num_preds() == 2) {
1660        return 1 - (_succs[1-i]->_freq / _freq);
1661      } else {
1662        // Estimate using both successor frequencies
1663        float freq = _succs[i]->_freq;
1664        return freq / (freq + _succs[1-i]->_freq);
1665      }
1666    }
1667    op = n->as_Mach()->ideal_Opcode();
1668  }
1669
1670
1671  // Switch on branch type
1672  switch( op ) {
1673  case Op_CountedLoopEnd:
1674  case Op_If: {
1675    assert (i < 2, "just checking");
1676    // Conditionals pass on only part of their frequency
1677    float prob  = n->as_MachIf()->_prob;
1678    assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
1679    // If succ[i] is the FALSE branch, invert path info
1680    if( _nodes[i + eidx + 1]->Opcode() == Op_IfFalse ) {
1681      return 1.0f - prob; // not taken
1682    } else {
1683      return prob; // taken
1684    }
1685  }
1686
1687  case Op_Jump:
1688    // Divide the frequency between all successors evenly
1689    return 1.0f/_num_succs;
1690
1691  case Op_Catch: {
1692    const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1693    if (ci->_con == CatchProjNode::fall_through_index) {
1694      // Fall-thru path gets the lion's share.
1695      return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
1696    } else {
1697      // Presume exceptional paths are equally unlikely
1698      return PROB_UNLIKELY_MAG(5);
1699    }
1700  }
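  // Example (assuming PROB_UNLIKELY_MAG(5) is on the order of 1e-5): a Catch
  // with three successors gives each exception path about 1e-5 and the
  // fall-through path about 1 - 3e-5, so nearly all of the block's frequency
  // flows to the normal continuation.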
1701
1702  case Op_Root:
1703  case Op_Goto:
1704    // Pass frequency straight thru to target
1705    return 1.0f;
1706
1707  case Op_NeverBranch:
1708    return 0.0f;
1709
1710  case Op_TailCall:
1711  case Op_TailJump:
1712  case Op_Return:
1713  case Op_Halt:
1714  case Op_Rethrow:
1715    // Do not push out freq to root block
1716    return 0.0f;
1717
1718  default:
1719    ShouldNotReachHere();
1720  }
1721
1722  return 0.0f;
1723}
1724
1725//------------------------------num_fall_throughs-----------------------------
1726// Return the number of fall-through candidates for a block
1727int Block::num_fall_throughs() {
1728  int eidx = end_idx();
1729  Node *n = _nodes[eidx];  // Get ending Node
1730
1731  int op = n->Opcode();
1732  if (n->is_Mach()) {
1733    if (n->is_MachNullCheck()) {
1734      // In theory either side could fall through; for simplicity's
1735      // sake, assume only the false branch can for now.
1736      return 1;
1737    }
1738    op = n->as_Mach()->ideal_Opcode();
1739  }
1740
1741  // Switch on branch type
1742  switch( op ) {
1743  case Op_CountedLoopEnd:
1744  case Op_If:
1745    return 2;
1746
1747  case Op_Root:
1748  case Op_Goto:
1749    return 1;
1750
1751  case Op_Catch: {
1752    for (uint i = 0; i < _num_succs; i++) {
1753      const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1754      if (ci->_con == CatchProjNode::fall_through_index) {
1755        return 1;
1756      }
1757    }
1758    return 0;
1759  }
1760
1761  case Op_Jump:
1762  case Op_NeverBranch:
1763  case Op_TailCall:
1764  case Op_TailJump:
1765  case Op_Return:
1766  case Op_Halt:
1767  case Op_Rethrow:
1768    return 0;
1769
1770  default:
1771    ShouldNotReachHere();
1772  }
1773
1774  return 0;
1775}
1776
1777//------------------------------succ_fall_through-----------------------------
1778// Return true if a specific successor could be fall-through target.
1779bool Block::succ_fall_through(uint i) {
1780  int eidx = end_idx();
1781  Node *n = _nodes[eidx];  // Get ending Node
1782
1783  int op = n->Opcode();
1784  if (n->is_Mach()) {
1785    if (n->is_MachNullCheck()) {
1786      // In theory either side could fall through; for simplicity's
1787      // sake, assume only the false branch can for now.
1788      return _nodes[i + eidx + 1]->Opcode() == Op_IfFalse;
1789    }
1790    op = n->as_Mach()->ideal_Opcode();
1791  }
1792
1793  // Switch on branch type
1794  switch( op ) {
1795  case Op_CountedLoopEnd:
1796  case Op_If:
1797  case Op_Root:
1798  case Op_Goto:
1799    return true;
1800
1801  case Op_Catch: {
1802    const CatchProjNode *ci = _nodes[i + eidx + 1]->as_CatchProj();
1803    return ci->_con == CatchProjNode::fall_through_index;
1804  }
1805
1806  case Op_Jump:
1807  case Op_NeverBranch:
1808  case Op_TailCall:
1809  case Op_TailJump:
1810  case Op_Return:
1811  case Op_Halt:
1812  case Op_Rethrow:
1813    return false;
1814
1815  default:
1816    ShouldNotReachHere();
1817  }
1818
1819  return false;
1820}
1821
1822//------------------------------update_uncommon_branch------------------------
1823// Update the probability of a two-way branch so that the path to 'ub' is uncommon
1824void Block::update_uncommon_branch(Block* ub) {
1825  int eidx = end_idx();
1826  Node *n = _nodes[eidx];  // Get ending Node
1827
1828  int op = n->as_Mach()->ideal_Opcode();
1829
1830  assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If");
1831  assert(num_fall_throughs() == 2, "must be a two way branch block");
1832
1833  // Which successor is ub?
1834  uint s;
1835  for (s = 0; s <_num_succs; s++) {
1836    if (_succs[s] == ub) break;
1837  }
1838  assert(s < 2, "uncommon successor must be found");
1839
1840  // If ub is the true path, make the probability small; else
1841  // ub is the false path, so make the probability large.
1842  bool invert = (_nodes[s + eidx + 1]->Opcode() == Op_IfFalse);
1843
1844  // Get existing probability
1845  float p = n->as_MachIf()->_prob;
1846
1847  if (invert) p = 1.0 - p;
1848  if (p > PROB_MIN) {
1849    p = PROB_MIN;
1850  }
1851  if (invert) p = 1.0 - p;
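  // Worked example: suppose ub hangs off the false projection and the branch
  // currently has _prob == 0.5.  invert maps p to 0.5, the clamp lowers it to
  // PROB_MIN, and inverting back stores 1.0 - PROB_MIN, i.e. the false
  // (uncommon) path is now taken with probability PROB_MIN.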
1852
1853  n->as_MachIf()->_prob = p;
1854}
1855
1856//------------------------------update_succ_freq-------------------------------
1857// Update the appropriate frequency associated with block 'b', a successor of
1858// a block in this loop.
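// Example (illustrative): if a two-way branch in this loop has frequency 0.4
// and sends 90% of it back to the loop head and 10% out of the loop, the
// back-edge contribution is deferred to scale_freq() while the exit edge is
// recorded as a BlockProbPair with probability 0.4 * 0.1 == 0.04.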
1859void CFGLoop::update_succ_freq(Block* b, float freq) {
1860  if (b->_loop == this) {
1861    if (b == head()) {
1862      // back branch within the loop
1863      // Do nothing now; the loop-carried frequency will be
1864      // adjusted later in scale_freq().
1865    } else {
1866      // simple branch within the loop
1867      b->_freq += freq;
1868    }
1869  } else if (!in_loop_nest(b)) {
1870    // branch is exit from this loop
1871    BlockProbPair bpp(b, freq);
1872    _exits.append(bpp);
1873  } else {
1874    // branch into nested loop
1875    CFGLoop* ch = b->_loop;
1876    ch->_freq += freq;
1877  }
1878}
1879
1880//------------------------------in_loop_nest-----------------------------------
1881// Determine if block b is in the receiver's loop nest.
1882bool CFGLoop::in_loop_nest(Block* b) {
1883  int depth = _depth;
1884  CFGLoop* b_loop = b->_loop;
1885  int b_depth = b_loop->_depth;
1886  if (depth == b_depth) {
1887    return true;
1888  }
1889  while (b_depth > depth) {
1890    b_loop = b_loop->_parent;
1891    b_depth = b_loop->_depth;
1892  }
1893  return b_loop == this;
1894}
1895
1896//------------------------------scale_freq-------------------------------------
1897// Scale frequency of loops and blocks by trip counts from outer loops
1898// Do a top down traversal of loop tree (visit outer loops first.)
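// Illustrative numbers: a block entered once per iteration of an inner loop
// that trips ~10 times per entry, nested in an outer loop that trips ~5 times
// per method entry, ends up with an absolute frequency on the order of
// 1.0 * 10 * 5 == 50 once both enclosing loops have been scaled.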
1899void CFGLoop::scale_freq() {
1900  float loop_freq = _freq * trip_count();
1901  for (int i = 0; i < _members.length(); i++) {
1902    CFGElement* s = _members.at(i);
1903    float block_freq = s->_freq * loop_freq;
1904    if (block_freq < MIN_BLOCK_FREQUENCY) block_freq = MIN_BLOCK_FREQUENCY;
1905    s->_freq = block_freq;
1906  }
1907  CFGLoop* ch = _child;
1908  while (ch != NULL) {
1909    ch->scale_freq();
1910    ch = ch->_sibling;
1911  }
1912}
1913
1914#ifndef PRODUCT
1915//------------------------------dump_tree--------------------------------------
1916void CFGLoop::dump_tree() const {
1917  dump();
1918  if (_child != NULL)   _child->dump_tree();
1919  if (_sibling != NULL) _sibling->dump_tree();
1920}
1921
1922//------------------------------dump-------------------------------------------
1923void CFGLoop::dump() const {
1924  for (int i = 0; i < _depth; i++) tty->print("   ");
1925  tty->print("%s: %d  trip_count: %6.0f freq: %6.0f\n",
1926             _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
1927  for (int i = 0; i < _depth; i++) tty->print("   ");
1928  tty->print("         members:", _id);
1929  int k = 0;
1930  for (int i = 0; i < _members.length(); i++) {
1931    if (k++ >= 6) {
1932      tty->print("\n              ");
1933      for (int j = 0; j < _depth+1; j++) tty->print("   ");
1934      k = 0;
1935    }
1936    CFGElement *s = _members.at(i);
1937    if (s->is_block()) {
1938      Block *b = s->as_Block();
1939      tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
1940    } else {
1941      CFGLoop* lp = s->as_CFGLoop();
1942      tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
1943    }
1944  }
1945  tty->print("\n");
1946  for (int i = 0; i < _depth; i++) tty->print("   ");
1947  tty->print("         exits:  ");
1948  k = 0;
1949  for (int i = 0; i < _exits.length(); i++) {
1950    if (k++ >= 7) {
1951      tty->print("\n              ");
1952      for (int j = 0; j < _depth+1; j++) tty->print("   ");
1953      k = 0;
1954    }
1955    Block *blk = _exits.at(i).get_target();
1956    float prob = _exits.at(i).get_prob();
1957    tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
1958  }
1959  tty->print("\n");
1960}
1961#endif
1962