1/* Vectorizer Specific Loop Manipulations
2   Copyright (C) 2003-2020 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com>
4   and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "tree.h"
27#include "gimple.h"
28#include "cfghooks.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "fold-const.h"
32#include "cfganal.h"
33#include "gimplify.h"
34#include "gimple-iterator.h"
35#include "gimplify-me.h"
36#include "tree-cfg.h"
37#include "tree-ssa-loop-manip.h"
38#include "tree-into-ssa.h"
39#include "tree-ssa.h"
40#include "cfgloop.h"
41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
43#include "tree-ssa-loop-ivopts.h"
44#include "gimple-fold.h"
45#include "tree-ssa-loop-niter.h"
46#include "internal-fn.h"
47#include "stor-layout.h"
48#include "optabs-query.h"
49#include "vec-perm-indices.h"
50#include "insn-config.h"
51#include "rtl.h"
52#include "recog.h"
53
54/*************************************************************************
55  Simple Loop Peeling Utilities
56
57  Utilities to support loop peeling for vectorization purposes.
58 *************************************************************************/
59
60
61/* Renames the use *OP_P.  */
62
63static void
64rename_use_op (use_operand_p op_p)
65{
66  tree new_name;
67
68  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
69    return;
70
71  new_name = get_current_def (USE_FROM_PTR (op_p));
72
73  /* Something defined outside of the loop.  */
74  if (!new_name)
75    return;
76
77  /* An ordinary ssa name defined in the loop.  */
78
79  SET_USE (op_p, new_name);
80}
81
82
83/* Renames the variables in basic block BB.  Allow renaming  of PHI arguments
84   on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
85   true.  */
86
87static void
88rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
89{
90  gimple *stmt;
91  use_operand_p use_p;
92  ssa_op_iter iter;
93  edge e;
94  edge_iterator ei;
95  class loop *loop = bb->loop_father;
96  class loop *outer_loop = NULL;
97
98  if (rename_from_outer_loop)
99    {
100      gcc_assert (loop);
101      outer_loop = loop_outer (loop);
102    }
103
104  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
105       gsi_next (&gsi))
106    {
107      stmt = gsi_stmt (gsi);
108      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
109	rename_use_op (use_p);
110    }
111
112  FOR_EACH_EDGE (e, ei, bb->preds)
113    {
114      if (!flow_bb_inside_loop_p (loop, e->src))
115	{
116	  if (!rename_from_outer_loop)
117	    continue;
118	  if (e->src != outer_loop->header)
119	    {
120	      if (outer_loop->inner->next)
121		{
122		  /* If outer_loop has 2 inner loops, allow there to
123		     be an extra basic block which decides which of the
124		     two loops to use using LOOP_VECTORIZED.  */
125		  if (!single_pred_p (e->src)
126		      || single_pred (e->src) != outer_loop->header)
127		    continue;
128		}
129	    }
130	}
131      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
132	   gsi_next (&gsi))
133        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
134    }
135}
136
137
138struct adjust_info
139{
140  tree from, to;
141  basic_block bb;
142};
143
144/* A stack of values to be adjusted in debug stmts.  We have to
145   process them LIFO, so that the closest substitution applies.  If we
146   processed them FIFO, without the stack, we might substitute uses
147   with a PHI DEF that would soon become non-dominant, and when we got
148   to the suitable one, it wouldn't have anything to substitute any
149   more.  */
150static vec<adjust_info, va_heap> adjust_vec;
151
152/* Adjust any debug stmts that referenced AI->from values to use the
153   loop-closed AI->to, if the references are dominated by AI->bb and
154   not by the definition of AI->from.  */
155
156static void
157adjust_debug_stmts_now (adjust_info *ai)
158{
159  basic_block bbphi = ai->bb;
160  tree orig_def = ai->from;
161  tree new_def = ai->to;
162  imm_use_iterator imm_iter;
163  gimple *stmt;
164  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
165
166  gcc_assert (dom_info_available_p (CDI_DOMINATORS));
167
168  /* Adjust any debug stmts that held onto non-loop-closed
169     references.  */
170  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
171    {
172      use_operand_p use_p;
173      basic_block bbuse;
174
175      if (!is_gimple_debug (stmt))
176	continue;
177
178      gcc_assert (gimple_debug_bind_p (stmt));
179
180      bbuse = gimple_bb (stmt);
181
182      if ((bbuse == bbphi
183	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
184	  && !(bbuse == bbdef
185	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
186	{
187	  if (new_def)
188	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
189	      SET_USE (use_p, new_def);
190	  else
191	    {
192	      gimple_debug_bind_reset_value (stmt);
193	      update_stmt (stmt);
194	    }
195	}
196    }
197}
198
199/* Adjust debug stmts as scheduled before.  */
200
201static void
202adjust_vec_debug_stmts (void)
203{
204  if (!MAY_HAVE_DEBUG_BIND_STMTS)
205    return;
206
207  gcc_assert (adjust_vec.exists ());
208
209  while (!adjust_vec.is_empty ())
210    {
211      adjust_debug_stmts_now (&adjust_vec.last ());
212      adjust_vec.pop ();
213    }
214}
215
216/* Adjust any debug stmts that referenced FROM values to use the
217   loop-closed TO, if the references are dominated by BB and not by
218   the definition of FROM.  If adjust_vec is non-NULL, adjustments
219   will be postponed until adjust_vec_debug_stmts is called.  */
220
221static void
222adjust_debug_stmts (tree from, tree to, basic_block bb)
223{
224  adjust_info ai;
225
226  if (MAY_HAVE_DEBUG_BIND_STMTS
227      && TREE_CODE (from) == SSA_NAME
228      && ! SSA_NAME_IS_DEFAULT_DEF (from)
229      && ! virtual_operand_p (from))
230    {
231      ai.from = from;
232      ai.to = to;
233      ai.bb = bb;
234
235      if (adjust_vec.exists ())
236	adjust_vec.safe_push (ai);
237      else
238	adjust_debug_stmts_now (&ai);
239    }
240}
241
242/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
243   to adjust any debug stmts that referenced the old phi arg,
244   presumably non-loop-closed references left over from other
245   transformations.  */
246
247static void
248adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
249{
250  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
251
252  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
253
254  if (MAY_HAVE_DEBUG_BIND_STMTS)
255    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
256			gimple_bb (update_phi));
257}
258
259/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
260   the mask should have during the first iteration and NEXT_MASK is the
261   value that it should have on subsequent iterations.  */
262
263static void
264vect_set_loop_mask (class loop *loop, tree mask, tree init_mask,
265		    tree next_mask)
266{
267  gphi *phi = create_phi_node (mask, loop->header);
268  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
269  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
270}
271
272/* Add SEQ to the end of LOOP's preheader block.  */
273
274static void
275add_preheader_seq (class loop *loop, gimple_seq seq)
276{
277  if (seq)
278    {
279      edge pe = loop_preheader_edge (loop);
280      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
281      gcc_assert (!new_bb);
282    }
283}
284
285/* Add SEQ to the beginning of LOOP's header block.  */
286
287static void
288add_header_seq (class loop *loop, gimple_seq seq)
289{
290  if (seq)
291    {
292      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
293      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
294    }
295}
296
297/* Return true if the target can interleave elements of two vectors.
298   OFFSET is 0 if the first half of the vectors should be interleaved
299   or 1 if the second half should.  When returning true, store the
300   associated permutation in INDICES.  */
301
302static bool
303interleave_supported_p (vec_perm_indices *indices, tree vectype,
304			unsigned int offset)
305{
306  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
307  poly_uint64 base = exact_div (nelts, 2) * offset;
308  vec_perm_builder sel (nelts, 2, 3);
309  for (unsigned int i = 0; i < 3; ++i)
310    {
311      sel.quick_push (base + i);
312      sel.quick_push (base + i + nelts);
313    }
314  indices->new_vector (sel, 2, nelts);
315  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
316}
317
318/* Try to use permutes to define the masks in DEST_RGM using the masks
319   in SRC_RGM, given that the former has twice as many masks as the
320   latter.  Return true on success, adding any new statements to SEQ.  */
321
322static bool
323vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
324			       rgroup_masks *src_rgm)
325{
326  tree src_masktype = src_rgm->mask_type;
327  tree dest_masktype = dest_rgm->mask_type;
328  machine_mode src_mode = TYPE_MODE (src_masktype);
329  insn_code icode1, icode2;
330  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
331      && (icode1 = optab_handler (vec_unpacku_hi_optab,
332				  src_mode)) != CODE_FOR_nothing
333      && (icode2 = optab_handler (vec_unpacku_lo_optab,
334				  src_mode)) != CODE_FOR_nothing)
335    {
336      /* Unpacking the source masks gives at least as many mask bits as
337	 we need.  We can then VIEW_CONVERT any excess bits away.  */
338      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
339      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
340      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
341      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
342	{
343	  tree src = src_rgm->masks[i / 2];
344	  tree dest = dest_rgm->masks[i];
345	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
346			    ? VEC_UNPACK_HI_EXPR
347			    : VEC_UNPACK_LO_EXPR);
348	  gassign *stmt;
349	  if (dest_masktype == unpack_masktype)
350	    stmt = gimple_build_assign (dest, code, src);
351	  else
352	    {
353	      tree temp = make_ssa_name (unpack_masktype);
354	      stmt = gimple_build_assign (temp, code, src);
355	      gimple_seq_add_stmt (seq, stmt);
356	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
357					  build1 (VIEW_CONVERT_EXPR,
358						  dest_masktype, temp));
359	    }
360	  gimple_seq_add_stmt (seq, stmt);
361	}
362      return true;
363    }
364  vec_perm_indices indices[2];
365  if (dest_masktype == src_masktype
366      && interleave_supported_p (&indices[0], src_masktype, 0)
367      && interleave_supported_p (&indices[1], src_masktype, 1))
368    {
369      /* The destination requires twice as many mask bits as the source, so
370	 we can use interleaving permutes to double up the number of bits.  */
371      tree masks[2];
372      for (unsigned int i = 0; i < 2; ++i)
373	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
374      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
375	{
376	  tree src = src_rgm->masks[i / 2];
377	  tree dest = dest_rgm->masks[i];
378	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
379					      src, src, masks[i & 1]);
380	  gimple_seq_add_stmt (seq, stmt);
381	}
382      return true;
383    }
384  return false;
385}
386
387/* Helper for vect_set_loop_condition_masked.  Generate definitions for
388   all the masks in RGM and return a mask that is nonzero when the loop
389   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
390   Use LOOP_COND_GSI to insert code before the exit gcond.
391
392   RGM belongs to loop LOOP.  The loop originally iterated NITERS
393   times and has been vectorized according to LOOP_VINFO.
394
395   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
396   starts with NITERS_SKIP dummy iterations of the scalar loop before
397   the real work starts.  The mask elements for these dummy iterations
398   must be 0, to ensure that the extra iterations do not have an effect.
399
400   It is known that:
401
402     NITERS * RGM->max_nscalars_per_iter
403
404   does not overflow.  However, MIGHT_WRAP_P says whether an induction
405   variable that starts at 0 and has step:
406
407     VF * RGM->max_nscalars_per_iter
408
409   might overflow before hitting a value above:
410
411     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
412
413   This means that we cannot guarantee that such an induction variable
414   would ever hit a value that produces a set of all-false masks for RGM.  */
415
416static tree
417vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo,
418			      gimple_seq *preheader_seq,
419			      gimple_stmt_iterator loop_cond_gsi,
420			      rgroup_masks *rgm, tree niters, tree niters_skip,
421			      bool might_wrap_p)
422{
423  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
424  tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
425  tree mask_type = rgm->mask_type;
426  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
427  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
428  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
429
430  /* Calculate the maximum number of scalar values that the rgroup
431     handles in total, the number that it handles for each iteration
432     of the vector loop, and the number that it should skip during the
433     first iteration of the vector loop.  */
434  tree nscalars_total = niters;
435  tree nscalars_step = build_int_cst (iv_type, vf);
436  tree nscalars_skip = niters_skip;
437  if (nscalars_per_iter != 1)
438    {
439      /* We checked before choosing to use a fully-masked loop that these
440	 multiplications don't overflow.  */
441      tree compare_factor = build_int_cst (compare_type, nscalars_per_iter);
442      tree iv_factor = build_int_cst (iv_type, nscalars_per_iter);
443      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
444				     nscalars_total, compare_factor);
445      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type,
446				    nscalars_step, iv_factor);
447      if (nscalars_skip)
448	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
449				      nscalars_skip, compare_factor);
450    }
451
452  /* Create an induction variable that counts the number of scalars
453     processed.  */
454  tree index_before_incr, index_after_incr;
455  gimple_stmt_iterator incr_gsi;
456  bool insert_after;
457  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
458  create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop,
459	     &incr_gsi, insert_after, &index_before_incr, &index_after_incr);
460
461  tree zero_index = build_int_cst (compare_type, 0);
462  tree test_index, test_limit, first_limit;
463  gimple_stmt_iterator *test_gsi;
464  if (might_wrap_p)
465    {
466      /* In principle the loop should stop iterating once the incremented
467	 IV reaches a value greater than or equal to:
468
469	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
470
471	 However, there's no guarantee that this addition doesn't overflow
472	 the comparison type, or that the IV hits a value above it before
473	 wrapping around.  We therefore adjust the limit down by one
474	 IV step:
475
476	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
477	   -[infinite-prec] NSCALARS_STEP
478
479	 and compare the IV against this limit _before_ incrementing it.
480	 Since the comparison type is unsigned, we actually want the
481	 subtraction to saturate at zero:
482
483	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
484	   -[sat] NSCALARS_STEP
485
486	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
487
488	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
489
490	 where the rightmost subtraction can be done directly in
491	 COMPARE_TYPE.  */
492      test_index = index_before_incr;
493      tree adjust = gimple_convert (preheader_seq, compare_type,
494				    nscalars_step);
495      if (nscalars_skip)
496	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497			       adjust, nscalars_skip);
498      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
499				 nscalars_total, adjust);
500      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
501				 test_limit, adjust);
502      test_gsi = &incr_gsi;
503
504      /* Get a safe limit for the first iteration.  */
505      if (nscalars_skip)
506	{
507	  /* The first vector iteration can handle at most NSCALARS_STEP
508	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
509	     NSCALARS_SKIP to that cannot overflow.  */
510	  tree const_limit = build_int_cst (compare_type,
511					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
512					    * nscalars_per_iter);
513	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
514				      nscalars_total, const_limit);
515	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
516				      first_limit, nscalars_skip);
517	}
518      else
519	/* For the first iteration it doesn't matter whether the IV hits
520	   a value above NSCALARS_TOTAL.  That only matters for the latch
521	   condition.  */
522	first_limit = nscalars_total;
523    }
524  else
525    {
526      /* Test the incremented IV, which will always hit a value above
527	 the bound before wrapping.  */
528      test_index = index_after_incr;
529      test_limit = nscalars_total;
530      if (nscalars_skip)
531	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
532				   test_limit, nscalars_skip);
533      test_gsi = &loop_cond_gsi;
534
535      first_limit = test_limit;
536    }
537
538  /* Convert the IV value to the comparison type (either a no-op or
539     a demotion).  */
540  gimple_seq test_seq = NULL;
541  test_index = gimple_convert (&test_seq, compare_type, test_index);
542  gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
543
544  /* Provide a definition of each mask in the group.  */
545  tree next_mask = NULL_TREE;
546  tree mask;
547  unsigned int i;
548  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
549    {
550      /* Previous masks will cover BIAS scalars.  This mask covers the
551	 next batch.  */
552      poly_uint64 bias = nscalars_per_mask * i;
553      tree bias_tree = build_int_cst (compare_type, bias);
554      gimple *tmp_stmt;
555
556      /* See whether the first iteration of the vector loop is known
557	 to have a full mask.  */
558      poly_uint64 const_limit;
559      bool first_iteration_full
560	= (poly_int_tree_p (first_limit, &const_limit)
561	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));
562
563      /* Rather than have a new IV that starts at BIAS and goes up to
564	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
565	 and adjust the bound down by BIAS.  */
566      tree this_test_limit = test_limit;
567      if (i != 0)
568	{
569	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
570					  compare_type, this_test_limit,
571					  bias_tree);
572	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
573					  compare_type, this_test_limit,
574					  bias_tree);
575	}
576
577      /* Create the initial mask.  First include all scalars that
578	 are within the loop limit.  */
579      tree init_mask = NULL_TREE;
580      if (!first_iteration_full)
581	{
582	  tree start, end;
583	  if (first_limit == test_limit)
584	    {
585	      /* Use a natural test between zero (the initial IV value)
586		 and the loop limit.  The "else" block would be valid too,
587		 but this choice can avoid the need to load BIAS_TREE into
588		 a register.  */
589	      start = zero_index;
590	      end = this_test_limit;
591	    }
592	  else
593	    {
594	      /* FIRST_LIMIT is the maximum number of scalars handled by the
595		 first iteration of the vector loop.  Test the portion
596		 associated with this mask.  */
597	      start = bias_tree;
598	      end = first_limit;
599	    }
600
601	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
602	  tmp_stmt = vect_gen_while (init_mask, start, end);
603	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
604	}
605
606      /* Now AND out the bits that are within the number of skipped
607	 scalars.  */
608      poly_uint64 const_skip;
609      if (nscalars_skip
610	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
611	       && known_le (const_skip, bias)))
612	{
613	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
614						    bias_tree, nscalars_skip);
615	  if (init_mask)
616	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
617				      init_mask, unskipped_mask);
618	  else
619	    init_mask = unskipped_mask;
620	}
621
622      if (!init_mask)
623	/* First iteration is full.  */
624	init_mask = build_minus_one_cst (mask_type);
625
626      /* Get the mask value for the next iteration of the loop.  */
627      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
628      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
629      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
630
631      vect_set_loop_mask (loop, mask, init_mask, next_mask);
632    }
633  return next_mask;
634}
635
636/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
637   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
638   number of iterations of the original scalar loop that should be
639   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
640   as for vect_set_loop_condition.
641
642   Insert the branch-back condition before LOOP_COND_GSI and return the
643   final gcond.  */
644
645static gcond *
646vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo,
647				tree niters, tree final_iv,
648				bool niters_maybe_zero,
649				gimple_stmt_iterator loop_cond_gsi)
650{
651  gimple_seq preheader_seq = NULL;
652  gimple_seq header_seq = NULL;
653
654  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
655  unsigned int compare_precision = TYPE_PRECISION (compare_type);
656  tree orig_niters = niters;
657
658  /* Type of the initial value of NITERS.  */
659  tree ni_actual_type = TREE_TYPE (niters);
660  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
661  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
662
663  /* Convert NITERS to the same size as the compare.  */
664  if (compare_precision > ni_actual_precision
665      && niters_maybe_zero)
666    {
667      /* We know that there is always at least one iteration, so if the
668	 count is zero then it must have wrapped.  Cope with this by
669	 subtracting 1 before the conversion and adding 1 to the result.  */
670      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
671      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
672			     niters, build_minus_one_cst (ni_actual_type));
673      niters = gimple_convert (&preheader_seq, compare_type, niters);
674      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
675			     niters, build_one_cst (compare_type));
676    }
677  else
678    niters = gimple_convert (&preheader_seq, compare_type, niters);
679
680  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
681
682  /* Iterate over all the rgroups and fill in their masks.  We could use
683     the first mask from any rgroup for the loop condition; here we
684     arbitrarily pick the last.  */
685  tree test_mask = NULL_TREE;
686  rgroup_masks *rgm;
687  unsigned int i;
688  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
689  FOR_EACH_VEC_ELT (*masks, i, rgm)
690    if (!rgm->masks.is_empty ())
691      {
692	/* First try using permutes.  This adds a single vector
693	   instruction to the loop for each mask, but needs no extra
694	   loop invariants or IVs.  */
695	unsigned int nmasks = i + 1;
696	if ((nmasks & 1) == 0)
697	  {
698	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
699	    if (!half_rgm->masks.is_empty ()
700		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
701	      continue;
702	  }
703
704	/* See whether zero-based IV would ever generate all-false masks
705	   before wrapping around.  */
706	bool might_wrap_p
707	  = (iv_limit == -1
708	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
709				    UNSIGNED)
710		 > compare_precision));
711
712	/* Set up all masks for this group.  */
713	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
714						  &preheader_seq,
715						  loop_cond_gsi, rgm,
716						  niters, niters_skip,
717						  might_wrap_p);
718      }
719
720  /* Emit all accumulated statements.  */
721  add_preheader_seq (loop, preheader_seq);
722  add_header_seq (loop, header_seq);
723
724  /* Get a boolean result that tells us whether to iterate.  */
725  edge exit_edge = single_exit (loop);
726  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
727  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
728  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
729					NULL_TREE, NULL_TREE);
730  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
731
732  /* The loop iterates (NITERS - 1) / VF + 1 times.
733     Subtract one from this to get the latch count.  */
734  tree step = build_int_cst (compare_type,
735			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
736  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
737				       build_minus_one_cst (compare_type));
738  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
739				     niters_minus_one, step);
740
741  if (final_iv)
742    {
743      gassign *assign = gimple_build_assign (final_iv, orig_niters);
744      gsi_insert_on_edge_immediate (single_exit (loop), assign);
745    }
746
747  return cond_stmt;
748}
749
750/* Like vect_set_loop_condition, but handle the case in which there
751   are no loop masks.  */
752
753static gcond *
754vect_set_loop_condition_unmasked (class loop *loop, tree niters,
755				  tree step, tree final_iv,
756				  bool niters_maybe_zero,
757				  gimple_stmt_iterator loop_cond_gsi)
758{
759  tree indx_before_incr, indx_after_incr;
760  gcond *cond_stmt;
761  gcond *orig_cond;
762  edge pe = loop_preheader_edge (loop);
763  edge exit_edge = single_exit (loop);
764  gimple_stmt_iterator incr_gsi;
765  bool insert_after;
766  enum tree_code code;
767  tree niters_type = TREE_TYPE (niters);
768
769  orig_cond = get_loop_exit_condition (loop);
770  gcc_assert (orig_cond);
771  loop_cond_gsi = gsi_for_stmt (orig_cond);
772
773  tree init, limit;
774  if (!niters_maybe_zero && integer_onep (step))
775    {
776      /* In this case we can use a simple 0-based IV:
777
778	 A:
779	   x = 0;
780	   do
781	     {
782	       ...
783	       x += 1;
784	     }
785	   while (x < NITERS);  */
786      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
787      init = build_zero_cst (niters_type);
788      limit = niters;
789    }
790  else
791    {
792      /* The following works for all values of NITERS except 0:
793
794	 B:
795	   x = 0;
796	   do
797	     {
798	       ...
799	       x += STEP;
800	     }
801	   while (x <= NITERS - STEP);
802
803	 so that the loop continues to iterate if x + STEP - 1 < NITERS
804	 but stops if x + STEP - 1 >= NITERS.
805
806	 However, if NITERS is zero, x never hits a value above NITERS - STEP
807	 before wrapping around.  There are two obvious ways of dealing with
808	 this:
809
810	 - start at STEP - 1 and compare x before incrementing it
811	 - start at -1 and compare x after incrementing it
812
813	 The latter is simpler and is what we use.  The loop in this case
814	 looks like:
815
816	 C:
817	   x = -1;
818	   do
819	     {
820	       ...
821	       x += STEP;
822	     }
823	   while (x < NITERS - STEP);
824
825	 In both cases the loop limit is NITERS - STEP.  */
826      gimple_seq seq = NULL;
827      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
828      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
829      if (seq)
830	{
831	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
832	  gcc_assert (!new_bb);
833	}
834      if (niters_maybe_zero)
835	{
836	  /* Case C.  */
837	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
838	  init = build_all_ones_cst (niters_type);
839	}
840      else
841	{
842	  /* Case B.  */
843	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
844	  init = build_zero_cst (niters_type);
845	}
846    }
847
848  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
849  create_iv (init, step, NULL_TREE, loop,
850             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
851  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
852					      true, NULL_TREE, true,
853					      GSI_SAME_STMT);
854  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
855				     true, GSI_SAME_STMT);
856
857  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
858				 NULL_TREE);
859
860  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
861
862  /* Record the number of latch iterations.  */
863  if (limit == niters)
864    /* Case A: the loop iterates NITERS times.  Subtract one to get the
865       latch count.  */
866    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
867				       build_int_cst (niters_type, 1));
868  else
869    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
870       Subtract one from this to get the latch count.  */
871    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
872				       limit, step);
873
874  if (final_iv)
875    {
876      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
877					     indx_after_incr, init);
878      gsi_insert_on_edge_immediate (single_exit (loop), assign);
879    }
880
881  return cond_stmt;
882}
883
884/* If we're using fully-masked loops, make LOOP iterate:
885
886      N == (NITERS - 1) / STEP + 1
887
888   times.  When NITERS is zero, this is equivalent to making the loop
889   execute (1 << M) / STEP times, where M is the precision of NITERS.
890   NITERS_MAYBE_ZERO is true if this last case might occur.
891
892   If we're not using fully-masked loops, make LOOP iterate:
893
894      N == (NITERS - STEP) / STEP + 1
895
896   times, where NITERS is known to be outside the range [1, STEP - 1].
897   This is equivalent to making the loop execute NITERS / STEP times
898   when NITERS is nonzero and (1 << M) / STEP times otherwise.
899   NITERS_MAYBE_ZERO again indicates whether this last case might occur.
900
901   If FINAL_IV is nonnull, it is an SSA name that should be set to
902   N * STEP on exit from the loop.
903
904   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
905
906void
907vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo,
908			 tree niters, tree step, tree final_iv,
909			 bool niters_maybe_zero)
910{
911  gcond *cond_stmt;
912  gcond *orig_cond = get_loop_exit_condition (loop);
913  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
914
915  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
916    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
917						final_iv, niters_maybe_zero,
918						loop_cond_gsi);
919  else
920    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
921						  final_iv, niters_maybe_zero,
922						  loop_cond_gsi);
923
924  /* Remove old loop exit test.  */
925  stmt_vec_info orig_cond_info;
926  if (loop_vinfo
927      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
928    loop_vinfo->remove_stmt (orig_cond_info);
929  else
930    gsi_remove (&loop_cond_gsi, true);
931
932  if (dump_enabled_p ())
933    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
934		     cond_stmt);
935}
936
937/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
938   For all PHI arguments in FROM->dest and TO->dest from those
939   edges ensure that TO->dest PHI arguments have current_def
940   to that in from.  */
941
942static void
943slpeel_duplicate_current_defs_from_edges (edge from, edge to)
944{
945  gimple_stmt_iterator gsi_from, gsi_to;
946
947  for (gsi_from = gsi_start_phis (from->dest),
948       gsi_to = gsi_start_phis (to->dest);
949       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
950    {
951      gimple *from_phi = gsi_stmt (gsi_from);
952      gimple *to_phi = gsi_stmt (gsi_to);
953      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
954      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
955      if (virtual_operand_p (from_arg))
956	{
957	  gsi_next (&gsi_from);
958	  continue;
959	}
960      if (virtual_operand_p (to_arg))
961	{
962	  gsi_next (&gsi_to);
963	  continue;
964	}
965      if (TREE_CODE (from_arg) != SSA_NAME)
966	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
967      else if (TREE_CODE (to_arg) == SSA_NAME
968	       && from_arg != to_arg)
969	{
970	  if (get_current_def (to_arg) == NULL_TREE)
971	    {
972	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
973					      TREE_TYPE (get_current_def
974							   (from_arg))));
975	      set_current_def (to_arg, get_current_def (from_arg));
976	    }
977	}
978      gsi_next (&gsi_from);
979      gsi_next (&gsi_to);
980    }
981
982  gphi *from_phi = get_virtual_phi (from->dest);
983  gphi *to_phi = get_virtual_phi (to->dest);
984  if (from_phi)
985    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
986		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
987}
988
989
990/* Given LOOP this function generates a new copy of it and puts it
991   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
992   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
993   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
994   entry or exit of LOOP.  */
995
996class loop *
997slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop,
998					class loop *scalar_loop, edge e)
999{
1000  class loop *new_loop;
1001  basic_block *new_bbs, *bbs, *pbbs;
1002  bool at_exit;
1003  bool was_imm_dom;
1004  basic_block exit_dest;
1005  edge exit, new_exit;
1006  bool duplicate_outer_loop = false;
1007
1008  exit = single_exit (loop);
1009  at_exit = (e == exit);
1010  if (!at_exit && e != loop_preheader_edge (loop))
1011    return NULL;
1012
1013  if (scalar_loop == NULL)
1014    scalar_loop = loop;
1015
1016  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1017  pbbs = bbs + 1;
1018  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1019  /* Allow duplication of outer loops.  */
1020  if (scalar_loop->inner)
1021    duplicate_outer_loop = true;
1022  /* Check whether duplication is possible.  */
1023  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1024    {
1025      free (bbs);
1026      return NULL;
1027    }
1028
1029  /* Generate new loop structure.  */
1030  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1031  duplicate_subloops (scalar_loop, new_loop);
1032
1033  exit_dest = exit->dest;
1034  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1035					  exit_dest) == loop->header ?
1036		 true : false);
1037
1038  /* Also copy the pre-header, this avoids jumping through hoops to
1039     duplicate the loop entry PHI arguments.  Create an empty
1040     pre-header unconditionally for this.  */
1041  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1042  edge entry_e = single_pred_edge (preheader);
1043  bbs[0] = preheader;
1044  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1045
1046  exit = single_exit (scalar_loop);
1047  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1048	    &exit, 1, &new_exit, NULL,
1049	    at_exit ? loop->latch : e->src, true);
1050  exit = single_exit (loop);
1051  basic_block new_preheader = new_bbs[0];
1052
1053  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1054
1055  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1056  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1057    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1058
1059  if (scalar_loop != loop)
1060    {
1061      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1062	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1063	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1064	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1065	 header) to have current_def set, so copy them over.  */
1066      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1067						exit);
1068      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1069							   0),
1070						EDGE_SUCC (loop->latch, 0));
1071    }
1072
1073  if (at_exit) /* Add the loop copy at exit.  */
1074    {
1075      if (scalar_loop != loop)
1076	{
1077	  gphi_iterator gsi;
1078	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1079
1080	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1081	       gsi_next (&gsi))
1082	    {
1083	      gphi *phi = gsi.phi ();
1084	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1085	      location_t orig_locus
1086		= gimple_phi_arg_location_from_edge (phi, e);
1087
1088	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1089	    }
1090	}
1091      redirect_edge_and_branch_force (e, new_preheader);
1092      flush_pending_stmts (e);
1093      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1094      if (was_imm_dom || duplicate_outer_loop)
1095	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1096
1097      /* And remove the non-necessary forwarder again.  Keep the other
1098         one so we have a proper pre-header for the loop at the exit edge.  */
1099      redirect_edge_pred (single_succ_edge (preheader),
1100			  single_pred (preheader));
1101      delete_basic_block (preheader);
1102      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1103			       loop_preheader_edge (scalar_loop)->src);
1104    }
1105  else /* Add the copy at entry.  */
1106    {
1107      if (scalar_loop != loop)
1108	{
1109	  /* Remove the non-necessary forwarder of scalar_loop again.  */
1110	  redirect_edge_pred (single_succ_edge (preheader),
1111			      single_pred (preheader));
1112	  delete_basic_block (preheader);
1113	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1114				   loop_preheader_edge (scalar_loop)->src);
1115	  preheader = split_edge (loop_preheader_edge (loop));
1116	  entry_e = single_pred_edge (preheader);
1117	}
1118
1119      redirect_edge_and_branch_force (entry_e, new_preheader);
1120      flush_pending_stmts (entry_e);
1121      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1122
1123      redirect_edge_and_branch_force (new_exit, preheader);
1124      flush_pending_stmts (new_exit);
1125      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1126
1127      /* And remove the non-necessary forwarder again.  Keep the other
1128         one so we have a proper pre-header for the loop at the exit edge.  */
1129      redirect_edge_pred (single_succ_edge (new_preheader),
1130			  single_pred (new_preheader));
1131      delete_basic_block (new_preheader);
1132      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1133			       loop_preheader_edge (new_loop)->src);
1134    }
1135
1136  if (scalar_loop != loop)
1137    {
1138      /* Update new_loop->header PHIs, so that on the preheader
1139	 edge they are the ones from loop rather than scalar_loop.  */
1140      gphi_iterator gsi_orig, gsi_new;
1141      edge orig_e = loop_preheader_edge (loop);
1142      edge new_e = loop_preheader_edge (new_loop);
1143
1144      for (gsi_orig = gsi_start_phis (loop->header),
1145	   gsi_new = gsi_start_phis (new_loop->header);
1146	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1147	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1148	{
1149	  gphi *orig_phi = gsi_orig.phi ();
1150	  gphi *new_phi = gsi_new.phi ();
1151	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1152	  location_t orig_locus
1153	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1154
1155	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1156	}
1157    }
1158
1159  free (new_bbs);
1160  free (bbs);
1161
1162  checking_verify_dominators (CDI_DOMINATORS);
1163
1164  return new_loop;
1165}
1166
1167
1168/* Given the condition expression COND, put it as the last statement of
1169   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1170   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1171   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1172   new edge as irreducible if IRREDUCIBLE_P is true.  */
1173
1174static edge
1175slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1176		       basic_block guard_to, basic_block dom_bb,
1177		       profile_probability probability, bool irreducible_p)
1178{
1179  gimple_stmt_iterator gsi;
1180  edge new_e, enter_e;
1181  gcond *cond_stmt;
1182  gimple_seq gimplify_stmt_list = NULL;
1183
1184  enter_e = EDGE_SUCC (guard_bb, 0);
1185  enter_e->flags &= ~EDGE_FALLTHRU;
1186  enter_e->flags |= EDGE_FALSE_VALUE;
1187  gsi = gsi_last_bb (guard_bb);
1188
1189  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1190				 NULL_TREE);
1191  if (gimplify_stmt_list)
1192    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1193
1194  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1195  gsi = gsi_last_bb (guard_bb);
1196  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1197
1198  /* Add new edge to connect guard block to the merge/loop-exit block.  */
1199  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1200
1201  new_e->probability = probability;
1202  if (irreducible_p)
1203    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1204
1205  enter_e->probability = probability.invert ();
1206  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1207
1208  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1209  if (enter_e->dest->loop_father->header == enter_e->dest)
1210    split_edge (enter_e);
1211
1212  return new_e;
1213}
1214
1215
1216/* This function verifies that the following restrictions apply to LOOP:
1217   (1) it consists of exactly 2 basic blocks - header, and an empty latch
1218       for innermost loop and 5 basic blocks for outer-loop.
1219   (2) it is single entry, single exit
1220   (3) its exit condition is the last stmt in the header
1221   (4) E is the entry/exit edge of LOOP.
1222 */
1223
1224bool
1225slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e)
1226{
1227  edge exit_e = single_exit (loop);
1228  edge entry_e = loop_preheader_edge (loop);
1229  gcond *orig_cond = get_loop_exit_condition (loop);
1230  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1231  unsigned int num_bb = loop->inner? 5 : 2;
1232
1233  /* All loops have an outer scope; the only case loop->outer is NULL is for
1234     the function itself.  */
1235  if (!loop_outer (loop)
1236      || loop->num_nodes != num_bb
1237      || !empty_block_p (loop->latch)
1238      || !single_exit (loop)
1239      /* Verify that new loop exit condition can be trivially modified.  */
1240      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1241      || (e != exit_e && e != entry_e))
1242    return false;
1243
1244  return true;
1245}
1246
1247/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1248   in the exit bb and rename all the uses after the loop.  This simplifies
1249   the *guard[12] routines, which assume loop closed SSA form for all PHIs
1250   (but normally loop closed SSA form doesn't require virtual PHIs to be
1251   in the same form).  Doing this early simplifies the checking what
1252   uses should be renamed.
1253
1254   If we create a new phi after the loop, return the definition that
1255   applies on entry to the loop, otherwise return null.  */
1256
1257static tree
1258create_lcssa_for_virtual_phi (class loop *loop)
1259{
1260  gphi_iterator gsi;
1261  edge exit_e = single_exit (loop);
1262
1263  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1264    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1265      {
1266	gphi *phi = gsi.phi ();
1267	for (gsi = gsi_start_phis (exit_e->dest);
1268	     !gsi_end_p (gsi); gsi_next (&gsi))
1269	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1270	    break;
1271	if (gsi_end_p (gsi))
1272	  {
1273	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1274	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1275	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1276	    imm_use_iterator imm_iter;
1277	    gimple *stmt;
1278	    use_operand_p use_p;
1279
1280	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1281	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1282	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1283	    gimple_phi_set_result (new_phi, new_vop);
1284	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1285	      if (stmt != new_phi
1286		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1287		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1288		  SET_USE (use_p, new_vop);
1289
1290	    return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1291	  }
1292	break;
1293      }
1294  return NULL_TREE;
1295}
1296
1297/* Function vect_get_loop_location.
1298
1299   Extract the location of the loop in the source code.
1300   If the loop is not well formed for vectorization, an estimated
1301   location is calculated.
1302   Return the loop location if succeed and NULL if not.  */
1303
1304dump_user_location_t
1305find_loop_location (class loop *loop)
1306{
1307  gimple *stmt = NULL;
1308  basic_block bb;
1309  gimple_stmt_iterator si;
1310
1311  if (!loop)
1312    return dump_user_location_t ();
1313
1314  stmt = get_loop_exit_condition (loop);
1315
1316  if (stmt
1317      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318    return stmt;
1319
1320  /* If we got here the loop is probably not "well formed",
1321     try to estimate the loop location */
1322
1323  if (!loop->header)
1324    return dump_user_location_t ();
1325
1326  bb = loop->header;
1327
1328  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329    {
1330      stmt = gsi_stmt (si);
1331      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332        return stmt;
1333    }
1334
1335  return dump_user_location_t ();
1336}
1337
1338/* Return true if the phi described by STMT_INFO defines an IV of the
1339   loop to be vectorized.  */
1340
1341static bool
1342iv_phi_p (stmt_vec_info stmt_info)
1343{
1344  gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345  if (virtual_operand_p (PHI_RESULT (phi)))
1346    return false;
1347
1348  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350    return false;
1351
1352  return true;
1353}
1354
1355/* Function vect_can_advance_ivs_p
1356
1357   In case the number of iterations that LOOP iterates is unknown at compile
1358   time, an epilog loop will be generated, and the loop induction variables
1359   (IVs) will be "advanced" to the value they are supposed to take just before
1360   the epilog loop.  Here we check that the access function of the loop IVs
1361   and the expression that represents the loop bound are simple enough.
1362   These restrictions will be relaxed in the future.  */
1363
1364bool
1365vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366{
1367  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368  basic_block bb = loop->header;
1369  gphi_iterator gsi;
1370
1371  /* Analyze phi functions of the loop header.  */
1372
1373  if (dump_enabled_p ())
1374    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376    {
1377      tree evolution_part;
1378
1379      gphi *phi = gsi.phi ();
1380      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381      if (dump_enabled_p ())
1382	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1383			 phi_info->stmt);
1384
1385      /* Skip virtual phi's. The data dependences that are associated with
1386	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1387
1388	 Skip reduction phis.  */
1389      if (!iv_phi_p (phi_info))
1390	{
1391	  if (dump_enabled_p ())
1392	    dump_printf_loc (MSG_NOTE, vect_location,
1393			     "reduc or virtual phi. skip.\n");
1394	  continue;
1395	}
1396
1397      /* Analyze the evolution function.  */
1398
1399      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1400      if (evolution_part == NULL_TREE)
1401        {
1402	  if (dump_enabled_p ())
1403	    dump_printf (MSG_MISSED_OPTIMIZATION,
1404			 "No access function or evolution.\n");
1405	  return false;
1406        }
1407
1408      /* FORNOW: We do not transform initial conditions of IVs
1409	 which evolution functions are not invariants in the loop.  */
1410
1411      if (!expr_invariant_in_loop_p (loop, evolution_part))
1412	{
1413	  if (dump_enabled_p ())
1414	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415			     "evolution not invariant in loop.\n");
1416	  return false;
1417	}
1418
1419      /* FORNOW: We do not transform initial conditions of IVs
1420	 which evolution functions are a polynomial of degree >= 2.  */
1421
1422      if (tree_is_chrec (evolution_part))
1423	{
1424	  if (dump_enabled_p ())
1425	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1426			     "evolution is chrec.\n");
1427	  return false;
1428	}
1429    }
1430
1431  return true;
1432}
1433
1434
1435/*   Function vect_update_ivs_after_vectorizer.
1436
1437     "Advance" the induction variables of LOOP to the value they should take
1438     after the execution of LOOP.  This is currently necessary because the
1439     vectorizer does not handle induction variables that are used after the
1440     loop.  Such a situation occurs when the last iterations of LOOP are
1441     peeled, because:
1442     1. We introduced new uses after LOOP for IVs that were not originally used
1443        after LOOP: the IVs of LOOP are now used by an epilog loop.
1444     2. LOOP is going to be vectorized; this means that it will iterate N/VF
1445        times, whereas the loop IVs should be bumped N times.
1446
1447     Input:
1448     - LOOP - a loop that is going to be vectorized. The last few iterations
1449              of LOOP were peeled.
1450     - NITERS - the number of iterations that LOOP executes (before it is
1451                vectorized). i.e, the number of times the ivs should be bumped.
1452     - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1453                  coming out from LOOP on which there are uses of the LOOP ivs
1454		  (this is the path from LOOP->exit to epilog_loop->preheader).
1455
1456                  The new definitions of the ivs are placed in LOOP->exit.
1457                  The phi args associated with the edge UPDATE_E in the bb
1458                  UPDATE_E->dest are updated accordingly.
1459
1460     Assumption 1: Like the rest of the vectorizer, this function assumes
1461     a single loop exit that has a single predecessor.
1462
1463     Assumption 2: The phi nodes in the LOOP header and in update_bb are
1464     organized in the same order.
1465
1466     Assumption 3: The access function of the ivs is simple enough (see
1467     vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1468
1469     Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1470     coming out of LOOP on which the ivs of LOOP are used (this is the path
1471     that leads to the epilog loop; other paths skip the epilog loop).  This
1472     path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1473     needs to have its phis updated.
1474 */
1475
1476static void
1477vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1478				  tree niters, edge update_e)
1479{
1480  gphi_iterator gsi, gsi1;
1481  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1482  basic_block update_bb = update_e->dest;
1483  basic_block exit_bb = single_exit (loop)->dest;
1484
1485  /* Make sure there exists a single-predecessor exit bb:  */
1486  gcc_assert (single_pred_p (exit_bb));
1487  gcc_assert (single_succ_edge (exit_bb) == update_e);
1488
1489  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1490       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1491       gsi_next (&gsi), gsi_next (&gsi1))
1492    {
1493      tree init_expr;
1494      tree step_expr, off;
1495      tree type;
1496      tree var, ni, ni_name;
1497      gimple_stmt_iterator last_gsi;
1498
1499      gphi *phi = gsi.phi ();
1500      gphi *phi1 = gsi1.phi ();
1501      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1502      if (dump_enabled_p ())
1503	dump_printf_loc (MSG_NOTE, vect_location,
1504			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1505
1506      /* Skip reduction and virtual phis.  */
1507      if (!iv_phi_p (phi_info))
1508	{
1509	  if (dump_enabled_p ())
1510	    dump_printf_loc (MSG_NOTE, vect_location,
1511			     "reduc or virtual phi. skip.\n");
1512	  continue;
1513	}
1514
1515      type = TREE_TYPE (gimple_phi_result (phi));
1516      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1517      step_expr = unshare_expr (step_expr);
1518
1519      /* FORNOW: We do not support IVs whose evolution function is a polynomial
1520         of degree >= 2 or exponential.  */
1521      gcc_assert (!tree_is_chrec (step_expr));
1522
1523      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1524
1525      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1526			 fold_convert (TREE_TYPE (step_expr), niters),
1527			 step_expr);
1528      if (POINTER_TYPE_P (type))
1529	ni = fold_build_pointer_plus (init_expr, off);
1530      else
1531	ni = fold_build2 (PLUS_EXPR, type,
1532			  init_expr, fold_convert (type, off));
1533
1534      var = create_tmp_var (type, "tmp");
1535
1536      last_gsi = gsi_last_bb (exit_bb);
1537      gimple_seq new_stmts = NULL;
1538      ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1539      /* Exit_bb shouldn't be empty.  */
1540      if (!gsi_end_p (last_gsi))
1541	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1542      else
1543	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1544
1545      /* Fix phi expressions in the successor bb.  */
1546      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1547    }
1548}
1549
1550/* Return a gimple value containing the misalignment (measured in vector
1551   elements) for the loop described by LOOP_VINFO, i.e. how many elements
1552   it is away from a perfectly aligned address.  Add any new statements
1553   to SEQ.  */
1554
1555static tree
1556get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1557{
1558  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1559  stmt_vec_info stmt_info = dr_info->stmt;
1560  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1561
1562  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1563  unsigned HOST_WIDE_INT target_align_c;
1564  tree target_align_minus_1;
1565
1566  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1567					size_zero_node) < 0;
1568  tree offset = (negative
1569		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1570		 : size_zero_node);
1571  tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1572							  offset);
1573  tree type = unsigned_type_for (TREE_TYPE (start_addr));
1574  if (target_align.is_constant (&target_align_c))
1575    target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1576  else
1577    {
1578      tree vla = build_int_cst (type, target_align);
1579      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1580				    fold_build2 (MINUS_EXPR, type,
1581						 build_int_cst (type, 0), vla));
1582      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1583					  build_int_cst (type, 1));
1584    }
1585
1586  HOST_WIDE_INT elem_size
1587    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1588  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1589
1590  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1591  tree int_start_addr = fold_convert (type, start_addr);
1592  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1593					target_align_minus_1);
1594
1595  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1596  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1597					elem_size_log);
1598
1599  return misalign_in_elems;
1600}
1601
1602/* Function vect_gen_prolog_loop_niters
1603
1604   Generate the number of iterations which should be peeled as prolog for the
1605   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1606   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1607   As a result, after the execution of this loop, the data reference DR will
1608   refer to an aligned location.  The following computation is generated:
1609
1610   If the misalignment of DR is known at compile time:
1611     addr_mis = int mis = DR_MISALIGNMENT (dr);
1612   Else, compute address misalignment in bytes:
1613     addr_mis = addr & (target_align - 1)
1614
1615   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1616
1617   (elem_size = element type size; an element is the scalar element whose type
1618   is the inner type of the vectype)
1619
1620   The computations will be emitted at the end of BB.  We also compute and
1621   store upper bound (included) of the result in BOUND.
1622
1623   When the step of the data-ref in the loop is not 1 (as in interleaved data
1624   and SLP), the number of iterations of the prolog must be divided by the step
1625   (which is equal to the size of interleaved group).
1626
1627   The above formulas assume that VF == number of elements in the vector. This
1628   may not hold when there are multiple-types in the loop.
1629   In this case, for some data-references in the loop the VF does not represent
1630   the number of elements that fit in the vector.  Therefore, instead of VF we
1631   use TYPE_VECTOR_SUBPARTS.  */
1632
1633static tree
1634vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1635			     basic_block bb, int *bound)
1636{
1637  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1638  tree var;
1639  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1640  gimple_seq stmts = NULL, new_stmts = NULL;
1641  tree iters, iters_name;
1642  stmt_vec_info stmt_info = dr_info->stmt;
1643  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1644  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1645
1646  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1647    {
1648      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1649
1650      if (dump_enabled_p ())
1651        dump_printf_loc (MSG_NOTE, vect_location,
1652                         "known peeling = %d.\n", npeel);
1653
1654      iters = build_int_cst (niters_type, npeel);
1655      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1656    }
1657  else
1658    {
1659      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1660      tree type = TREE_TYPE (misalign_in_elems);
1661      HOST_WIDE_INT elem_size
1662	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1663      /* We only do prolog peeling if the target alignment is known at compile
1664         time.  */
1665      poly_uint64 align_in_elems =
1666	exact_div (target_align, elem_size);
1667      tree align_in_elems_minus_1 =
1668	build_int_cst (type, align_in_elems - 1);
1669      tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1670
1671      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1672				 & (align_in_elems - 1)).  */
1673      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1674					    size_zero_node) < 0;
1675      if (negative)
1676	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1677			     align_in_elems_tree);
1678      else
1679	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1680			     misalign_in_elems);
1681      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1682      iters = fold_convert (niters_type, iters);
1683      unsigned HOST_WIDE_INT align_in_elems_c;
1684      if (align_in_elems.is_constant (&align_in_elems_c))
1685	*bound = align_in_elems_c - 1;
1686      else
1687	*bound = -1;
1688    }
1689
1690  if (dump_enabled_p ())
1691    dump_printf_loc (MSG_NOTE, vect_location,
1692		     "niters for prolog loop: %T\n", iters);
1693
1694  var = create_tmp_var (niters_type, "prolog_loop_niters");
1695  iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1696
1697  if (new_stmts)
1698    gimple_seq_add_seq (&stmts, new_stmts);
1699  if (stmts)
1700    {
1701      gcc_assert (single_succ_p (bb));
1702      gimple_stmt_iterator gsi = gsi_last_bb (bb);
1703      if (gsi_end_p (gsi))
1704	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1705      else
1706	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1707    }
1708  return iters_name;
1709}
1710
1711
1712/* Function vect_update_init_of_dr
1713
1714   If CODE is PLUS, the vector loop starts NITERS iterations after the
1715   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1716   iterations before the scalar one (using masking to skip inactive
1717   elements).  This function updates the information recorded in DR to
1718   account for the difference.  Specifically, it updates the OFFSET
1719   field of DR_INFO.  */
1720
1721static void
1722vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code)
1723{
1724  struct data_reference *dr = dr_info->dr;
1725  tree offset = dr_info->offset;
1726  if (!offset)
1727    offset = build_zero_cst (sizetype);
1728
1729  niters = fold_build2 (MULT_EXPR, sizetype,
1730			fold_convert (sizetype, niters),
1731			fold_convert (sizetype, DR_STEP (dr)));
1732  offset = fold_build2 (code, sizetype,
1733			fold_convert (sizetype, offset), niters);
1734  dr_info->offset = offset;
1735}
1736
1737
1738/* Function vect_update_inits_of_drs
1739
1740   Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1741   CODE and NITERS are as for vect_update_inits_of_dr.  */
1742
1743void
1744vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1745			  tree_code code)
1746{
1747  unsigned int i;
1748  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1749  struct data_reference *dr;
1750
1751  DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1752
1753  /* Adjust niters to sizetype.  We used to insert the stmts on loop preheader
1754     here, but since we might use these niters to update the epilogues niters
1755     and data references we can't insert them here as this definition might not
1756     always dominate its uses.  */
1757  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1758    niters = fold_convert (sizetype, niters);
1759
1760  FOR_EACH_VEC_ELT (datarefs, i, dr)
1761    {
1762      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1763      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt)
1764	  && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt))
1765	vect_update_init_of_dr (dr_info, niters, code);
1766    }
1767}
1768
1769/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1770   by masking.  This involves calculating the number of iterations to
1771   be peeled and then aligning all memory references appropriately.  */
1772
1773void
1774vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1775{
1776  tree misalign_in_elems;
1777  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1778
1779  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1780
1781  /* From the information recorded in LOOP_VINFO get the number of iterations
1782     that need to be skipped via masking.  */
1783  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1784    {
1785      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1786			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1787      misalign_in_elems = build_int_cst (type, misalign);
1788    }
1789  else
1790    {
1791      gimple_seq seq1 = NULL, seq2 = NULL;
1792      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1793      misalign_in_elems = fold_convert (type, misalign_in_elems);
1794      misalign_in_elems = force_gimple_operand (misalign_in_elems,
1795						&seq2, true, NULL_TREE);
1796      gimple_seq_add_seq (&seq1, seq2);
1797      if (seq1)
1798	{
1799	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1800	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1801	  gcc_assert (!new_bb);
1802	}
1803    }
1804
1805  if (dump_enabled_p ())
1806    dump_printf_loc (MSG_NOTE, vect_location,
1807		     "misalignment for fully-masked loop: %T\n",
1808		     misalign_in_elems);
1809
1810  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1811
1812  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1813}
1814
1815/* This function builds ni_name = number of iterations.  Statements
1816   are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1817   it to TRUE if new ssa_var is generated.  */
1818
1819tree
1820vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1821{
1822  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1823  if (TREE_CODE (ni) == INTEGER_CST)
1824    return ni;
1825  else
1826    {
1827      tree ni_name, var;
1828      gimple_seq stmts = NULL;
1829      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1830
1831      var = create_tmp_var (TREE_TYPE (ni), "niters");
1832      ni_name = force_gimple_operand (ni, &stmts, false, var);
1833      if (stmts)
1834	{
1835	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1836	  if (new_var_p != NULL)
1837	    *new_var_p = true;
1838	}
1839
1840      return ni_name;
1841    }
1842}
1843
1844/* Calculate the number of iterations above which vectorized loop will be
1845   preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1846   of prolog loop.  If it's integer const, the integer number is also passed
1847   in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1848   number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1849   value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1850   threshold below which the scalar (rather than vectorized) loop will be
1851   executed.  This function stores the upper bound (inclusive) of the result
1852   in BOUND_SCALAR.  */
1853
1854static tree
1855vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1856			     int bound_prolog, poly_int64 bound_epilog, int th,
1857			     poly_uint64 *bound_scalar,
1858			     bool check_profitability)
1859{
1860  tree type = TREE_TYPE (niters_prolog);
1861  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1862			     build_int_cst (type, bound_epilog));
1863
1864  *bound_scalar = bound_prolog + bound_epilog;
1865  if (check_profitability)
1866    {
1867      /* TH indicates the minimum niters of vectorized loop, while we
1868	 compute the maximum niters of scalar loop.  */
1869      th--;
1870      /* Peeling for constant times.  */
1871      if (int_niters_prolog >= 0)
1872	{
1873	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1874	  return build_int_cst (type, *bound_scalar);
1875	}
1876      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1877	 and BOUND_EPILOG are inclusive upper bounds.  */
1878      if (known_ge (th, bound_prolog + bound_epilog))
1879	{
1880	  *bound_scalar = th;
1881	  return build_int_cst (type, th);
1882	}
1883      /* Need to do runtime comparison.  */
1884      else if (maybe_gt (th, bound_epilog))
1885	{
1886	  *bound_scalar = upper_bound (*bound_scalar, th);
1887	  return fold_build2 (MAX_EXPR, type,
1888			      build_int_cst (type, th), niters);
1889	}
1890    }
1891  return niters;
1892}
1893
1894/* NITERS is the number of times that the original scalar loop executes
1895   after peeling.  Work out the maximum number of iterations N that can
1896   be handled by the vectorized form of the loop and then either:
1897
1898   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1899
1900	niters_vector = N
1901
1902   b) set *STEP_VECTOR_PTR to one and generate:
1903
1904        niters_vector = N / vf
1905
1906   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1907   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1908   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1909
1910void
1911vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1912			     tree *niters_vector_ptr, tree *step_vector_ptr,
1913			     bool niters_no_overflow)
1914{
1915  tree ni_minus_gap, var;
1916  tree niters_vector, step_vector, type = TREE_TYPE (niters);
1917  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1918  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1919  tree log_vf = NULL_TREE;
1920
1921  /* If epilogue loop is required because of data accesses with gaps, we
1922     subtract one iteration from the total number of iterations here for
1923     correct calculation of RATIO.  */
1924  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1925    {
1926      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1927				  build_one_cst (type));
1928      if (!is_gimple_val (ni_minus_gap))
1929	{
1930	  var = create_tmp_var (type, "ni_gap");
1931	  gimple *stmts = NULL;
1932	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1933					       true, var);
1934	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1935	}
1936    }
1937  else
1938    ni_minus_gap = niters;
1939
1940  unsigned HOST_WIDE_INT const_vf;
1941  if (vf.is_constant (&const_vf)
1942      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1943    {
1944      /* Create: niters >> log2(vf) */
1945      /* If it's known that niters == number of latch executions + 1 doesn't
1946	 overflow, we can generate niters >> log2(vf); otherwise we generate
1947	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1948	 will be at least one.  */
1949      log_vf = build_int_cst (type, exact_log2 (const_vf));
1950      if (niters_no_overflow)
1951	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1952      else
1953	niters_vector
1954	  = fold_build2 (PLUS_EXPR, type,
1955			 fold_build2 (RSHIFT_EXPR, type,
1956				      fold_build2 (MINUS_EXPR, type,
1957						   ni_minus_gap,
1958						   build_int_cst (type, vf)),
1959				      log_vf),
1960			 build_int_cst (type, 1));
1961      step_vector = build_one_cst (type);
1962    }
1963  else
1964    {
1965      niters_vector = ni_minus_gap;
1966      step_vector = build_int_cst (type, vf);
1967    }
1968
1969  if (!is_gimple_val (niters_vector))
1970    {
1971      var = create_tmp_var (type, "bnd");
1972      gimple_seq stmts = NULL;
1973      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1974      gsi_insert_seq_on_edge_immediate (pe, stmts);
1975      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1976	 we set range information to make niters analyzer's life easier.
1977	 Note the number of latch iteration value can be TYPE_MAX_VALUE so
1978	 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
1979      if (stmts != NULL && log_vf)
1980	{
1981	  if (niters_no_overflow)
1982	    set_range_info (niters_vector, VR_RANGE,
1983			    wi::one (TYPE_PRECISION (type)),
1984			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1985						       TYPE_SIGN (type)),
1986					exact_log2 (const_vf),
1987					TYPE_SIGN (type)));
1988	  /* For VF == 1 the vector IV might also overflow so we cannot
1989	     assert a minimum value of 1.  */
1990	  else if (const_vf > 1)
1991	    set_range_info (niters_vector, VR_RANGE,
1992			    wi::one (TYPE_PRECISION (type)),
1993			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
1994						       TYPE_SIGN (type))
1995					- (const_vf - 1),
1996					exact_log2 (const_vf), TYPE_SIGN (type))
1997			    + 1);
1998	}
1999    }
2000  *niters_vector_ptr = niters_vector;
2001  *step_vector_ptr = step_vector;
2002
2003  return;
2004}
2005
2006/* Given NITERS_VECTOR which is the number of iterations for vectorized
2007   loop specified by LOOP_VINFO after vectorization, compute the number
2008   of iterations before vectorization (niters_vector * vf) and store it
2009   to NITERS_VECTOR_MULT_VF_PTR.  */
2010
2011static void
2012vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2013				     tree niters_vector,
2014				     tree *niters_vector_mult_vf_ptr)
2015{
2016  /* We should be using a step_vector of VF if VF is variable.  */
2017  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2018  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2019  tree type = TREE_TYPE (niters_vector);
2020  tree log_vf = build_int_cst (type, exact_log2 (vf));
2021  basic_block exit_bb = single_exit (loop)->dest;
2022
2023  gcc_assert (niters_vector_mult_vf_ptr != NULL);
2024  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2025					    niters_vector, log_vf);
2026  if (!is_gimple_val (niters_vector_mult_vf))
2027    {
2028      tree var = create_tmp_var (type, "niters_vector_mult_vf");
2029      gimple_seq stmts = NULL;
2030      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2031						    &stmts, true, var);
2032      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2033      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2034    }
2035  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2036}
2037
2038/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2039   this function searches for the corresponding lcssa phi node in exit
2040   bb of LOOP.  If it is found, return the phi result; otherwise return
2041   NULL.  */
2042
2043static tree
2044find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
2045		gphi *lcssa_phi)
2046{
2047  gphi_iterator gsi;
2048  edge e = single_exit (loop);
2049
2050  gcc_assert (single_pred_p (e->dest));
2051  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2052    {
2053      gphi *phi = gsi.phi ();
2054      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2055			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2056	return PHI_RESULT (phi);
2057    }
2058  return NULL_TREE;
2059}
2060
2061/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2062   from SECOND/FIRST and puts it at the original loop's preheader/exit
2063   edge, the two loops are arranged as below:
2064
2065       preheader_a:
2066     first_loop:
2067       header_a:
2068	 i_1 = PHI<i_0, i_2>;
2069	 ...
2070	 i_2 = i_1 + 1;
2071	 if (cond_a)
2072	   goto latch_a;
2073	 else
2074	   goto between_bb;
2075       latch_a:
2076	 goto header_a;
2077
2078       between_bb:
2079	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2080
2081     second_loop:
2082       header_b:
2083	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2084				 or with i_2 if no LCSSA phi is created
2085				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2086	 ...
2087	 i_4 = i_3 + 1;
2088	 if (cond_b)
2089	   goto latch_b;
2090	 else
2091	   goto exit_bb;
2092       latch_b:
2093	 goto header_b;
2094
2095       exit_bb:
2096
2097   This function creates loop closed SSA for the first loop; update the
2098   second loop's PHI nodes by replacing argument on incoming edge with the
2099   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2100   is false, Loop closed ssa phis will only be created for non-iv phis for
2101   the first loop.
2102
2103   This function assumes exit bb of the first loop is preheader bb of the
2104   second loop, i.e, between_bb in the example code.  With PHIs updated,
2105   the second loop will execute rest iterations of the first.  */
2106
2107static void
2108slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2109				   class loop *first, class loop *second,
2110				   bool create_lcssa_for_iv_phis)
2111{
2112  gphi_iterator gsi_update, gsi_orig;
2113  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2114
2115  edge first_latch_e = EDGE_SUCC (first->latch, 0);
2116  edge second_preheader_e = loop_preheader_edge (second);
2117  basic_block between_bb = single_exit (first)->dest;
2118
2119  gcc_assert (between_bb == second_preheader_e->src);
2120  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2121  /* Either the first loop or the second is the loop to be vectorized.  */
2122  gcc_assert (loop == first || loop == second);
2123
2124  for (gsi_orig = gsi_start_phis (first->header),
2125       gsi_update = gsi_start_phis (second->header);
2126       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2127       gsi_next (&gsi_orig), gsi_next (&gsi_update))
2128    {
2129      gphi *orig_phi = gsi_orig.phi ();
2130      gphi *update_phi = gsi_update.phi ();
2131
2132      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2133      /* Generate lcssa PHI node for the first loop.  */
2134      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2135      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2136      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2137	{
2138	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2139	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2140	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2141	  arg = new_res;
2142	}
2143
2144      /* Update PHI node in the second loop by replacing arg on the loop's
2145	 incoming edge.  */
2146      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2147    }
2148
2149  /* For epilogue peeling we have to make sure to copy all LC PHIs
2150     for correct vectorization of live stmts.  */
2151  if (loop == first)
2152    {
2153      basic_block orig_exit = single_exit (second)->dest;
2154      for (gsi_orig = gsi_start_phis (orig_exit);
2155	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
2156	{
2157	  gphi *orig_phi = gsi_orig.phi ();
2158	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
2159	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p  (orig_arg))
2160	    continue;
2161
2162	  /* Already created in the above loop.   */
2163	  if (find_guard_arg (first, second, orig_phi))
2164	    continue;
2165
2166	  tree new_res = copy_ssa_name (orig_arg);
2167	  gphi *lcphi = create_phi_node (new_res, between_bb);
2168	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
2169	}
2170    }
2171}
2172
2173/* Function slpeel_add_loop_guard adds guard skipping from the beginning
2174   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2175   are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2176   appear like below:
2177
2178       guard_bb:
2179	 if (cond)
2180	   goto merge_bb;
2181	 else
2182	   goto skip_loop;
2183
2184     skip_loop:
2185       header_a:
2186	 i_1 = PHI<i_0, i_2>;
2187	 ...
2188	 i_2 = i_1 + 1;
2189	 if (cond_a)
2190	   goto latch_a;
2191	 else
2192	   goto exit_a;
2193       latch_a:
2194	 goto header_a;
2195
2196       exit_a:
2197	 i_5 = PHI<i_2>;
2198
2199       merge_bb:
2200	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2201
2202     update_loop:
2203       header_b:
2204	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2205	 ...
2206	 i_4 = i_3 + 1;
2207	 if (cond_b)
2208	   goto latch_b;
2209	 else
2210	   goto exit_bb;
2211       latch_b:
2212	 goto header_b;
2213
2214       exit_bb:
2215
2216   This function creates PHI nodes at merge_bb and replaces the use of i_5
2217   in the update_loop's PHI node with the result of new PHI result.  */
2218
2219static void
2220slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
2221				    class loop *update_loop,
2222				    edge guard_edge, edge merge_edge)
2223{
2224  location_t merge_loc, guard_loc;
2225  edge orig_e = loop_preheader_edge (skip_loop);
2226  edge update_e = loop_preheader_edge (update_loop);
2227  gphi_iterator gsi_orig, gsi_update;
2228
2229  for ((gsi_orig = gsi_start_phis (skip_loop->header),
2230	gsi_update = gsi_start_phis (update_loop->header));
2231       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2232       gsi_next (&gsi_orig), gsi_next (&gsi_update))
2233    {
2234      gphi *orig_phi = gsi_orig.phi ();
2235      gphi *update_phi = gsi_update.phi ();
2236
2237      /* Generate new phi node at merge bb of the guard.  */
2238      tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2239      gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2240
2241      /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2242	 args in NEW_PHI for these edges.  */
2243      tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2244      tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2245      merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2246      guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2247      add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2248      add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2249
2250      /* Update phi in UPDATE_PHI.  */
2251      adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2252    }
2253}
2254
2255/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2256   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2257   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2258   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2259   The CFG looks like:
2260
2261     loop:
2262       header_a:
2263	 i_1 = PHI<i_0, i_2>;
2264	 ...
2265	 i_2 = i_1 + 1;
2266	 if (cond_a)
2267	   goto latch_a;
2268	 else
2269	   goto exit_a;
2270       latch_a:
2271	 goto header_a;
2272
2273       exit_a:
2274
2275       guard_bb:
2276	 if (cond)
2277	   goto merge_bb;
2278	 else
2279	   goto epilog_loop;
2280
2281       ;; fall_through_bb
2282
2283     epilog_loop:
2284       header_b:
2285	 i_3 = PHI<i_2, i_4>;
2286	 ...
2287	 i_4 = i_3 + 1;
2288	 if (cond_b)
2289	   goto latch_b;
2290	 else
2291	   goto merge_bb;
2292       latch_b:
2293	 goto header_b;
2294
2295       merge_bb:
2296	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2297
2298       exit_bb:
2299	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2300
2301   For each name used out side EPILOG (i.e - for each name that has a lcssa
2302   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2303   args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2304   the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2305   by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2306   in exit_bb will also be updated.  */
2307
2308static void
2309slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
2310				    edge guard_edge, edge merge_edge)
2311{
2312  gphi_iterator gsi;
2313  basic_block merge_bb = guard_edge->dest;
2314
2315  gcc_assert (single_succ_p (merge_bb));
2316  edge e = single_succ_edge (merge_bb);
2317  basic_block exit_bb = e->dest;
2318  gcc_assert (single_pred_p (exit_bb));
2319  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2320
2321  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2322    {
2323      gphi *update_phi = gsi.phi ();
2324      tree old_arg = PHI_ARG_DEF (update_phi, 0);
2325
2326      tree merge_arg = NULL_TREE;
2327
2328      /* If the old argument is a SSA_NAME use its current_def.  */
2329      if (TREE_CODE (old_arg) == SSA_NAME)
2330	merge_arg = get_current_def (old_arg);
2331      /* If it's a constant or doesn't have a current_def, just use the old
2332	 argument.  */
2333      if (!merge_arg)
2334	merge_arg = old_arg;
2335
2336      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2337      /* If the var is live after loop but not a reduction, we simply
2338	 use the old arg.  */
2339      if (!guard_arg)
2340	guard_arg = old_arg;
2341
2342      /* Create new phi node in MERGE_BB:  */
2343      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2344      gphi *merge_phi = create_phi_node (new_res, merge_bb);
2345
2346      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2347	 the two PHI args in merge_phi for these edges.  */
2348      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2349      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2350
2351      /* Update the original phi in exit_bb.  */
2352      adjust_phi_and_debug_stmts (update_phi, e, new_res);
2353    }
2354}
2355
2356/* EPILOG loop is duplicated from the original loop for vectorizing,
2357   the arg of its loop closed ssa PHI needs to be updated.  */
2358
2359static void
2360slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
2361{
2362  gphi_iterator gsi;
2363  basic_block exit_bb = single_exit (epilog)->dest;
2364
2365  gcc_assert (single_pred_p (exit_bb));
2366  edge e = EDGE_PRED (exit_bb, 0);
2367  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2368    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2369}
2370
2371/* Function vect_do_peeling.
2372
2373   Input:
2374   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2375
2376       preheader:
2377     LOOP:
2378       header_bb:
2379	 loop_body
2380	 if (exit_loop_cond) goto exit_bb
2381	 else                goto header_bb
2382       exit_bb:
2383
2384   - NITERS: The number of iterations of the loop.
2385   - NITERSM1: The number of iterations of the loop's latch.
2386   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2387   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2388			      CHECK_PROFITABILITY is true.
2389   Output:
2390   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2391     iterate after vectorization; see vect_set_loop_condition for details.
2392   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2393     should be set to the number of scalar iterations handled by the
2394     vector loop.  The SSA name is only used on exit from the loop.
2395
2396   This function peels prolog and epilog from the loop, adds guards skipping
2397   PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2398   would look like:
2399
2400       guard_bb_1:
2401	 if (prefer_scalar_loop) goto merge_bb_1
2402	 else                    goto guard_bb_2
2403
2404       guard_bb_2:
2405         if (skip_prolog) goto merge_bb_2
2406         else             goto prolog_preheader
2407
2408       prolog_preheader:
2409     PROLOG:
2410       prolog_header_bb:
2411	 prolog_body
2412	 if (exit_prolog_cond) goto prolog_exit_bb
2413	 else                  goto prolog_header_bb
2414       prolog_exit_bb:
2415
2416       merge_bb_2:
2417
2418       vector_preheader:
2419     VECTOR LOOP:
2420       vector_header_bb:
2421	 vector_body
2422	 if (exit_vector_cond) goto vector_exit_bb
2423	 else                  goto vector_header_bb
2424       vector_exit_bb:
2425
2426       guard_bb_3:
2427	 if (skip_epilog) goto merge_bb_3
2428	 else             goto epilog_preheader
2429
2430       merge_bb_1:
2431
2432       epilog_preheader:
2433     EPILOG:
2434       epilog_header_bb:
2435	 epilog_body
2436	 if (exit_epilog_cond) goto merge_bb_3
2437	 else                  goto epilog_header_bb
2438
2439       merge_bb_3:
2440
2441   Note this function peels prolog and epilog only if it's necessary,
2442   as well as guards.
2443   This function returns the epilogue loop if a decision was made to vectorize
2444   it, otherwise NULL.
2445
2446   The analysis resulting in this epilogue loop's loop_vec_info was performed
2447   in the same vect_analyze_loop call as the main loop's.  At that time
2448   vect_analyze_loop constructs a list of accepted loop_vec_info's for lower
2449   vectorization factors than the main loop.  This list is stored in the main
2450   loop's loop_vec_info in the 'epilogue_vinfos' member.  Everytime we decide to
2451   vectorize the epilogue loop for a lower vectorization factor,  the
2452   loop_vec_info sitting at the top of the epilogue_vinfos list is removed,
2453   updated and linked to the epilogue loop.  This is later used to vectorize
2454   the epilogue.  The reason the loop_vec_info needs updating is that it was
2455   constructed based on the original main loop, and the epilogue loop is a
2456   copy of this loop, so all links pointing to statements in the original loop
2457   need updating.  Furthermore, these loop_vec_infos share the
2458   data_reference's records, which will also need to be updated.
2459
2460   TODO: Guard for prefer_scalar_loop should be emitted along with
2461   versioning conditions if loop versioning is needed.  */
2462
2463
2464class loop *
2465vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2466		 tree *niters_vector, tree *step_vector,
2467		 tree *niters_vector_mult_vf_var, int th,
2468		 bool check_profitability, bool niters_no_overflow,
2469		 tree *advance)
2470{
2471  edge e, guard_e;
2472  tree type = TREE_TYPE (niters), guard_cond;
2473  basic_block guard_bb, guard_to;
2474  profile_probability prob_prolog, prob_vector, prob_epilog;
2475  int estimated_vf;
2476  int prolog_peeling = 0;
2477  bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0;
2478  /* We currently do not support prolog peeling if the target alignment is not
2479     known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2480     target alignment being constant.  */
2481  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2482  if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2483    return NULL;
2484
2485  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2486    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2487
2488  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2489  poly_uint64 bound_epilog = 0;
2490  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2491      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2492    bound_epilog += vf - 1;
2493  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2494    bound_epilog += 1;
2495  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2496  poly_uint64 bound_scalar = bound_epilog;
2497
2498  if (!prolog_peeling && !epilog_peeling)
2499    return NULL;
2500
2501  /* Before doing any peeling make sure to reset debug binds outside of
2502     the loop refering to defs not in LC SSA.  */
2503  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2504  for (unsigned i = 0; i < loop->num_nodes; ++i)
2505    {
2506      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2507      imm_use_iterator ui;
2508      gimple *use_stmt;
2509      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
2510	   gsi_next (&gsi))
2511	{
2512	  FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ()))
2513	    if (gimple_debug_bind_p (use_stmt)
2514		&& loop != gimple_bb (use_stmt)->loop_father
2515		&& !flow_loop_nested_p (loop,
2516					gimple_bb (use_stmt)->loop_father))
2517	      {
2518		gimple_debug_bind_reset_value (use_stmt);
2519		update_stmt (use_stmt);
2520	      }
2521	}
2522      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2523	   gsi_next (&gsi))
2524	{
2525	  ssa_op_iter op_iter;
2526	  def_operand_p def_p;
2527	  FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF)
2528	    FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p))
2529	      if (gimple_debug_bind_p (use_stmt)
2530		  && loop != gimple_bb (use_stmt)->loop_father
2531		  && !flow_loop_nested_p (loop,
2532					  gimple_bb (use_stmt)->loop_father))
2533		{
2534		  gimple_debug_bind_reset_value (use_stmt);
2535		  update_stmt (use_stmt);
2536		}
2537	}
2538    }
2539
2540  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2541  estimated_vf = vect_vf_for_cost (loop_vinfo);
2542  if (estimated_vf == 2)
2543    estimated_vf = 3;
2544  prob_prolog = prob_epilog = profile_probability::guessed_always ()
2545			.apply_scale (estimated_vf - 1, estimated_vf);
2546
2547  class loop *prolog, *epilog = NULL;
2548  class loop *first_loop = loop;
2549  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2550
2551  /* We might have a queued need to update virtual SSA form.  As we
2552     delete the update SSA machinery below after doing a regular
2553     incremental SSA update during loop copying make sure we don't
2554     lose that fact.
2555     ???  Needing to update virtual SSA form by renaming is unfortunate
2556     but not all of the vectorizer code inserting new loads / stores
2557     properly assigns virtual operands to those statements.  */
2558  update_ssa (TODO_update_ssa_only_virtuals);
2559
2560  create_lcssa_for_virtual_phi (loop);
2561
2562  /* If we're vectorizing an epilogue loop, the update_ssa above will
2563     have ensured that the virtual operand is in SSA form throughout the
2564     vectorized main loop.  Normally it is possible to trace the updated
2565     vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses
2566     back to scalar-stmt vuses, meaning that the effect of the SSA update
2567     remains local to the main loop.  However, there are rare cases in
2568     which the vectorized loop has vdefs even when the original scalar
2569     loop didn't.  For example, vectorizing a load with IFN_LOAD_LANES
2570     introduces clobbers of the temporary vector array, which in turn
2571     needs new vdefs.  If the scalar loop doesn't write to memory, these
2572     new vdefs will be the only ones in the vector loop.
2573
2574     In that case, update_ssa will have added a new virtual phi to the
2575     main loop, which previously didn't need one.  Ensure that we (locally)
2576     maintain LCSSA form for the virtual operand, just as we would have
2577     done if the virtual phi had existed from the outset.  This makes it
2578     easier to duplicate the scalar epilogue loop below.  */
2579  tree vop_to_rename = NULL_TREE;
2580  if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
2581    {
2582      class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo);
2583      vop_to_rename = create_lcssa_for_virtual_phi (orig_loop);
2584    }
2585
2586  if (MAY_HAVE_DEBUG_BIND_STMTS)
2587    {
2588      gcc_assert (!adjust_vec.exists ());
2589      adjust_vec.create (32);
2590    }
2591  initialize_original_copy_tables ();
2592
2593  /* Record the anchor bb at which the guard should be placed if the scalar
2594     loop might be preferred.  */
2595  basic_block anchor = loop_preheader_edge (loop)->src;
2596
2597  /* Generate the number of iterations for the prolog loop.  We do this here
2598     so that we can also get the upper bound on the number of iterations.  */
2599  tree niters_prolog;
2600  int bound_prolog = 0;
2601  if (prolog_peeling)
2602    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2603						  &bound_prolog);
2604  else
2605    niters_prolog = build_int_cst (type, 0);
2606
2607  loop_vec_info epilogue_vinfo = NULL;
2608  if (vect_epilogues)
2609    {
2610      epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2611      loop_vinfo->epilogue_vinfos.ordered_remove (0);
2612    }
2613
2614  tree niters_vector_mult_vf = NULL_TREE;
2615  /* Saving NITERs before the loop, as this may be changed by prologue.  */
2616  tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo);
2617  edge update_e = NULL, skip_e = NULL;
2618  unsigned int lowest_vf = constant_lower_bound (vf);
2619  /* If we know the number of scalar iterations for the main loop we should
2620     check whether after the main loop there are enough iterations left over
2621     for the epilogue.  */
2622  if (vect_epilogues
2623      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2624      && prolog_peeling >= 0
2625      && known_eq (vf, lowest_vf))
2626    {
2627      unsigned HOST_WIDE_INT eiters
2628	= (LOOP_VINFO_INT_NITERS (loop_vinfo)
2629	   - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2630
2631      eiters -= prolog_peeling;
2632      eiters
2633	= eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2634
2635      unsigned int ratio;
2636      unsigned int epilogue_gaps
2637	= LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2638      while (!(constant_multiple_p
2639	       (GET_MODE_SIZE (loop_vinfo->vector_mode),
2640		GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio)
2641	       && eiters >= lowest_vf / ratio + epilogue_gaps))
2642	{
2643	  delete epilogue_vinfo;
2644	  epilogue_vinfo = NULL;
2645	  if (loop_vinfo->epilogue_vinfos.length () == 0)
2646	    {
2647	      vect_epilogues = false;
2648	      break;
2649	    }
2650	  epilogue_vinfo = loop_vinfo->epilogue_vinfos[0];
2651	  loop_vinfo->epilogue_vinfos.ordered_remove (0);
2652	  epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo);
2653	}
2654    }
2655  /* Prolog loop may be skipped.  */
2656  bool skip_prolog = (prolog_peeling != 0);
2657  /* Skip this loop to epilog when there are not enough iterations to enter this
2658     vectorized loop.  If true we should perform runtime checks on the NITERS
2659     to check whether we should skip the current vectorized loop.  If we know
2660     the number of scalar iterations we may choose to add a runtime check if
2661     this number "maybe" smaller than the number of iterations required
2662     when we know the number of scalar iterations may potentially
2663     be smaller than the number of iterations required to enter this loop, for
2664     this we use the upper bounds on the prolog and epilog peeling.  When we
2665     don't know the number of iterations and don't require versioning it is
2666     because we have asserted that there are enough scalar iterations to enter
2667     the main loop, so this skip is not necessary.  When we are versioning then
2668     we only add such a skip if we have chosen to vectorize the epilogue.  */
2669  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2670		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2671				  bound_prolog + bound_epilog)
2672		      : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2673			 || vect_epilogues));
2674  /* Epilog loop must be executed if the number of iterations for epilog
2675     loop is known at compile time, otherwise we need to add a check at
2676     the end of vector loop and skip to the end of epilog loop.  */
2677  bool skip_epilog = (prolog_peeling < 0
2678		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2679		      || !vf.is_constant ());
2680  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2681  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2682    skip_epilog = false;
2683
2684  if (skip_vector)
2685    {
2686      split_edge (loop_preheader_edge (loop));
2687
2688      /* Due to the order in which we peel prolog and epilog, we first
2689	 propagate probability to the whole loop.  The purpose is to
2690	 avoid adjusting probabilities of both prolog and vector loops
2691	 separately.  Note in this case, the probability of epilog loop
2692	 needs to be scaled back later.  */
2693      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2694      if (prob_vector.initialized_p ())
2695	{
2696	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2697	  scale_loop_profile (loop, prob_vector, 0);
2698	}
2699    }
2700
2701  dump_user_location_t loop_loc = find_loop_location (loop);
2702  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2703  if (vect_epilogues)
2704    /* Make sure to set the epilogue's epilogue scalar loop, such that we can
2705       use the original scalar loop as remaining epilogue if necessary.  */
2706    LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo)
2707      = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2708
2709  if (prolog_peeling)
2710    {
2711      e = loop_preheader_edge (loop);
2712      if (!slpeel_can_duplicate_loop_p (loop, e))
2713	{
2714	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2715			   "loop can't be duplicated to preheader edge.\n");
2716	  gcc_unreachable ();
2717	}
2718      /* Peel prolog and put it on preheader edge of loop.  */
2719      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2720      if (!prolog)
2721	{
2722	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2723			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2724	  gcc_unreachable ();
2725	}
2726      prolog->force_vectorize = false;
2727      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2728      first_loop = prolog;
2729      reset_original_copy_tables ();
2730
2731      /* Update the number of iterations for prolog loop.  */
2732      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2733      vect_set_loop_condition (prolog, NULL, niters_prolog,
2734			       step_prolog, NULL_TREE, false);
2735
2736      /* Skip the prolog loop.  */
2737      if (skip_prolog)
2738	{
2739	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2740				    niters_prolog, build_int_cst (type, 0));
2741	  guard_bb = loop_preheader_edge (prolog)->src;
2742	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2743	  guard_to = split_edge (loop_preheader_edge (loop));
2744	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2745					   guard_to, guard_bb,
2746					   prob_prolog.invert (),
2747					   irred_flag);
2748	  e = EDGE_PRED (guard_to, 0);
2749	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2750	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2751
2752	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2753	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2754	}
2755
2756      /* Update init address of DRs.  */
2757      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2758      /* Update niters for vector loop.  */
2759      LOOP_VINFO_NITERS (loop_vinfo)
2760	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2761      LOOP_VINFO_NITERSM1 (loop_vinfo)
2762	= fold_build2 (MINUS_EXPR, type,
2763		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2764      bool new_var_p = false;
2765      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2766      /* It's guaranteed that vector loop bound before vectorization is at
2767	 least VF, so set range information for newly generated var.  */
2768      if (new_var_p)
2769	set_range_info (niters, VR_RANGE,
2770			wi::to_wide (build_int_cst (type, vf)),
2771			wi::to_wide (TYPE_MAX_VALUE (type)));
2772
2773      /* Prolog iterates at most bound_prolog times, latch iterates at
2774	 most bound_prolog - 1 times.  */
2775      record_niter_bound (prolog, bound_prolog - 1, false, true);
2776      delete_update_ssa ();
2777      adjust_vec_debug_stmts ();
2778      scev_reset ();
2779    }
2780
2781  if (epilog_peeling)
2782    {
2783      e = single_exit (loop);
2784      if (!slpeel_can_duplicate_loop_p (loop, e))
2785	{
2786	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2787			   "loop can't be duplicated to exit edge.\n");
2788	  gcc_unreachable ();
2789	}
2790      /* Peel epilog and put it on exit edge of loop.  If we are vectorizing
2791	 said epilog then we should use a copy of the main loop as a starting
2792	 point.  This loop may have already had some preliminary transformations
2793	 to allow for more optimal vectorization, for example if-conversion.
2794	 If we are not vectorizing the epilog then we should use the scalar loop
2795	 as the transformations mentioned above make less or no sense when not
2796	 vectorizing.  */
2797      epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop;
2798      if (vop_to_rename)
2799	{
2800	  /* Vectorizing the main loop can sometimes introduce a vdef to
2801	     a loop that previously didn't have one; see the comment above
2802	     the definition of VOP_TO_RENAME for details.  The definition
2803	     D that holds on E will then be different from the definition
2804	     VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to
2805	     rename VOP_TO_RENAME to D when copying the loop.
2806
2807	     The virtual operand is in LCSSA form for the main loop,
2808	     and no stmt between the main loop and E needs a vdef,
2809	     so we know that D is provided by a phi rather than by a
2810	     vdef on a normal gimple stmt.  */
2811	  basic_block vdef_bb = e->src;
2812	  gphi *vphi;
2813	  while (!(vphi = get_virtual_phi (vdef_bb)))
2814	    vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb);
2815	  gcc_assert (vop_to_rename != gimple_phi_result (vphi));
2816	  set_current_def (vop_to_rename, gimple_phi_result (vphi));
2817	}
2818      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e);
2819      if (!epilog)
2820	{
2821	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2822			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2823	  gcc_unreachable ();
2824	}
2825      epilog->force_vectorize = false;
2826      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2827
2828      /* Scalar version loop may be preferred.  In this case, add guard
2829	 and skip to epilog.  Note this only happens when the number of
2830	 iterations of loop is unknown at compile time, otherwise this
2831	 won't be vectorized.  */
2832      if (skip_vector)
2833	{
2834	  /* Additional epilogue iteration is peeled if gap exists.  */
2835	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2836						bound_prolog, bound_epilog,
2837						th, &bound_scalar,
2838						check_profitability);
2839	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2840	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2841	  guard_bb = anchor;
2842	  guard_to = split_edge (loop_preheader_edge (epilog));
2843	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2844					   guard_to, guard_bb,
2845					   prob_vector.invert (),
2846					   irred_flag);
2847	  skip_e = guard_e;
2848	  e = EDGE_PRED (guard_to, 0);
2849	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2850	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2851
2852	  /* Simply propagate profile info from guard_bb to guard_to which is
2853	     a merge point of control flow.  */
2854	  guard_to->count = guard_bb->count;
2855
2856	  /* Scale probability of epilog loop back.
2857	     FIXME: We should avoid scaling down and back up.  Profile may
2858	     get lost if we scale down to 0.  */
2859	  basic_block *bbs = get_loop_body (epilog);
2860	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2861	    bbs[i]->count = bbs[i]->count.apply_scale
2862				 (bbs[i]->count,
2863				  bbs[i]->count.apply_probability
2864				    (prob_vector));
2865	  free (bbs);
2866	}
2867
2868      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2869      /* If loop is peeled for non-zero constant times, now niters refers to
2870	 orig_niters - prolog_peeling, it won't overflow even the orig_niters
2871	 overflows.  */
2872      niters_no_overflow |= (prolog_peeling > 0);
2873      vect_gen_vector_loop_niters (loop_vinfo, niters,
2874				   niters_vector, step_vector,
2875				   niters_no_overflow);
2876      if (!integer_onep (*step_vector))
2877	{
2878	  /* On exit from the loop we will have an easy way of calcalating
2879	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2880	     until then.  */
2881	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2882	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2883	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2884	}
2885      else
2886	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2887					     &niters_vector_mult_vf);
2888      /* Update IVs of original loop as if they were advanced by
2889	 niters_vector_mult_vf steps.  */
2890      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2891      update_e = skip_vector ? e : loop_preheader_edge (epilog);
2892      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2893					update_e);
2894
2895      if (skip_epilog)
2896	{
2897	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2898				    niters, niters_vector_mult_vf);
2899	  guard_bb = single_exit (loop)->dest;
2900	  guard_to = split_edge (single_exit (epilog));
2901	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2902					   skip_vector ? anchor : guard_bb,
2903					   prob_epilog.invert (),
2904					   irred_flag);
2905	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2906					      single_exit (epilog));
2907	  /* Only need to handle basic block before epilog loop if it's not
2908	     the guard_bb, which is the case when skip_vector is true.  */
2909	  if (guard_bb != bb_before_epilog)
2910	    {
2911	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2912
2913	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2914	    }
2915	  scale_loop_profile (epilog, prob_epilog, 0);
2916	}
2917      else
2918	slpeel_update_phi_nodes_for_lcssa (epilog);
2919
2920      unsigned HOST_WIDE_INT bound;
2921      if (bound_scalar.is_constant (&bound))
2922	{
2923	  gcc_assert (bound != 0);
2924	  /* -1 to convert loop iterations to latch iterations.  */
2925	  record_niter_bound (epilog, bound - 1, false, true);
2926	}
2927
2928      delete_update_ssa ();
2929      adjust_vec_debug_stmts ();
2930      scev_reset ();
2931    }
2932
2933  if (vect_epilogues)
2934    {
2935      epilog->aux = epilogue_vinfo;
2936      LOOP_VINFO_LOOP (epilogue_vinfo) = epilog;
2937
2938      loop_constraint_clear (epilog, LOOP_C_INFINITE);
2939
2940      /* We now must calculate the number of NITERS performed by the previous
2941	 loop and EPILOGUE_NITERS to be performed by the epilogue.  */
2942      tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf),
2943				 niters_prolog, niters_vector_mult_vf);
2944
2945      /* If skip_vector we may skip the previous loop, we insert a phi-node to
2946	 determine whether we are coming from the previous vectorized loop
2947	 using the update_e edge or the skip_vector basic block using the
2948	 skip_e edge.  */
2949      if (skip_vector)
2950	{
2951	  gcc_assert (update_e != NULL && skip_e != NULL);
2952	  gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)),
2953					   update_e->dest);
2954	  tree new_ssa = make_ssa_name (TREE_TYPE (niters));
2955	  gimple *stmt = gimple_build_assign (new_ssa, niters);
2956	  gimple_stmt_iterator gsi;
2957	  if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME
2958	      && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL)
2959	    {
2960	      gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf));
2961	      gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
2962	    }
2963	  else
2964	    {
2965	      gsi = gsi_last_bb (update_e->src);
2966	      gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2967	    }
2968
2969	  niters = new_ssa;
2970	  add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION);
2971	  add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e,
2972		       UNKNOWN_LOCATION);
2973	  niters = PHI_RESULT (new_phi);
2974	}
2975
2976      /* Subtract the number of iterations performed by the vectorized loop
2977	 from the number of total iterations.  */
2978      tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters),
2979					  before_loop_niters,
2980					  niters);
2981
2982      LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters;
2983      LOOP_VINFO_NITERSM1 (epilogue_vinfo)
2984	= fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters),
2985		       epilogue_niters,
2986		       build_one_cst (TREE_TYPE (epilogue_niters)));
2987
2988      /* Set ADVANCE to the number of iterations performed by the previous
2989	 loop and its prologue.  */
2990      *advance = niters;
2991
2992      /* Redo the peeling for niter analysis as the NITERs and alignment
2993	 may have been updated to take the main loop into account.  */
2994      determine_peel_for_niter (epilogue_vinfo);
2995    }
2996
2997  adjust_vec.release ();
2998  free_original_copy_tables ();
2999
3000  return vect_epilogues ? epilog : NULL;
3001}
3002
3003/* Function vect_create_cond_for_niters_checks.
3004
3005   Create a conditional expression that represents the run-time checks for
3006   loop's niter.  The loop is guaranteed to terminate if the run-time
3007   checks hold.
3008
3009   Input:
3010   COND_EXPR  - input conditional expression.  New conditions will be chained
3011		with logical AND operation.  If it is NULL, then the function
3012		is used to return the number of alias checks.
3013   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3014		to be checked.
3015
3016   Output:
3017   COND_EXPR - conditional expression.
3018
3019   The returned COND_EXPR is the conditional expression to be used in the
3020   if statement that controls which version of the loop gets executed at
3021   runtime.  */
3022
3023static void
3024vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
3025{
3026  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
3027
3028  if (*cond_expr)
3029    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3030			      *cond_expr, part_cond_expr);
3031  else
3032    *cond_expr = part_cond_expr;
3033}
3034
3035/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3036   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
3037
3038static void
3039chain_cond_expr (tree *cond_expr, tree part_cond_expr)
3040{
3041  if (*cond_expr)
3042    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3043			      *cond_expr, part_cond_expr);
3044  else
3045    *cond_expr = part_cond_expr;
3046}
3047
3048/* Function vect_create_cond_for_align_checks.
3049
3050   Create a conditional expression that represents the alignment checks for
3051   all of data references (array element references) whose alignment must be
3052   checked at runtime.
3053
3054   Input:
3055   COND_EXPR  - input conditional expression.  New conditions will be chained
3056                with logical AND operation.
3057   LOOP_VINFO - two fields of the loop information are used.
3058                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
3059                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
3060
3061   Output:
3062   COND_EXPR_STMT_LIST - statements needed to construct the conditional
3063                         expression.
3064   The returned value is the conditional expression to be used in the if
3065   statement that controls which version of the loop gets executed at runtime.
3066
3067   The algorithm makes two assumptions:
3068     1) The number of bytes "n" in a vector is a power of 2.
3069     2) An address "a" is aligned if a%n is zero and that this
3070        test can be done as a&(n-1) == 0.  For example, for 16
3071        byte vectors the test is a&0xf == 0.  */
3072
3073static void
3074vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
3075                                   tree *cond_expr,
3076				   gimple_seq *cond_expr_stmt_list)
3077{
3078  vec<stmt_vec_info> may_misalign_stmts
3079    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
3080  stmt_vec_info stmt_info;
3081  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
3082  tree mask_cst;
3083  unsigned int i;
3084  tree int_ptrsize_type;
3085  char tmp_name[20];
3086  tree or_tmp_name = NULL_TREE;
3087  tree and_tmp_name;
3088  gimple *and_stmt;
3089  tree ptrsize_zero;
3090  tree part_cond_expr;
3091
3092  /* Check that mask is one less than a power of 2, i.e., mask is
3093     all zeros followed by all ones.  */
3094  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
3095
3096  int_ptrsize_type = signed_type_for (ptr_type_node);
3097
3098  /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
3099     of the first vector of the i'th data reference. */
3100
3101  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
3102    {
3103      gimple_seq new_stmt_list = NULL;
3104      tree addr_base;
3105      tree addr_tmp_name;
3106      tree new_or_tmp_name;
3107      gimple *addr_stmt, *or_stmt;
3108      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3109      bool negative = tree_int_cst_compare
3110	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
3111      tree offset = negative
3112	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
3113
3114      /* create: addr_tmp = (int)(address_of_first_vector) */
3115      addr_base =
3116	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
3117					      offset);
3118      if (new_stmt_list != NULL)
3119	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
3120
3121      sprintf (tmp_name, "addr2int%d", i);
3122      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3123      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
3124      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
3125
3126      /* The addresses are OR together.  */
3127
3128      if (or_tmp_name != NULL_TREE)
3129        {
3130          /* create: or_tmp = or_tmp | addr_tmp */
3131          sprintf (tmp_name, "orptrs%d", i);
3132	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
3133	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
3134					 or_tmp_name, addr_tmp_name);
3135	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
3136          or_tmp_name = new_or_tmp_name;
3137        }
3138      else
3139        or_tmp_name = addr_tmp_name;
3140
3141    } /* end for i */
3142
3143  mask_cst = build_int_cst (int_ptrsize_type, mask);
3144
3145  /* create: and_tmp = or_tmp & mask  */
3146  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
3147
3148  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
3149				  or_tmp_name, mask_cst);
3150  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
3151
3152  /* Make and_tmp the left operand of the conditional test against zero.
3153     if and_tmp has a nonzero bit then some address is unaligned.  */
3154  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
3155  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
3156				and_tmp_name, ptrsize_zero);
3157  chain_cond_expr (cond_expr, part_cond_expr);
3158}
3159
3160/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
3161   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
3162   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
3163   and this new condition are true.  Treat a null *COND_EXPR as "true".  */
3164
3165static void
3166vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
3167{
3168  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3169  unsigned int i;
3170  vec_object_pair *pair;
3171  FOR_EACH_VEC_ELT (pairs, i, pair)
3172    {
3173      tree addr1 = build_fold_addr_expr (pair->first);
3174      tree addr2 = build_fold_addr_expr (pair->second);
3175      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
3176					 addr1, addr2);
3177      chain_cond_expr (cond_expr, part_cond_expr);
3178    }
3179}
3180
3181/* Create an expression that is true when all lower-bound conditions for
3182   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
3183
3184static void
3185vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
3186{
3187  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3188  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3189    {
3190      tree expr = lower_bounds[i].expr;
3191      tree type = unsigned_type_for (TREE_TYPE (expr));
3192      expr = fold_convert (type, expr);
3193      poly_uint64 bound = lower_bounds[i].min_value;
3194      if (!lower_bounds[i].unsigned_p)
3195	{
3196	  expr = fold_build2 (PLUS_EXPR, type, expr,
3197			      build_int_cstu (type, bound - 1));
3198	  bound += bound - 1;
3199	}
3200      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
3201					 build_int_cstu (type, bound));
3202      chain_cond_expr (cond_expr, part_cond_expr);
3203    }
3204}
3205
3206/* Function vect_create_cond_for_alias_checks.
3207
3208   Create a conditional expression that represents the run-time checks for
3209   overlapping of address ranges represented by a list of data references
3210   relations passed as input.
3211
3212   Input:
3213   COND_EXPR  - input conditional expression.  New conditions will be chained
3214                with logical AND operation.  If it is NULL, then the function
3215                is used to return the number of alias checks.
3216   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
3217	        to be checked.
3218
3219   Output:
3220   COND_EXPR - conditional expression.
3221
3222   The returned COND_EXPR is the conditional expression to be used in the if
3223   statement that controls which version of the loop gets executed at runtime.
3224*/
3225
3226void
3227vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
3228{
3229  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
3230    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3231
3232  if (comp_alias_ddrs.is_empty ())
3233    return;
3234
3235  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
3236			       &comp_alias_ddrs, cond_expr);
3237  if (dump_enabled_p ())
3238    dump_printf_loc (MSG_NOTE, vect_location,
3239		     "created %u versioning for alias checks.\n",
3240		     comp_alias_ddrs.length ());
3241}
3242
3243
3244/* Function vect_loop_versioning.
3245
3246   If the loop has data references that may or may not be aligned or/and
3247   has data reference relations whose independence was not proven then
3248   two versions of the loop need to be generated, one which is vectorized
3249   and one which isn't.  A test is then generated to control which of the
3250   loops is executed.  The test checks for the alignment of all of the
3251   data references that may or may not be aligned.  An additional
3252   sequence of runtime tests is generated for each pairs of DDRs whose
3253   independence was not proven.  The vectorized version of loop is
3254   executed only if both alias and alignment tests are passed.
3255
3256   The test generated to check which version of loop is executed
3257   is modified to also check for profitability as indicated by the
3258   cost model threshold TH.
3259
3260   The versioning precondition(s) are placed in *COND_EXPR and
3261   *COND_EXPR_STMT_LIST.  */
3262
3263class loop *
3264vect_loop_versioning (loop_vec_info loop_vinfo,
3265		      gimple *loop_vectorized_call)
3266{
3267  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
3268  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
3269  basic_block condition_bb;
3270  gphi_iterator gsi;
3271  gimple_stmt_iterator cond_exp_gsi;
3272  basic_block merge_bb;
3273  basic_block new_exit_bb;
3274  edge new_exit_e, e;
3275  gphi *orig_phi, *new_phi;
3276  tree cond_expr = NULL_TREE;
3277  gimple_seq cond_expr_stmt_list = NULL;
3278  tree arg;
3279  profile_probability prob = profile_probability::likely ();
3280  gimple_seq gimplify_stmt_list = NULL;
3281  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3282  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3283  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3284  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3285  poly_uint64 versioning_threshold
3286    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3287  tree version_simd_if_cond
3288    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3289  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3290
3291  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3292      && !ordered_p (th, versioning_threshold))
3293    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3294			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3295					    th - 1));
3296  if (maybe_ne (versioning_threshold, 0U))
3297    {
3298      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3299			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3300					      versioning_threshold - 1));
3301      if (cond_expr)
3302	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3303				 expr, cond_expr);
3304      else
3305	cond_expr = expr;
3306    }
3307
3308  if (version_niter)
3309    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3310
3311  if (cond_expr)
3312    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3313					&cond_expr_stmt_list,
3314					is_gimple_condexpr, NULL_TREE);
3315
3316  if (version_align)
3317    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3318				       &cond_expr_stmt_list);
3319
3320  if (version_alias)
3321    {
3322      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3323      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3324      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3325    }
3326
3327  if (version_simd_if_cond)
3328    {
3329      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3330      if (flag_checking)
3331	if (basic_block bb
3332	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3333	  gcc_assert (bb != loop->header
3334		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3335		      && (scalar_loop == NULL
3336			  || (bb != scalar_loop->header
3337			      && dominated_by_p (CDI_DOMINATORS,
3338						 scalar_loop->header, bb))));
3339      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3340      tree c = fold_build2 (NE_EXPR, boolean_type_node,
3341			    version_simd_if_cond, zero);
3342      if (cond_expr)
3343        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3344				 c, cond_expr);
3345      else
3346        cond_expr = c;
3347      if (dump_enabled_p ())
3348	dump_printf_loc (MSG_NOTE, vect_location,
3349			 "created versioning for simd if condition check.\n");
3350    }
3351
3352  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3353				      &gimplify_stmt_list,
3354				      is_gimple_condexpr, NULL_TREE);
3355  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3356
3357  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3358     invariant in.  */
3359  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3360  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3361       !gsi_end_p (gsi); gsi_next (&gsi))
3362    {
3363      gimple *stmt = gsi_stmt (gsi);
3364      update_stmt (stmt);
3365      ssa_op_iter iter;
3366      use_operand_p use_p;
3367      basic_block def_bb;
3368      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3369	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3370	    && flow_bb_inside_loop_p (outermost, def_bb))
3371	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3372    }
3373
3374  /* Search for the outermost loop we can version.  Avoid versioning of
3375     non-perfect nests but allow if-conversion versioned loops inside.  */
3376  class loop *loop_to_version = loop;
3377  if (flow_loop_nested_p (outermost, loop))
3378    {
3379      if (dump_enabled_p ())
3380	dump_printf_loc (MSG_NOTE, vect_location,
3381			 "trying to apply versioning to outer loop %d\n",
3382			 outermost->num);
3383      if (outermost->num == 0)
3384	outermost = superloop_at_depth (loop, 1);
3385      /* And avoid applying versioning on non-perfect nests.  */
3386      while (loop_to_version != outermost
3387	     && (e = single_exit (loop_outer (loop_to_version)))
3388	     && !(e->flags & EDGE_COMPLEX)
3389	     && (!loop_outer (loop_to_version)->inner->next
3390		 || vect_loop_vectorized_call (loop_to_version))
3391	     && (!loop_outer (loop_to_version)->inner->next
3392		 || !loop_outer (loop_to_version)->inner->next->next))
3393	loop_to_version = loop_outer (loop_to_version);
3394    }
3395
3396  /* Apply versioning.  If there is already a scalar version created by
3397     if-conversion re-use that.  Note we cannot re-use the copy of
3398     an if-converted outer-loop when vectorizing the inner loop only.  */
3399  gcond *cond;
3400  if ((!loop_to_version->inner || loop == loop_to_version)
3401      && loop_vectorized_call)
3402    {
3403      gcc_assert (scalar_loop);
3404      condition_bb = gimple_bb (loop_vectorized_call);
3405      cond = as_a <gcond *> (last_stmt (condition_bb));
3406      gimple_cond_set_condition_from_tree (cond, cond_expr);
3407      update_stmt (cond);
3408
3409      if (cond_expr_stmt_list)
3410	{
3411	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
3412	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3413				 GSI_SAME_STMT);
3414	}
3415
3416      /* if-conversion uses profile_probability::always () for both paths,
3417	 reset the paths probabilities appropriately.  */
3418      edge te, fe;
3419      extract_true_false_edges_from_block (condition_bb, &te, &fe);
3420      te->probability = prob;
3421      fe->probability = prob.invert ();
3422      /* We can scale loops counts immediately but have to postpone
3423         scaling the scalar loop because we re-use it during peeling.  */
3424      scale_loop_frequencies (loop_to_version, te->probability);
3425      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;
3426
3427      nloop = scalar_loop;
3428      if (dump_enabled_p ())
3429	dump_printf_loc (MSG_NOTE, vect_location,
3430			 "reusing %sloop version created by if conversion\n",
3431			 loop_to_version != loop ? "outer " : "");
3432    }
3433  else
3434    {
3435      if (loop_to_version != loop
3436	  && dump_enabled_p ())
3437	dump_printf_loc (MSG_NOTE, vect_location,
3438			 "applying loop versioning to outer loop %d\n",
3439			 loop_to_version->num);
3440
3441      initialize_original_copy_tables ();
3442      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3443			    prob, prob.invert (), prob, prob.invert (), true);
3444      gcc_assert (nloop);
3445      nloop = get_loop_copy (loop);
3446
3447      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
3448         reap those otherwise;  they also refer to the original
3449	 loops.  */
3450      class loop *l = loop;
3451      while (gimple *call = vect_loop_vectorized_call (l))
3452	{
3453	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3454	  fold_loop_internal_call (call, boolean_false_node);
3455	  l = loop_outer (l);
3456	}
3457      free_original_copy_tables ();
3458
3459      if (cond_expr_stmt_list)
3460	{
3461	  cond_exp_gsi = gsi_last_bb (condition_bb);
3462	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3463				 GSI_SAME_STMT);
3464	}
3465
3466      /* Loop versioning violates an assumption we try to maintain during
3467	 vectorization - that the loop exit block has a single predecessor.
3468	 After versioning, the exit block of both loop versions is the same
3469	 basic block (i.e. it has two predecessors). Just in order to simplify
3470	 following transformations in the vectorizer, we fix this situation
3471	 here by adding a new (empty) block on the exit-edge of the loop,
3472	 with the proper loop-exit phis to maintain loop-closed-form.
3473	 If loop versioning wasn't done from loop, but scalar_loop instead,
3474	 merge_bb will have already just a single successor.  */
3475
3476      merge_bb = single_exit (loop_to_version)->dest;
3477      if (EDGE_COUNT (merge_bb->preds) >= 2)
3478	{
3479	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3480	  new_exit_bb = split_edge (single_exit (loop_to_version));
3481	  new_exit_e = single_exit (loop_to_version);
3482	  e = EDGE_SUCC (new_exit_bb, 0);
3483
3484	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3485	       gsi_next (&gsi))
3486	    {
3487	      tree new_res;
3488	      orig_phi = gsi.phi ();
3489	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3490	      new_phi = create_phi_node (new_res, new_exit_bb);
3491	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3492	      add_phi_arg (new_phi, arg, new_exit_e,
3493			   gimple_phi_arg_location_from_edge (orig_phi, e));
3494	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3495	    }
3496	}
3497
3498      update_ssa (TODO_update_ssa);
3499    }
3500
3501  if (version_niter)
3502    {
3503      /* The versioned loop could be infinite, we need to clear existing
3504	 niter information which is copied from the original loop.  */
3505      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3506      vect_free_loop_info_assumptions (nloop);
3507      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3508      loop_constraint_set (loop, LOOP_C_INFINITE);
3509    }
3510
3511  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3512      && dump_enabled_p ())
3513    {
3514      if (version_alias)
3515        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3516			 vect_location,
3517                         "loop versioned for vectorization because of "
3518			 "possible aliasing\n");
3519      if (version_align)
3520        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3521			 vect_location,
3522                         "loop versioned for vectorization to enhance "
3523			 "alignment\n");
3524
3525    }
3526
3527  return nloop;
3528}
3529