tree-vect-loop-manip.c revision 1.12
1/* Vectorizer Specific Loop Manipulations
2   Copyright (C) 2003-2019 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com>
4   and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "tree.h"
27#include "gimple.h"
28#include "cfghooks.h"
29#include "tree-pass.h"
30#include "ssa.h"
31#include "fold-const.h"
32#include "cfganal.h"
33#include "gimplify.h"
34#include "gimple-iterator.h"
35#include "gimplify-me.h"
36#include "tree-cfg.h"
37#include "tree-ssa-loop-manip.h"
38#include "tree-into-ssa.h"
39#include "tree-ssa.h"
40#include "cfgloop.h"
41#include "tree-scalar-evolution.h"
42#include "tree-vectorizer.h"
43#include "tree-ssa-loop-ivopts.h"
44#include "gimple-fold.h"
45#include "tree-ssa-loop-niter.h"
46#include "internal-fn.h"
47#include "stor-layout.h"
48#include "optabs-query.h"
49#include "vec-perm-indices.h"
50
51/*************************************************************************
52  Simple Loop Peeling Utilities
53
54  Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58/* Renames the use *OP_P.  */
59
60static void
61rename_use_op (use_operand_p op_p)
62{
63  tree new_name;
64
65  if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66    return;
67
68  new_name = get_current_def (USE_FROM_PTR (op_p));
69
70  /* Something defined outside of the loop.  */
71  if (!new_name)
72    return;
73
74  /* An ordinary ssa name defined in the loop.  */
75
76  SET_USE (op_p, new_name);
77}
78
79
80/* Renames the variables in basic block BB.  Allow renaming of PHI arguments
81   on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP is
82   true.  */
83
84static void
85rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86{
87  gimple *stmt;
88  use_operand_p use_p;
89  ssa_op_iter iter;
90  edge e;
91  edge_iterator ei;
92  struct loop *loop = bb->loop_father;
93  struct loop *outer_loop = NULL;
94
95  if (rename_from_outer_loop)
96    {
97      gcc_assert (loop);
98      outer_loop = loop_outer (loop);
99    }
100
101  for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102       gsi_next (&gsi))
103    {
104      stmt = gsi_stmt (gsi);
105      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106	rename_use_op (use_p);
107    }
108
109  FOR_EACH_EDGE (e, ei, bb->preds)
110    {
111      if (!flow_bb_inside_loop_p (loop, e->src))
112	{
113	  if (!rename_from_outer_loop)
114	    continue;
115	  if (e->src != outer_loop->header)
116	    {
117	      if (outer_loop->inner->next)
118		{
119		  /* If outer_loop has 2 inner loops, allow there to
120		     be an extra basic block which decides between the
121		     two loops based on LOOP_VECTORIZED.  */
122		  if (!single_pred_p (e->src)
123		      || single_pred (e->src) != outer_loop->header)
124		    continue;
125		}
126	    }
127	}
128      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129	   gsi_next (&gsi))
130        rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131    }
132}
133
134
135struct adjust_info
136{
137  tree from, to;
138  basic_block bb;
139};
140
141/* A stack of values to be adjusted in debug stmts.  We have to
142   process them LIFO, so that the closest substitution applies.  If we
143   processed them FIFO, without the stack, we might substitute uses
144   with a PHI DEF that would soon become non-dominant, and when we got
145   to the suitable one, it wouldn't have anything to substitute any
146   more.  */
147static vec<adjust_info, va_heap> adjust_vec;
148
149/* Adjust any debug stmts that referenced AI->from values to use the
150   loop-closed AI->to, if the references are dominated by AI->bb and
151   not by the definition of AI->from.  */
152
153static void
154adjust_debug_stmts_now (adjust_info *ai)
155{
156  basic_block bbphi = ai->bb;
157  tree orig_def = ai->from;
158  tree new_def = ai->to;
159  imm_use_iterator imm_iter;
160  gimple *stmt;
161  basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163  gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165  /* Adjust any debug stmts that held onto non-loop-closed
166     references.  */
167  FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168    {
169      use_operand_p use_p;
170      basic_block bbuse;
171
172      if (!is_gimple_debug (stmt))
173	continue;
174
175      gcc_assert (gimple_debug_bind_p (stmt));
176
177      bbuse = gimple_bb (stmt);
178
179      if ((bbuse == bbphi
180	   || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181	  && !(bbuse == bbdef
182	       || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183	{
184	  if (new_def)
185	    FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186	      SET_USE (use_p, new_def);
187	  else
188	    {
189	      gimple_debug_bind_reset_value (stmt);
190	      update_stmt (stmt);
191	    }
192	}
193    }
194}
195
196/* Adjust debug stmts as scheduled before.  */
197
198static void
199adjust_vec_debug_stmts (void)
200{
201  if (!MAY_HAVE_DEBUG_BIND_STMTS)
202    return;
203
204  gcc_assert (adjust_vec.exists ());
205
206  while (!adjust_vec.is_empty ())
207    {
208      adjust_debug_stmts_now (&adjust_vec.last ());
209      adjust_vec.pop ();
210    }
211}
212
213/* Adjust any debug stmts that referenced FROM values to use the
214   loop-closed TO, if the references are dominated by BB and not by
215   the definition of FROM.  If adjust_vec is non-NULL, adjustments
216   will be postponed until adjust_vec_debug_stmts is called.  */
217
218static void
219adjust_debug_stmts (tree from, tree to, basic_block bb)
220{
221  adjust_info ai;
222
223  if (MAY_HAVE_DEBUG_BIND_STMTS
224      && TREE_CODE (from) == SSA_NAME
225      && ! SSA_NAME_IS_DEFAULT_DEF (from)
226      && ! virtual_operand_p (from))
227    {
228      ai.from = from;
229      ai.to = to;
230      ai.bb = bb;
231
232      if (adjust_vec.exists ())
233	adjust_vec.safe_push (ai);
234      else
235	adjust_debug_stmts_now (&ai);
236    }
237}
238
239/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240   to adjust any debug stmts that referenced the old phi arg,
241   presumably non-loop-closed references left over from other
242   transformations.  */
243
244static void
245adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246{
247  tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249  SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251  if (MAY_HAVE_DEBUG_BIND_STMTS)
252    adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253			gimple_bb (update_phi));
254}
255
256/* Define one loop mask MASK from loop LOOP.  INIT_MASK is the value that
257   the mask should have during the first iteration and NEXT_MASK is the
258   value that it should have on subsequent iterations.  */
259
260static void
261vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262		    tree next_mask)
263{
264  gphi *phi = create_phi_node (mask, loop->header);
265  add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266  add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267}
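
/* As a rough illustration (the SSA names are made up), the PHI created
   above for one rgroup mask looks like:

     loop_mask_6 = PHI <init_mask_5 (preheader), next_mask_7 (latch)>

   so the mask has the value INIT_MASK when the loop is entered and the
   value NEXT_MASK, computed near the exit test, on every later iteration.  */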
268
269/* Add SEQ to the end of LOOP's preheader block.  */
270
271static void
272add_preheader_seq (struct loop *loop, gimple_seq seq)
273{
274  if (seq)
275    {
276      edge pe = loop_preheader_edge (loop);
277      basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278      gcc_assert (!new_bb);
279    }
280}
281
282/* Add SEQ to the beginning of LOOP's header block.  */
283
284static void
285add_header_seq (struct loop *loop, gimple_seq seq)
286{
287  if (seq)
288    {
289      gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291    }
292}
293
294/* Return true if the target can interleave elements of two vectors.
295   OFFSET is 0 if the first half of the vectors should be interleaved
296   or 1 if the second half should.  When returning true, store the
297   associated permutation in INDICES.  */
298
299static bool
300interleave_supported_p (vec_perm_indices *indices, tree vectype,
301			unsigned int offset)
302{
303  poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304  poly_uint64 base = exact_div (nelts, 2) * offset;
305  vec_perm_builder sel (nelts, 2, 3);
306  for (unsigned int i = 0; i < 3; ++i)
307    {
308      sel.quick_push (base + i);
309      sel.quick_push (base + i + nelts);
310    }
311  indices->new_vector (sel, 2, nelts);
312  return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313}
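
/* For example, with NELTS == 8 and OFFSET == 0 the series built above
   describes the low-half interleave of two vectors:

     { 0, 8, 1, 9, 2, 10, 3, 11 }

   while OFFSET == 1 gives BASE == 4 and the high-half interleave:

     { 4, 12, 5, 13, 6, 14, 7, 15 }

   Only the first three pairs are pushed explicitly; the vec_perm_builder
   encoding extends the series to cover the whole vector.  */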
314
315/* Try to use permutes to define the masks in DEST_RGM using the masks
316   in SRC_RGM, given that the former has twice as many masks as the
317   latter.  Return true on success, adding any new statements to SEQ.  */
318
319static bool
320vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321			       rgroup_masks *src_rgm)
322{
323  tree src_masktype = src_rgm->mask_type;
324  tree dest_masktype = dest_rgm->mask_type;
325  machine_mode src_mode = TYPE_MODE (src_masktype);
326  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327      && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328      && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329    {
330      /* Unpacking the source masks gives at least as many mask bits as
331	 we need.  We can then VIEW_CONVERT any excess bits away.  */
332      tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334	{
335	  tree src = src_rgm->masks[i / 2];
336	  tree dest = dest_rgm->masks[i];
337	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338			    ? VEC_UNPACK_HI_EXPR
339			    : VEC_UNPACK_LO_EXPR);
340	  gassign *stmt;
341	  if (dest_masktype == unpack_masktype)
342	    stmt = gimple_build_assign (dest, code, src);
343	  else
344	    {
345	      tree temp = make_ssa_name (unpack_masktype);
346	      stmt = gimple_build_assign (temp, code, src);
347	      gimple_seq_add_stmt (seq, stmt);
348	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349					  build1 (VIEW_CONVERT_EXPR,
350						  dest_masktype, temp));
351	    }
352	  gimple_seq_add_stmt (seq, stmt);
353	}
354      return true;
355    }
356  vec_perm_indices indices[2];
357  if (dest_masktype == src_masktype
358      && interleave_supported_p (&indices[0], src_masktype, 0)
359      && interleave_supported_p (&indices[1], src_masktype, 1))
360    {
361      /* The destination requires twice as many mask bits as the source, so
362	 we can use interleaving permutes to double up the number of bits.  */
363      tree masks[2];
364      for (unsigned int i = 0; i < 2; ++i)
365	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367	{
368	  tree src = src_rgm->masks[i / 2];
369	  tree dest = dest_rgm->masks[i];
370	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371					      src, src, masks[i & 1]);
372	  gimple_seq_add_stmt (seq, stmt);
373	}
374      return true;
375    }
376  return false;
377}
378
379/* Helper for vect_set_loop_condition_masked.  Generate definitions for
380   all the masks in RGM and return a mask that is nonzero when the loop
381   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
382   Use LOOP_COND_GSI to insert code before the exit gcond.
383
384   RGM belongs to loop LOOP.  The loop originally iterated NITERS
385   times and has been vectorized according to LOOP_VINFO.  Each iteration
386   of the vectorized loop handles VF iterations of the scalar loop.
387
388   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389   starts with NITERS_SKIP dummy iterations of the scalar loop before
390   the real work starts.  The mask elements for these dummy iterations
391   must be 0, to ensure that the extra iterations do not have an effect.
392
393   It is known that:
394
395     NITERS * RGM->max_nscalars_per_iter
396
397   does not overflow.  However, MIGHT_WRAP_P says whether an induction
398   variable that starts at 0 and has step:
399
400     VF * RGM->max_nscalars_per_iter
401
402   might overflow before hitting a value above:
403
404     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406   This means that we cannot guarantee that such an induction variable
407   would ever hit a value that produces a set of all-false masks for RGM.  */
408
409static tree
410vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411			      gimple_seq *preheader_seq,
412			      gimple_stmt_iterator loop_cond_gsi,
413			      rgroup_masks *rgm, tree vf,
414			      tree niters, tree niters_skip,
415			      bool might_wrap_p)
416{
417  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418  tree mask_type = rgm->mask_type;
419  unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
420  poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
421
422  /* Calculate the maximum number of scalar values that the rgroup
423     handles in total, the number that it handles for each iteration
424     of the vector loop, and the number that it should skip during the
425     first iteration of the vector loop.  */
426  tree nscalars_total = niters;
427  tree nscalars_step = vf;
428  tree nscalars_skip = niters_skip;
429  if (nscalars_per_iter != 1)
430    {
431      /* We checked before choosing to use a fully-masked loop that these
432	 multiplications don't overflow.  */
433      tree factor = build_int_cst (compare_type, nscalars_per_iter);
434      nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
435				     nscalars_total, factor);
436      nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
437				    nscalars_step, factor);
438      if (nscalars_skip)
439	nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
440				      nscalars_skip, factor);
441    }
442
443  /* Create an induction variable that counts the number of scalars
444     processed.  */
445  tree index_before_incr, index_after_incr;
446  gimple_stmt_iterator incr_gsi;
447  bool insert_after;
448  tree zero_index = build_int_cst (compare_type, 0);
449  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450  create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
451	     insert_after, &index_before_incr, &index_after_incr);
452
453  tree test_index, test_limit, first_limit;
454  gimple_stmt_iterator *test_gsi;
455  if (might_wrap_p)
456    {
457      /* In principle the loop should stop iterating once the incremented
458	 IV reaches a value greater than or equal to:
459
460	   NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
461
462	 However, there's no guarantee that this addition doesn't overflow
463	 the comparison type, or that the IV hits a value above it before
464	 wrapping around.  We therefore adjust the limit down by one
465	 IV step:
466
467	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
468	   -[infinite-prec] NSCALARS_STEP
469
470	 and compare the IV against this limit _before_ incrementing it.
471	 Since the comparison type is unsigned, we actually want the
472	 subtraction to saturate at zero:
473
474	   (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
475	   -[sat] NSCALARS_STEP
476
477	 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
478
479	   NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
480
481	 where the rightmost subtraction can be done directly in
482	 COMPARE_TYPE.  */
483      test_index = index_before_incr;
484      tree adjust = nscalars_step;
485      if (nscalars_skip)
486	adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
487			       adjust, nscalars_skip);
488      test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
489				 nscalars_total, adjust);
490      test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
491				 test_limit, adjust);
492      test_gsi = &incr_gsi;
493
494      /* Get a safe limit for the first iteration.  */
495      if (nscalars_skip)
496	{
497	  /* The first vector iteration can handle at most NSCALARS_STEP
498	     scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
499	     NSCALARS_SKIP to that cannot overflow.  */
500	  tree const_limit = build_int_cst (compare_type,
501					    LOOP_VINFO_VECT_FACTOR (loop_vinfo)
502					    * nscalars_per_iter);
503	  first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
504				      nscalars_total, const_limit);
505	  first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
506				      first_limit, nscalars_skip);
507	}
508      else
509	/* For the first iteration it doesn't matter whether the IV hits
510	   a value above NSCALARS_TOTAL.  That only matters for the latch
511	   condition.  */
512	first_limit = nscalars_total;
513    }
514  else
515    {
516      /* Test the incremented IV, which will always hit a value above
517	 the bound before wrapping.  */
518      test_index = index_after_incr;
519      test_limit = nscalars_total;
520      if (nscalars_skip)
521	test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
522				   test_limit, nscalars_skip);
523      test_gsi = &loop_cond_gsi;
524
525      first_limit = test_limit;
526    }
527
528  /* Provide a definition of each mask in the group.  */
529  tree next_mask = NULL_TREE;
530  tree mask;
531  unsigned int i;
532  FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
533    {
534      /* Previous masks will cover BIAS scalars.  This mask covers the
535	 next batch.  */
536      poly_uint64 bias = nscalars_per_mask * i;
537      tree bias_tree = build_int_cst (compare_type, bias);
538      gimple *tmp_stmt;
539
540      /* See whether the first iteration of the vector loop is known
541	 to have a full mask.  */
542      poly_uint64 const_limit;
543      bool first_iteration_full
544	= (poly_int_tree_p (first_limit, &const_limit)
545	   && known_ge (const_limit, (i + 1) * nscalars_per_mask));
546
547      /* Rather than have a new IV that starts at BIAS and goes up to
548	 TEST_LIMIT, prefer to use the same 0-based IV for each mask
549	 and adjust the bound down by BIAS.  */
550      tree this_test_limit = test_limit;
551      if (i != 0)
552	{
553	  this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
554					  compare_type, this_test_limit,
555					  bias_tree);
556	  this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
557					  compare_type, this_test_limit,
558					  bias_tree);
559	}
560
561      /* Create the initial mask.  First include all scalars that
562	 are within the loop limit.  */
563      tree init_mask = NULL_TREE;
564      if (!first_iteration_full)
565	{
566	  tree start, end;
567	  if (first_limit == test_limit)
568	    {
569	      /* Use a natural test between zero (the initial IV value)
570		 and the loop limit.  The "else" block would be valid too,
571		 but this choice can avoid the need to load BIAS_TREE into
572		 a register.  */
573	      start = zero_index;
574	      end = this_test_limit;
575	    }
576	  else
577	    {
578	      /* FIRST_LIMIT is the maximum number of scalars handled by the
579		 first iteration of the vector loop.  Test the portion
580		 associated with this mask.  */
581	      start = bias_tree;
582	      end = first_limit;
583	    }
584
585	  init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
586	  tmp_stmt = vect_gen_while (init_mask, start, end);
587	  gimple_seq_add_stmt (preheader_seq, tmp_stmt);
588	}
589
590      /* Now AND out the bits that are within the number of skipped
591	 scalars.  */
592      poly_uint64 const_skip;
593      if (nscalars_skip
594	  && !(poly_int_tree_p (nscalars_skip, &const_skip)
595	       && known_le (const_skip, bias)))
596	{
597	  tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
598						    bias_tree, nscalars_skip);
599	  if (init_mask)
600	    init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
601				      init_mask, unskipped_mask);
602	  else
603	    init_mask = unskipped_mask;
604	}
605
606      if (!init_mask)
607	/* First iteration is full.  */
608	init_mask = build_minus_one_cst (mask_type);
609
610      /* Get the mask value for the next iteration of the loop.  */
611      next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
612      gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
613      gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
614
615      vect_set_loop_mask (loop, mask, init_mask, next_mask);
616    }
617  return next_mask;
618}
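
/* A minimal scalar sketch of the structure created above, assuming a
   single rgroup with one mask, NSCALARS_PER_ITER == 1, no skipped
   iterations and a (hypothetical) vectorization factor of 4:

     iv = 0;
     mask = WHILE_ULT (0, limit);
     do
       {
         ... loads and stores predicated on mask ...
         iv += 4;
         mask = WHILE_ULT (iv, limit);
       }
     while (mask != all-false);

   WHILE_ULT (x, limit) sets lane L when x + L < limit, so for
   NITERS == 10 the three vector iterations see the masks {1,1,1,1},
   {1,1,1,1} and {1,1,0,0}, and the fourth WHILE_ULT result is all-false,
   which ends the loop.  */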
619
620/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
621   LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
622   number of iterations of the original scalar loop that should be
623   handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
624   as for vect_set_loop_condition.
625
626   Insert the branch-back condition before LOOP_COND_GSI and return the
627   final gcond.  */
628
629static gcond *
630vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
631				tree niters, tree final_iv,
632				bool niters_maybe_zero,
633				gimple_stmt_iterator loop_cond_gsi)
634{
635  gimple_seq preheader_seq = NULL;
636  gimple_seq header_seq = NULL;
637
638  tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
639  unsigned int compare_precision = TYPE_PRECISION (compare_type);
640  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
641  tree orig_niters = niters;
642
643  /* Type of the initial value of NITERS.  */
644  tree ni_actual_type = TREE_TYPE (niters);
645  unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
646
647  /* Convert NITERS to the same size as the compare.  */
648  if (compare_precision > ni_actual_precision
649      && niters_maybe_zero)
650    {
651      /* We know that there is always at least one iteration, so if the
652	 count is zero then it must have wrapped.  Cope with this by
653	 subtracting 1 before the conversion and adding 1 to the result.  */
654      gcc_assert (TYPE_UNSIGNED (ni_actual_type));
655      niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
656			     niters, build_minus_one_cst (ni_actual_type));
657      niters = gimple_convert (&preheader_seq, compare_type, niters);
658      niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
659			     niters, build_one_cst (compare_type));
660    }
661  else
662    niters = gimple_convert (&preheader_seq, compare_type, niters);
663
664  /* Convert skip_niters to the right type.  */
665  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
666
667  /* Now calculate the value that the induction variable must be able
668     to hit in order to ensure that we end the loop with an all-false mask.
669     This involves adding the maximum number of inactive trailing scalar
670     iterations.  */
671  widest_int iv_limit;
672  bool known_max_iters = max_loop_iterations (loop, &iv_limit);
673  if (known_max_iters)
674    {
675      if (niters_skip)
676	{
677	  /* Add the maximum number of skipped iterations to the
678	     maximum iteration count.  */
679	  if (TREE_CODE (niters_skip) == INTEGER_CST)
680	    iv_limit += wi::to_widest (niters_skip);
681	  else
682	    iv_limit += max_vf - 1;
683	}
684      /* IV_LIMIT is the maximum number of latch iterations, which is also
685	 the maximum in-range IV value.  Round this value down to the previous
686	 vector alignment boundary and then add an extra full iteration.  */
687      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
688      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
689    }
690
691  /* Get the vectorization factor in tree form.  */
692  tree vf = build_int_cst (compare_type,
693			   LOOP_VINFO_VECT_FACTOR (loop_vinfo));
694
695  /* Iterate over all the rgroups and fill in their masks.  We could use
696     the first mask from any rgroup for the loop condition; here we
697     arbitrarily pick the last.  */
698  tree test_mask = NULL_TREE;
699  rgroup_masks *rgm;
700  unsigned int i;
701  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
702  FOR_EACH_VEC_ELT (*masks, i, rgm)
703    if (!rgm->masks.is_empty ())
704      {
705	/* First try using permutes.  This adds a single vector
706	   instruction to the loop for each mask, but needs no extra
707	   loop invariants or IVs.  */
708	unsigned int nmasks = i + 1;
709	if ((nmasks & 1) == 0)
710	  {
711	    rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
712	    if (!half_rgm->masks.is_empty ()
713		&& vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
714	      continue;
715	  }
716
717	/* See whether zero-based IV would ever generate all-false masks
718	   before wrapping around.  */
719	bool might_wrap_p
720	  = (!known_max_iters
721	     || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
722				    UNSIGNED)
723		 > compare_precision));
724
725	/* Set up all masks for this group.  */
726	test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
727						  &preheader_seq,
728						  loop_cond_gsi, rgm, vf,
729						  niters, niters_skip,
730						  might_wrap_p);
731      }
732
733  /* Emit all accumulated statements.  */
734  add_preheader_seq (loop, preheader_seq);
735  add_header_seq (loop, header_seq);
736
737  /* Get a boolean result that tells us whether to iterate.  */
738  edge exit_edge = single_exit (loop);
739  tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
740  tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
741  gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
742					NULL_TREE, NULL_TREE);
743  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
744
745  /* The loop iterates (NITERS - 1) / VF + 1 times.
746     Subtract one from this to get the latch count.  */
747  tree step = build_int_cst (compare_type,
748			     LOOP_VINFO_VECT_FACTOR (loop_vinfo));
749  tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
750				       build_minus_one_cst (compare_type));
751  loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
752				     niters_minus_one, step);
753
754  if (final_iv)
755    {
756      gassign *assign = gimple_build_assign (final_iv, orig_niters);
757      gsi_insert_on_edge_immediate (single_exit (loop), assign);
758    }
759
760  return cond_stmt;
761}
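
/* For example, if the scalar loop runs NITERS == 10 times and the
   (hypothetical) vectorization factor is 4, the masked loop executes
   (10 - 1) / 4 + 1 == 3 vector iterations, and loop->nb_iterations,
   the latch count set just above, is (10 - 1) / 4 == 2.  */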
762
763/* Like vect_set_loop_condition, but handle the case in which there
764   are no loop masks.  */
765
766static gcond *
767vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
768				  tree step, tree final_iv,
769				  bool niters_maybe_zero,
770				  gimple_stmt_iterator loop_cond_gsi)
771{
772  tree indx_before_incr, indx_after_incr;
773  gcond *cond_stmt;
774  gcond *orig_cond;
775  edge pe = loop_preheader_edge (loop);
776  edge exit_edge = single_exit (loop);
777  gimple_stmt_iterator incr_gsi;
778  bool insert_after;
779  enum tree_code code;
780  tree niters_type = TREE_TYPE (niters);
781
782  orig_cond = get_loop_exit_condition (loop);
783  gcc_assert (orig_cond);
784  loop_cond_gsi = gsi_for_stmt (orig_cond);
785
786  tree init, limit;
787  if (!niters_maybe_zero && integer_onep (step))
788    {
789      /* In this case we can use a simple 0-based IV:
790
791	 A:
792	   x = 0;
793	   do
794	     {
795	       ...
796	       x += 1;
797	     }
798	   while (x < NITERS);  */
799      code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
800      init = build_zero_cst (niters_type);
801      limit = niters;
802    }
803  else
804    {
805      /* The following works for all values of NITERS except 0:
806
807	 B:
808	   x = 0;
809	   do
810	     {
811	       ...
812	       x += STEP;
813	     }
814	   while (x <= NITERS - STEP);
815
816	 so that the loop continues to iterate if x + STEP - 1 < NITERS
817	 but stops if x + STEP - 1 >= NITERS.
818
819	 However, if NITERS is zero, x never hits a value above NITERS - STEP
820	 before wrapping around.  There are two obvious ways of dealing with
821	 this:
822
823	 - start at STEP - 1 and compare x before incrementing it
824	 - start at -1 and compare x after incrementing it
825
826	 The latter is simpler and is what we use.  The loop in this case
827	 looks like:
828
829	 C:
830	   x = -1;
831	   do
832	     {
833	       ...
834	       x += STEP;
835	     }
836	   while (x < NITERS - STEP);
837
838	 In both cases the loop limit is NITERS - STEP.  */
839      gimple_seq seq = NULL;
840      limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
841      limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
842      if (seq)
843	{
844	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
845	  gcc_assert (!new_bb);
846	}
847      if (niters_maybe_zero)
848	{
849	  /* Case C.  */
850	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
851	  init = build_all_ones_cst (niters_type);
852	}
853      else
854	{
855	  /* Case B.  */
856	  code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
857	  init = build_zero_cst (niters_type);
858	}
859    }
860
861  standard_iv_increment_position (loop, &incr_gsi, &insert_after);
862  create_iv (init, step, NULL_TREE, loop,
863             &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
864  indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
865					      true, NULL_TREE, true,
866					      GSI_SAME_STMT);
867  limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
868				     true, GSI_SAME_STMT);
869
870  cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
871				 NULL_TREE);
872
873  gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
874
875  /* Record the number of latch iterations.  */
876  if (limit == niters)
877    /* Case A: the loop iterates NITERS times.  Subtract one to get the
878       latch count.  */
879    loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
880				       build_int_cst (niters_type, 1));
881  else
882    /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
883       Subtract one from this to get the latch count.  */
884    loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
885				       limit, step);
886
887  if (final_iv)
888    {
889      gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
890					     indx_after_incr, init);
891      gsi_insert_on_edge_immediate (single_exit (loop), assign);
892    }
893
894  return cond_stmt;
895}
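
/* A concrete (illustrative) instance of cases A, B and C above, using
   NITERS == 10, with STEP == 1 for case A and STEP == 4 for cases B and C
   (so the limit NITERS - STEP is 6):

   Case A:  x counts 0, 1, ..., 9 and the loop runs exactly NITERS == 10
     times.

   Case B:  x is 4 and then 8 after the increments; the loop latches once
     (4 <= 6) and exits at 8 (8 > 6), executing (10 - 4) / 4 + 1 == 2
     iterations.

   Case C:  x starts at -1 and is 3 and then 7 after the increments; the
     loop latches at 3 (3 < 6) and exits at 7 (7 >= 6), again executing
     2 iterations.

   In cases B and C loop->nb_iterations, the latch count, is
   (NITERS - STEP) / STEP == 1.  */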
896
897/* If we're using fully-masked loops, make LOOP iterate:
898
899      N == (NITERS - 1) / STEP + 1
900
901   times.  When NITERS is zero, this is equivalent to making the loop
902   execute (1 << M) / STEP times, where M is the precision of NITERS.
903   NITERS_MAYBE_ZERO is true if this last case might occur.
904
905   If we're not using fully-masked loops, make LOOP iterate:
906
907      N == (NITERS - STEP) / STEP + 1
908
909   times, where NITERS is known to be outside the range [1, STEP - 1].
910   This is equivalent to making the loop execute NITERS / STEP times
911   when NITERS is nonzero and (1 << M) / STEP times otherwise.
912   NITERS_MAYBE_ZERO again indicates whether this last case might occur.
913
914   If FINAL_IV is nonnull, it is an SSA name that should be set to
915   N * STEP on exit from the loop.
916
917   Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
918
919void
920vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
921			 tree niters, tree step, tree final_iv,
922			 bool niters_maybe_zero)
923{
924  gcond *cond_stmt;
925  gcond *orig_cond = get_loop_exit_condition (loop);
926  gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
927
928  if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
929    cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
930						final_iv, niters_maybe_zero,
931						loop_cond_gsi);
932  else
933    cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
934						  final_iv, niters_maybe_zero,
935						  loop_cond_gsi);
936
937  /* Remove old loop exit test.  */
938  stmt_vec_info orig_cond_info;
939  if (loop_vinfo
940      && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
941    loop_vinfo->remove_stmt (orig_cond_info);
942  else
943    gsi_remove (&loop_cond_gsi, true);
944
945  if (dump_enabled_p ())
946    dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
947		     cond_stmt);
948}
949
950/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
951   For all PHI arguments in FROM->dest and TO->dest coming from those
952   edges, ensure that each TO->dest PHI argument has its current_def
953   set to that of the corresponding FROM->dest argument.  */
954
955static void
956slpeel_duplicate_current_defs_from_edges (edge from, edge to)
957{
958  gimple_stmt_iterator gsi_from, gsi_to;
959
960  for (gsi_from = gsi_start_phis (from->dest),
961       gsi_to = gsi_start_phis (to->dest);
962       !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
963    {
964      gimple *from_phi = gsi_stmt (gsi_from);
965      gimple *to_phi = gsi_stmt (gsi_to);
966      tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
967      tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
968      if (virtual_operand_p (from_arg))
969	{
970	  gsi_next (&gsi_from);
971	  continue;
972	}
973      if (virtual_operand_p (to_arg))
974	{
975	  gsi_next (&gsi_to);
976	  continue;
977	}
978      if (TREE_CODE (from_arg) != SSA_NAME)
979	gcc_assert (operand_equal_p (from_arg, to_arg, 0));
980      else if (TREE_CODE (to_arg) == SSA_NAME
981	       && from_arg != to_arg)
982	{
983	  if (get_current_def (to_arg) == NULL_TREE)
984	    {
985	      gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
986					      TREE_TYPE (get_current_def
987							   (from_arg))));
988	      set_current_def (to_arg, get_current_def (from_arg));
989	    }
990	}
991      gsi_next (&gsi_from);
992      gsi_next (&gsi_to);
993    }
994
995  gphi *from_phi = get_virtual_phi (from->dest);
996  gphi *to_phi = get_virtual_phi (to->dest);
997  if (from_phi)
998    set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
999		     get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
1000}
1001
1002
1003/* Given LOOP this function generates a new copy of it and puts it
1004   on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
1005   non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
1006   basic blocks from SCALAR_LOOP instead of LOOP, but to either the
1007   entry or exit of LOOP.  */
1008
1009struct loop *
1010slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1011					struct loop *scalar_loop, edge e)
1012{
1013  struct loop *new_loop;
1014  basic_block *new_bbs, *bbs, *pbbs;
1015  bool at_exit;
1016  bool was_imm_dom;
1017  basic_block exit_dest;
1018  edge exit, new_exit;
1019  bool duplicate_outer_loop = false;
1020
1021  exit = single_exit (loop);
1022  at_exit = (e == exit);
1023  if (!at_exit && e != loop_preheader_edge (loop))
1024    return NULL;
1025
1026  if (scalar_loop == NULL)
1027    scalar_loop = loop;
1028
1029  bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1030  pbbs = bbs + 1;
1031  get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1032  /* Allow duplication of outer loops.  */
1033  if (scalar_loop->inner)
1034    duplicate_outer_loop = true;
1035  /* Check whether duplication is possible.  */
1036  if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1037    {
1038      free (bbs);
1039      return NULL;
1040    }
1041
1042  /* Generate new loop structure.  */
1043  new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1044  duplicate_subloops (scalar_loop, new_loop);
1045
1046  exit_dest = exit->dest;
1047  was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1048					  exit_dest) == loop->header ?
1049		 true : false);
1050
1051  /* Also copy the pre-header; this avoids jumping through hoops to
1052     duplicate the loop entry PHI arguments.  Create an empty
1053     pre-header unconditionally for this.  */
1054  basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1055  edge entry_e = single_pred_edge (preheader);
1056  bbs[0] = preheader;
1057  new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1058
1059  exit = single_exit (scalar_loop);
1060  copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1061	    &exit, 1, &new_exit, NULL,
1062	    at_exit ? loop->latch : e->src, true);
1063  exit = single_exit (loop);
1064  basic_block new_preheader = new_bbs[0];
1065
1066  add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1067
1068  if (scalar_loop != loop)
1069    {
1070      /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1071	 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1072	 but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1073	 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1074	 header) to have current_def set, so copy them over.  */
1075      slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1076						exit);
1077      slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1078							   0),
1079						EDGE_SUCC (loop->latch, 0));
1080    }
1081
1082  if (at_exit) /* Add the loop copy at exit.  */
1083    {
1084      if (scalar_loop != loop)
1085	{
1086	  gphi_iterator gsi;
1087	  new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1088
1089	  for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1090	       gsi_next (&gsi))
1091	    {
1092	      gphi *phi = gsi.phi ();
1093	      tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1094	      location_t orig_locus
1095		= gimple_phi_arg_location_from_edge (phi, e);
1096
1097	      add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1098	    }
1099	}
1100      redirect_edge_and_branch_force (e, new_preheader);
1101      flush_pending_stmts (e);
1102      set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1103      if (was_imm_dom || duplicate_outer_loop)
1104	set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1105
1106      /* And remove the unnecessary forwarder again.  Keep the other
1107         one so we have a proper pre-header for the loop at the exit edge.  */
1108      redirect_edge_pred (single_succ_edge (preheader),
1109			  single_pred (preheader));
1110      delete_basic_block (preheader);
1111      set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1112			       loop_preheader_edge (scalar_loop)->src);
1113    }
1114  else /* Add the copy at entry.  */
1115    {
1116      if (scalar_loop != loop)
1117	{
1118	  /* Remove the unnecessary forwarder of scalar_loop again.  */
1119	  redirect_edge_pred (single_succ_edge (preheader),
1120			      single_pred (preheader));
1121	  delete_basic_block (preheader);
1122	  set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1123				   loop_preheader_edge (scalar_loop)->src);
1124	  preheader = split_edge (loop_preheader_edge (loop));
1125	  entry_e = single_pred_edge (preheader);
1126	}
1127
1128      redirect_edge_and_branch_force (entry_e, new_preheader);
1129      flush_pending_stmts (entry_e);
1130      set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1131
1132      redirect_edge_and_branch_force (new_exit, preheader);
1133      flush_pending_stmts (new_exit);
1134      set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1135
1136      /* And remove the unnecessary forwarder again.  Keep the other
1137         one so we have a proper pre-header for the loop at the exit edge.  */
1138      redirect_edge_pred (single_succ_edge (new_preheader),
1139			  single_pred (new_preheader));
1140      delete_basic_block (new_preheader);
1141      set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1142			       loop_preheader_edge (new_loop)->src);
1143    }
1144
1145  /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1146  for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1147    rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1148
1149  if (scalar_loop != loop)
1150    {
1151      /* Update new_loop->header PHIs, so that on the preheader
1152	 edge they are the ones from loop rather than scalar_loop.  */
1153      gphi_iterator gsi_orig, gsi_new;
1154      edge orig_e = loop_preheader_edge (loop);
1155      edge new_e = loop_preheader_edge (new_loop);
1156
1157      for (gsi_orig = gsi_start_phis (loop->header),
1158	   gsi_new = gsi_start_phis (new_loop->header);
1159	   !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1160	   gsi_next (&gsi_orig), gsi_next (&gsi_new))
1161	{
1162	  gphi *orig_phi = gsi_orig.phi ();
1163	  gphi *new_phi = gsi_new.phi ();
1164	  tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1165	  location_t orig_locus
1166	    = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1167
1168	  add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1169	}
1170    }
1171
1172  free (new_bbs);
1173  free (bbs);
1174
1175  checking_verify_dominators (CDI_DOMINATORS);
1176
1177  return new_loop;
1178}
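
/* Schematically (a simplified sketch that ignores the temporary forwarder
   blocks created and removed along the way), peeling the copy in at the
   exit edge E produces

     preheader -> LOOP -> new_preheader -> NEW_LOOP -> exit_dest

   while peeling it in at the entry edge produces

     entry -> NEW_LOOP -> preheader -> LOOP -> exit_dest

   where NEW_LOOP is the copy that is returned to the caller.  */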
1179
1180
1181/* Given the condition expression COND, put it as the last statement of
1182   GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1183   DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1184   skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1185   new edge as irreducible if IRREDUCIBLE_P is true.  */
1186
1187static edge
1188slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1189		       basic_block guard_to, basic_block dom_bb,
1190		       profile_probability probability, bool irreducible_p)
1191{
1192  gimple_stmt_iterator gsi;
1193  edge new_e, enter_e;
1194  gcond *cond_stmt;
1195  gimple_seq gimplify_stmt_list = NULL;
1196
1197  enter_e = EDGE_SUCC (guard_bb, 0);
1198  enter_e->flags &= ~EDGE_FALLTHRU;
1199  enter_e->flags |= EDGE_FALSE_VALUE;
1200  gsi = gsi_last_bb (guard_bb);
1201
1202  cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1203				 NULL_TREE);
1204  if (gimplify_stmt_list)
1205    gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1206
1207  cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1208  gsi = gsi_last_bb (guard_bb);
1209  gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1210
1211  /* Add new edge to connect guard block to the merge/loop-exit block.  */
1212  new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1213
1214  new_e->probability = probability;
1215  if (irreducible_p)
1216    new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1217
1218  enter_e->probability = probability.invert ();
1219  set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1220
1221  /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1222  if (enter_e->dest->loop_father->header == enter_e->dest)
1223    split_edge (enter_e);
1224
1225  return new_e;
1226}
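
/* The resulting control flow is roughly:

     GUARD_BB:
       if (COND)
         goto GUARD_TO;        <- new skip edge, probability PROBABILITY
       else
         fall through to the old single successor of GUARD_BB

   with GUARD_TO's immediate dominator set to DOM_BB and the fall-through
   edge carrying the inverted probability.  */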
1227
1228
1229/* This function verifies that the following restrictions apply to LOOP:
1230   (1) it consists of exactly 2 basic blocks - a header and an empty
1231       latch - if it is an innermost loop, or of 5 basic blocks if it
1232       is an outer loop;
1233   (2) it has a single entry and a single exit;
1234   (3) its exit condition is the last stmt in the header;
1235   (4) E is the entry or exit edge of LOOP.  */
1236
1237bool
1238slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1239{
1240  edge exit_e = single_exit (loop);
1241  edge entry_e = loop_preheader_edge (loop);
1242  gcond *orig_cond = get_loop_exit_condition (loop);
1243  gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1244  unsigned int num_bb = loop->inner? 5 : 2;
1245
1246  /* All loops have an outer scope; the only case loop_outer (loop) is NULL
1247     is for the function itself.  */
1248  if (!loop_outer (loop)
1249      || loop->num_nodes != num_bb
1250      || !empty_block_p (loop->latch)
1251      || !single_exit (loop)
1252      /* Verify that new loop exit condition can be trivially modified.  */
1253      || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1254      || (e != exit_e && e != entry_e))
1255    return false;
1256
1257  return true;
1258}
1259
1260/* If the loop has a virtual PHI, but the exit bb doesn't, create a virtual
1261   PHI in the exit bb and rename all the uses after the loop.  This
1262   simplifies the *guard[12] routines, which assume loop closed SSA form
1263   for all PHIs (but normally loop closed SSA form doesn't require virtual
1264   PHIs to be in the same form).  Doing this early simplifies checking
1265   which uses should be renamed.  */
1266
1267static void
1268create_lcssa_for_virtual_phi (struct loop *loop)
1269{
1270  gphi_iterator gsi;
1271  edge exit_e = single_exit (loop);
1272
1273  for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1274    if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1275      {
1276	gphi *phi = gsi.phi ();
1277	for (gsi = gsi_start_phis (exit_e->dest);
1278	     !gsi_end_p (gsi); gsi_next (&gsi))
1279	  if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1280	    break;
1281	if (gsi_end_p (gsi))
1282	  {
1283	    tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1284	    gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1285	    tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1286	    imm_use_iterator imm_iter;
1287	    gimple *stmt;
1288	    use_operand_p use_p;
1289
1290	    SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1291	      = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1292	    add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1293	    gimple_phi_set_result (new_phi, new_vop);
1294	    FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1295	      if (stmt != new_phi
1296		  && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1297		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1298		  SET_USE (use_p, new_vop);
1299	  }
1300	break;
1301      }
1302
1303}
1304
1305/* Function find_loop_location.
1306
1307   Extract the location of the loop in the source code.
1308   If the loop is not well formed for vectorization, an estimated
1309   location is calculated.
1310   Return the loop location on success and an unknown location otherwise.  */
1311
1312dump_user_location_t
1313find_loop_location (struct loop *loop)
1314{
1315  gimple *stmt = NULL;
1316  basic_block bb;
1317  gimple_stmt_iterator si;
1318
1319  if (!loop)
1320    return dump_user_location_t ();
1321
1322  stmt = get_loop_exit_condition (loop);
1323
1324  if (stmt
1325      && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1326    return stmt;
1327
1328  /* If we got here the loop is probably not "well formed";
1329     try to estimate the loop location.  */
1330
1331  if (!loop->header)
1332    return dump_user_location_t ();
1333
1334  bb = loop->header;
1335
1336  for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1337    {
1338      stmt = gsi_stmt (si);
1339      if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1340        return stmt;
1341    }
1342
1343  return dump_user_location_t ();
1344}
1345
1346/* Return true if the phi described by STMT_INFO defines an IV of the
1347   loop to be vectorized.  */
1348
1349static bool
1350iv_phi_p (stmt_vec_info stmt_info)
1351{
1352  gphi *phi = as_a <gphi *> (stmt_info->stmt);
1353  if (virtual_operand_p (PHI_RESULT (phi)))
1354    return false;
1355
1356  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1357      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1358    return false;
1359
1360  return true;
1361}
1362
1363/* Function vect_can_advance_ivs_p
1364
1365   In case the number of iterations that LOOP iterates is unknown at compile
1366   time, an epilog loop will be generated, and the loop induction variables
1367   (IVs) will be "advanced" to the value they are supposed to take just before
1368   the epilog loop.  Here we check that the access function of the loop IVs
1369   and the expression that represents the loop bound are simple enough.
1370   These restrictions will be relaxed in the future.  */
1371
1372bool
1373vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1374{
1375  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1376  basic_block bb = loop->header;
1377  gphi_iterator gsi;
1378
1379  /* Analyze phi functions of the loop header.  */
1380
1381  if (dump_enabled_p ())
1382    dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1383  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1384    {
1385      tree evolution_part;
1386
1387      gphi *phi = gsi.phi ();
1388      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1389      if (dump_enabled_p ())
1390	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1391			 phi_info->stmt);
1392
1393      /* Skip virtual phi's. The data dependences that are associated with
1394	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1395
1396	 Skip reduction phis.  */
1397      if (!iv_phi_p (phi_info))
1398	{
1399	  if (dump_enabled_p ())
1400	    dump_printf_loc (MSG_NOTE, vect_location,
1401			     "reduc or virtual phi. skip.\n");
1402	  continue;
1403	}
1404
1405      /* Analyze the evolution function.  */
1406
1407      evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1408      if (evolution_part == NULL_TREE)
1409        {
1410	  if (dump_enabled_p ())
1411	    dump_printf (MSG_MISSED_OPTIMIZATION,
1412			 "No access function or evolution.\n");
1413	  return false;
1414        }
1415
1416      /* FORNOW: We do not transform initial conditions of IVs
1417	 whose evolution functions are not invariant in the loop.  */
1418
1419      if (!expr_invariant_in_loop_p (loop, evolution_part))
1420	{
1421	  if (dump_enabled_p ())
1422	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423			     "evolution not invariant in loop.\n");
1424	  return false;
1425	}
1426
1427      /* FORNOW: We do not transform initial conditions of IVs
1428	 whose evolution functions are polynomials of degree >= 2.  */
1429
1430      if (tree_is_chrec (evolution_part))
1431	{
1432	  if (dump_enabled_p ())
1433	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1434			     "evolution is chrec.\n");
1435	  return false;
1436	}
1437    }
1438
1439  return true;
1440}
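
/* As an illustration of the checks above, in a loop like

     for (i = 0; i < n; i++)
       {
         a = a + 4;    OK: the evolution part (4) is invariant
         b = b + s;    OK if s is not changed inside the loop
         c = c + i;    rejected: the evolution part is itself a chrec
       }

   the first two IVs can later be advanced simply by adding
   NITERS * step to their initial values, whereas the last one cannot,
   because its step changes from iteration to iteration.  */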
1441
1442
1443/*   Function vect_update_ivs_after_vectorizer.
1444
1445     "Advance" the induction variables of LOOP to the value they should take
1446     after the execution of LOOP.  This is currently necessary because the
1447     vectorizer does not handle induction variables that are used after the
1448     loop.  Such a situation occurs when the last iterations of LOOP are
1449     peeled, because:
1450     1. We introduced new uses after LOOP for IVs that were not originally used
1451        after LOOP: the IVs of LOOP are now used by an epilog loop.
1452     2. LOOP is going to be vectorized; this means that it will iterate N/VF
1453        times, whereas the loop IVs should be bumped N times.
1454
1455     Input:
1456     - LOOP - a loop that is going to be vectorized. The last few iterations
1457              of LOOP were peeled.
1458     - NITERS - the number of iterations that LOOP executes (before it is
1459                vectorized). i.e, the number of times the ivs should be bumped.
1460     - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1461                  coming out from LOOP on which there are uses of the LOOP ivs
1462		  (this is the path from LOOP->exit to epilog_loop->preheader).
1463
1464                  The new definitions of the ivs are placed in LOOP->exit.
1465                  The phi args associated with the edge UPDATE_E in the bb
1466                  UPDATE_E->dest are updated accordingly.
1467
1468     Assumption 1: Like the rest of the vectorizer, this function assumes
1469     a single loop exit that has a single predecessor.
1470
1471     Assumption 2: The phi nodes in the LOOP header and in update_bb are
1472     organized in the same order.
1473
1474     Assumption 3: The access function of the ivs is simple enough (see
1475     vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1476
1477     Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1478     coming out of LOOP on which the ivs of LOOP are used (this is the path
1479     that leads to the epilog loop; other paths skip the epilog loop).  This
1480     path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1481     needs to have its phis updated.
1482 */
1483
1484static void
1485vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1486				  tree niters, edge update_e)
1487{
1488  gphi_iterator gsi, gsi1;
1489  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1490  basic_block update_bb = update_e->dest;
1491  basic_block exit_bb = single_exit (loop)->dest;
1492
1493  /* Make sure there exists a single-predecessor exit bb:  */
1494  gcc_assert (single_pred_p (exit_bb));
1495  gcc_assert (single_succ_edge (exit_bb) == update_e);
1496
1497  for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1498       !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1499       gsi_next (&gsi), gsi_next (&gsi1))
1500    {
1501      tree init_expr;
1502      tree step_expr, off;
1503      tree type;
1504      tree var, ni, ni_name;
1505      gimple_stmt_iterator last_gsi;
1506
1507      gphi *phi = gsi.phi ();
1508      gphi *phi1 = gsi1.phi ();
1509      stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1510      if (dump_enabled_p ())
1511	dump_printf_loc (MSG_NOTE, vect_location,
1512			 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1513
1514      /* Skip reduction and virtual phis.  */
1515      if (!iv_phi_p (phi_info))
1516	{
1517	  if (dump_enabled_p ())
1518	    dump_printf_loc (MSG_NOTE, vect_location,
1519			     "reduc or virtual phi. skip.\n");
1520	  continue;
1521	}
1522
1523      type = TREE_TYPE (gimple_phi_result (phi));
1524      step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1525      step_expr = unshare_expr (step_expr);
1526
1527      /* FORNOW: We do not support IVs whose evolution function is a polynomial
1528         of degree >= 2 or exponential.  */
1529      gcc_assert (!tree_is_chrec (step_expr));
1530
1531      init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1532
1533      off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1534			 fold_convert (TREE_TYPE (step_expr), niters),
1535			 step_expr);
1536      if (POINTER_TYPE_P (type))
1537	ni = fold_build_pointer_plus (init_expr, off);
1538      else
1539	ni = fold_build2 (PLUS_EXPR, type,
1540			  init_expr, fold_convert (type, off));
1541
1542      var = create_tmp_var (type, "tmp");
1543
1544      last_gsi = gsi_last_bb (exit_bb);
1545      gimple_seq new_stmts = NULL;
1546      ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1547      /* Exit_bb shouldn't be empty.  */
1548      if (!gsi_end_p (last_gsi))
1549	gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1550      else
1551	gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1552
1553      /* Fix phi expressions in the successor bb.  */
1554      adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1555    }
1556}
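
/* A made-up example: for an IV with initial value 0 and step 4 in a loop
   that executes NITERS == 100 scalar iterations, the code above computes

     ni = 0 + 100 * 4 == 400

   in the exit block and redirects the matching PHI argument on UPDATE_E
   to that value, so the epilog loop resumes the IV exactly where the
   scalar loop would have left it.  */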
1557
1558/* Return a gimple value containing the misalignment (measured in vector
1559   elements) for the loop described by LOOP_VINFO, i.e. how many elements
1560   it is away from a perfectly aligned address.  Add any new statements
1561   to SEQ.  */
1562
1563static tree
1564get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1565{
1566  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1567  stmt_vec_info stmt_info = dr_info->stmt;
1568  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1569
1570  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1571  unsigned HOST_WIDE_INT target_align_c;
1572  tree target_align_minus_1;
1573
1574  bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1575					size_zero_node) < 0;
1576  tree offset = (negative
1577		 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1578		 : size_zero_node);
1579  tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1580							  offset);
1581  tree type = unsigned_type_for (TREE_TYPE (start_addr));
1582  if (target_align.is_constant (&target_align_c))
1583    target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1584  else
1585    {
1586      tree vla = build_int_cst (type, target_align);
1587      tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1588				    fold_build2 (MINUS_EXPR, type,
1589						 build_int_cst (type, 0), vla));
1590      target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1591					  build_int_cst (type, 1));
1592    }
1593
1594  HOST_WIDE_INT elem_size
1595    = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1596  tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1597
1598  /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1599  tree int_start_addr = fold_convert (type, start_addr);
1600  tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1601					target_align_minus_1);
1602
1603  /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1604  tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1605					elem_size_log);
1606
1607  return misalign_in_elems;
1608}
1609
1610/* Function vect_gen_prolog_loop_niters
1611
1612   Generate the number of iterations which should be peeled as prolog for the
1613   loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1614   DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1615   As a result, after the execution of this loop, the data reference DR will
1616   refer to an aligned location.  The following computation is generated:
1617
1618   If the misalignment of DR is known at compile time:
1619     addr_mis = DR_MISALIGNMENT (dr);
1620   Else, compute address misalignment in bytes:
1621     addr_mis = addr & (target_align - 1)
1622
1623   prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1624
1625   (elem_size = element type size; an element is the scalar element whose type
1626   is the inner type of the vectype)
1627
1628   The computations will be emitted at the end of BB.  We also compute and
1629   store upper bound (included) of the result in BOUND.
1630
1631   When the step of the data-ref in the loop is not 1 (as in interleaved data
1632   and SLP), the number of iterations of the prolog must be divided by the step
1633   (which is equal to the size of interleaved group).
1634
1635   The above formulas assume that VF == number of elements in the vector. This
1636   may not hold when there are multiple-types in the loop.
1637   In this case, for some data-references in the loop the VF does not represent
1638   the number of elements that fit in the vector.  Therefore, instead of VF we
1639   use TYPE_VECTOR_SUBPARTS.  */
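/* For example (illustrative values): with a 16-byte target alignment and
   4-byte elements (align_in_elems = 4), a misalignment of 2 elements that is
   only known at run time leads to a prolog of (4 - 2) & (4 - 1) = 2 scalar
   iterations, after which the access is 16-byte aligned; BOUND is set to
   align_in_elems - 1 = 3 in that case.  */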
1640
1641static tree
1642vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1643			     basic_block bb, int *bound)
1644{
1645  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1646  tree var;
1647  tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1648  gimple_seq stmts = NULL, new_stmts = NULL;
1649  tree iters, iters_name;
1650  stmt_vec_info stmt_info = dr_info->stmt;
1651  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1652  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1653
1654  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1655    {
1656      int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1657
1658      if (dump_enabled_p ())
1659        dump_printf_loc (MSG_NOTE, vect_location,
1660                         "known peeling = %d.\n", npeel);
1661
1662      iters = build_int_cst (niters_type, npeel);
1663      *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1664    }
1665  else
1666    {
1667      tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1668      tree type = TREE_TYPE (misalign_in_elems);
1669      HOST_WIDE_INT elem_size
1670	= int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1671      /* We only do prolog peeling if the target alignment is known at compile
1672         time.  */
1673      poly_uint64 align_in_elems =
1674	exact_div (target_align, elem_size);
1675      tree align_in_elems_minus_1 =
1676	build_int_cst (type, align_in_elems - 1);
1677      tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1678
1679      /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1680				 & (align_in_elems - 1)).  */
1681      bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1682					    size_zero_node) < 0;
1683      if (negative)
1684	iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1685			     align_in_elems_tree);
1686      else
1687	iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1688			     misalign_in_elems);
1689      iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1690      iters = fold_convert (niters_type, iters);
1691      unsigned HOST_WIDE_INT align_in_elems_c;
1692      if (align_in_elems.is_constant (&align_in_elems_c))
1693	*bound = align_in_elems_c - 1;
1694      else
1695	*bound = -1;
1696    }
1697
1698  if (dump_enabled_p ())
1699    dump_printf_loc (MSG_NOTE, vect_location,
1700		     "niters for prolog loop: %T\n", iters);
1701
1702  var = create_tmp_var (niters_type, "prolog_loop_niters");
1703  iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1704
1705  if (new_stmts)
1706    gimple_seq_add_seq (&stmts, new_stmts);
1707  if (stmts)
1708    {
1709      gcc_assert (single_succ_p (bb));
1710      gimple_stmt_iterator gsi = gsi_last_bb (bb);
1711      if (gsi_end_p (gsi))
1712	gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1713      else
1714	gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1715    }
1716  return iters_name;
1717}
1718
1719
1720/* Function vect_update_init_of_dr
1721
1722   If CODE is PLUS, the vector loop starts NITERS iterations after the
1723   scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1724   iterations before the scalar one (using masking to skip inactive
1725   elements).  This function updates the information recorded in DR to
1726   account for the difference.  Specifically, it updates the OFFSET
1727   field of DR.  */
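/* For instance, if DR has a step of 4 bytes and CODE is PLUS_EXPR, the
   offset of DR is increased by 4 * NITERS bytes, so that the vector loop
   starts from the element at which the peeled NITERS scalar iterations
   stopped.  */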
1728
1729static void
1730vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1731{
1732  tree offset = DR_OFFSET (dr);
1733
1734  niters = fold_build2 (MULT_EXPR, sizetype,
1735			fold_convert (sizetype, niters),
1736			fold_convert (sizetype, DR_STEP (dr)));
1737  offset = fold_build2 (code, sizetype,
1738			fold_convert (sizetype, offset), niters);
1739  DR_OFFSET (dr) = offset;
1740}
1741
1742
1743/* Function vect_update_inits_of_drs
1744
1745   Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1746   CODE and NITERS are as for vect_update_init_of_dr.  */
1747
1748static void
1749vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1750			  tree_code code)
1751{
1752  unsigned int i;
1753  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1754  struct data_reference *dr;
1755
1756  DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1757
1758  /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1759  if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1760    {
1761      gimple_seq seq;
1762      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1763      tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1764
1765      niters = fold_convert (sizetype, niters);
1766      niters = force_gimple_operand (niters, &seq, false, var);
1767      if (seq)
1768	{
1769	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1770	  gcc_assert (!new_bb);
1771	}
1772    }
1773
1774  FOR_EACH_VEC_ELT (datarefs, i, dr)
1775    {
1776      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1777      if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1778	vect_update_init_of_dr (dr, niters, code);
1779    }
1780}
1781
1782/* For the information recorded in LOOP_VINFO prepare the loop for peeling
1783   by masking.  This involves calculating the number of iterations to
1784   be peeled and then aligning all memory references appropriately.  */
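/* Illustrative example: with 4 elements per vector and a known peeling amount
   of 3 scalar iterations, the start misalignment is 4 - 3 = 1 element.  That
   value is recorded in LOOP_VINFO_MASK_SKIP_NITERS and the data references
   are shifted back by one element, so that the first masked vector iteration
   can start at the aligned address with its leading element(s) inactive.  */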
1785
1786void
1787vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1788{
1789  tree misalign_in_elems;
1790  tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1791
1792  gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1793
1794  /* From the information recorded in LOOP_VINFO get the number of iterations
1795     that need to be skipped via masking.  */
1796  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1797    {
1798      poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1799			     - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1800      misalign_in_elems = build_int_cst (type, misalign);
1801    }
1802  else
1803    {
1804      gimple_seq seq1 = NULL, seq2 = NULL;
1805      misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1806      misalign_in_elems = fold_convert (type, misalign_in_elems);
1807      misalign_in_elems = force_gimple_operand (misalign_in_elems,
1808						&seq2, true, NULL_TREE);
1809      gimple_seq_add_seq (&seq1, seq2);
1810      if (seq1)
1811	{
1812	  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1813	  basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1814	  gcc_assert (!new_bb);
1815	}
1816    }
1817
1818  if (dump_enabled_p ())
1819    dump_printf_loc (MSG_NOTE, vect_location,
1820		     "misalignment for fully-masked loop: %T\n",
1821		     misalign_in_elems);
1822
1823  LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1824
1825  vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1826}
1827
1828/* This function builds ni_name = number of iterations.  Statements
1829   are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1830   it to TRUE if a new ssa_var is generated.  */
1831
1832tree
1833vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1834{
1835  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1836  if (TREE_CODE (ni) == INTEGER_CST)
1837    return ni;
1838  else
1839    {
1840      tree ni_name, var;
1841      gimple_seq stmts = NULL;
1842      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1843
1844      var = create_tmp_var (TREE_TYPE (ni), "niters");
1845      ni_name = force_gimple_operand (ni, &stmts, false, var);
1846      if (stmts)
1847	{
1848	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1849	  if (new_var_p != NULL)
1850	    *new_var_p = true;
1851	}
1852
1853      return ni_name;
1854    }
1855}
1856
1857/* Calculate the number of iterations above which the vectorized loop will be
1858   preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1859   of the prolog loop.  If it is an integer constant, its value is also passed
1860   in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1861   number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1862   value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1863   threshold below which the scalar (rather than vectorized) loop will be
1864   executed.  This function stores the upper bound (inclusive) of the result
1865   in BOUND_SCALAR.  */
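/* Worked example (illustrative numbers): with a constant prolog count of 2,
   BOUND_EPILOG of 3 and a profitability threshold TH of 11, the scalar loop
   handles at most MAX (2 + 3, 11 - 1) = 10 iterations, so *BOUND_SCALAR is
   set to 10 and the constant 10 is returned.  */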
1866
1867static tree
1868vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1869			     int bound_prolog, poly_int64 bound_epilog, int th,
1870			     poly_uint64 *bound_scalar,
1871			     bool check_profitability)
1872{
1873  tree type = TREE_TYPE (niters_prolog);
1874  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1875			     build_int_cst (type, bound_epilog));
1876
1877  *bound_scalar = bound_prolog + bound_epilog;
1878  if (check_profitability)
1879    {
1880      /* TH indicates the minimum niters of vectorized loop, while we
1881	 compute the maximum niters of scalar loop.  */
1882      th--;
1883      /* Peeling for constant times.  */
1884      if (int_niters_prolog >= 0)
1885	{
1886	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1887	  return build_int_cst (type, *bound_scalar);
1888	}
1889      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1890	 and BOUND_EPILOG are inclusive upper bounds.  */
1891      if (known_ge (th, bound_prolog + bound_epilog))
1892	{
1893	  *bound_scalar = th;
1894	  return build_int_cst (type, th);
1895	}
1896      /* Need to do runtime comparison.  */
1897      else if (maybe_gt (th, bound_epilog))
1898	{
1899	  *bound_scalar = upper_bound (*bound_scalar, th);
1900	  return fold_build2 (MAX_EXPR, type,
1901			      build_int_cst (type, th), niters);
1902	}
1903    }
1904  return niters;
1905}
1906
1907/* NITERS is the number of times that the original scalar loop executes
1908   after peeling.  Work out the maximum number of iterations N that can
1909   be handled by the vectorized form of the loop and then either:
1910
1911   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1912
1913	niters_vector = N
1914
1915   b) set *STEP_VECTOR_PTR to one and generate:
1916
1917        niters_vector = N / vf
1918
1919   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1920   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1921   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1922
1923void
1924vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1925			     tree *niters_vector_ptr, tree *step_vector_ptr,
1926			     bool niters_no_overflow)
1927{
1928  tree ni_minus_gap, var;
1929  tree niters_vector, step_vector, type = TREE_TYPE (niters);
1930  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1931  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1932  tree log_vf = NULL_TREE;
1933
1934  /* If epilogue loop is required because of data accesses with gaps, we
1935     subtract one iteration from the total number of iterations here for
1936     correct calculation of RATIO.  */
1937  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1938    {
1939      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1940				  build_one_cst (type));
1941      if (!is_gimple_val (ni_minus_gap))
1942	{
1943	  var = create_tmp_var (type, "ni_gap");
1944	  gimple *stmts = NULL;
1945	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1946					       true, var);
1947	  gsi_insert_seq_on_edge_immediate (pe, stmts);
1948	}
1949    }
1950  else
1951    ni_minus_gap = niters;
1952
1953  unsigned HOST_WIDE_INT const_vf;
1954  if (vf.is_constant (&const_vf)
1955      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1956    {
1957      /* Create: niters >> log2(vf) */
1958      /* If it's known that niters == number of latch executions + 1 doesn't
1959	 overflow, we can generate niters >> log2(vf); otherwise we generate
1960	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1961	 will be at least one.  */
1962      log_vf = build_int_cst (type, exact_log2 (const_vf));
1963      if (niters_no_overflow)
1964	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1965      else
1966	niters_vector
1967	  = fold_build2 (PLUS_EXPR, type,
1968			 fold_build2 (RSHIFT_EXPR, type,
1969				      fold_build2 (MINUS_EXPR, type,
1970						   ni_minus_gap,
1971						   build_int_cst (type, vf)),
1972				      log_vf),
1973			 build_int_cst (type, 1));
1974      step_vector = build_one_cst (type);
1975    }
1976  else
1977    {
1978      niters_vector = ni_minus_gap;
1979      step_vector = build_int_cst (type, vf);
1980    }
1981
1982  if (!is_gimple_val (niters_vector))
1983    {
1984      var = create_tmp_var (type, "bnd");
1985      gimple_seq stmts = NULL;
1986      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1987      gsi_insert_seq_on_edge_immediate (pe, stmts);
1988      /* The peeling algorithm guarantees that the vector loop bound is at least
1989	 ONE, so we set range information to make the niters analyzer's life easier.  */
1990      if (stmts != NULL && log_vf)
1991	set_range_info (niters_vector, VR_RANGE,
1992			wi::to_wide (build_int_cst (type, 1)),
1993			wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1994						  TYPE_MAX_VALUE (type),
1995						  log_vf)));
1996    }
1997  *niters_vector_ptr = niters_vector;
1998  *step_vector_ptr = step_vector;
1999
2000  return;
2001}
2002
2003/* Given NITERS_VECTOR which is the number of iterations for vectorized
2004   loop specified by LOOP_VINFO after vectorization, compute the number
2005   of iterations before vectorization (niters_vector * vf) and store it
2006   to NITERS_VECTOR_MULT_VF_PTR.  */
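/* E.g. for niters_vector = 2 and a constant VF of 4 this computes
   2 << log2 (4) = 8, the number of scalar iterations covered by the
   vector loop.  */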
2007
2008static void
2009vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
2010				     tree niters_vector,
2011				     tree *niters_vector_mult_vf_ptr)
2012{
2013  /* We should be using a step_vector of VF if VF is variable.  */
2014  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
2015  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2016  tree type = TREE_TYPE (niters_vector);
2017  tree log_vf = build_int_cst (type, exact_log2 (vf));
2018  basic_block exit_bb = single_exit (loop)->dest;
2019
2020  gcc_assert (niters_vector_mult_vf_ptr != NULL);
2021  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2022					    niters_vector, log_vf);
2023  if (!is_gimple_val (niters_vector_mult_vf))
2024    {
2025      tree var = create_tmp_var (type, "niters_vector_mult_vf");
2026      gimple_seq stmts = NULL;
2027      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2028						    &stmts, true, var);
2029      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2030      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2031    }
2032  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2033}
2034
2035/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2036   from SECOND/FIRST and puts it at the original loop's preheader/exit
2037   edge; the two loops are then arranged as below:
2038
2039       preheader_a:
2040     first_loop:
2041       header_a:
2042	 i_1 = PHI<i_0, i_2>;
2043	 ...
2044	 i_2 = i_1 + 1;
2045	 if (cond_a)
2046	   goto latch_a;
2047	 else
2048	   goto between_bb;
2049       latch_a:
2050	 goto header_a;
2051
2052       between_bb:
2053	 ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2054
2055     second_loop:
2056       header_b:
2057	 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2058				 or with i_2 if no LCSSA phi is created
2059				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2060	 ...
2061	 i_4 = i_3 + 1;
2062	 if (cond_b)
2063	   goto latch_b;
2064	 else
2065	   goto exit_bb;
2066       latch_b:
2067	 goto header_b;
2068
2069       exit_bb:
2070
2071   This function creates loop-closed SSA for the first loop; it updates the
2072   second loop's PHI nodes by replacing the argument on the incoming edge
2073   with the result of the newly created lcssa PHI node.  If
2074   CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
2075   created for non-iv phis of the first loop.
2076
2077   This function assumes the exit bb of the first loop is the preheader bb of
2078   the second loop, i.e., between_bb in the example code.  With the PHIs
2079   updated, the second loop will execute the remaining iterations of the first.  */
2080
2081static void
2082slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2083				   struct loop *first, struct loop *second,
2084				   bool create_lcssa_for_iv_phis)
2085{
2086  gphi_iterator gsi_update, gsi_orig;
2087  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2088
2089  edge first_latch_e = EDGE_SUCC (first->latch, 0);
2090  edge second_preheader_e = loop_preheader_edge (second);
2091  basic_block between_bb = single_exit (first)->dest;
2092
2093  gcc_assert (between_bb == second_preheader_e->src);
2094  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2095  /* Either the first loop or the second is the loop to be vectorized.  */
2096  gcc_assert (loop == first || loop == second);
2097
2098  for (gsi_orig = gsi_start_phis (first->header),
2099       gsi_update = gsi_start_phis (second->header);
2100       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2101       gsi_next (&gsi_orig), gsi_next (&gsi_update))
2102    {
2103      gphi *orig_phi = gsi_orig.phi ();
2104      gphi *update_phi = gsi_update.phi ();
2105
2106      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2107      /* Generate lcssa PHI node for the first loop.  */
2108      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2109      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2110      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2111	{
2112	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2113	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2114	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2115	  arg = new_res;
2116	}
2117
2118      /* Update PHI node in the second loop by replacing arg on the loop's
2119	 incoming edge.  */
2120      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2121    }
2122}
2123
2124/* Function slpeel_add_loop_guard adds a guard that skips from the beginning
2125   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2126   are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2127   appear like below:
2128
2129       guard_bb:
2130	 if (cond)
2131	   goto merge_bb;
2132	 else
2133	   goto skip_loop;
2134
2135     skip_loop:
2136       header_a:
2137	 i_1 = PHI<i_0, i_2>;
2138	 ...
2139	 i_2 = i_1 + 1;
2140	 if (cond_a)
2141	   goto latch_a;
2142	 else
2143	   goto exit_a;
2144       latch_a:
2145	 goto header_a;
2146
2147       exit_a:
2148	 i_5 = PHI<i_2>;
2149
2150       merge_bb:
2151	 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2152
2153     update_loop:
2154       header_b:
2155	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2156	 ...
2157	 i_4 = i_3 + 1;
2158	 if (cond_b)
2159	   goto latch_b;
2160	 else
2161	   goto exit_bb;
2162       latch_b:
2163	 goto header_b;
2164
2165       exit_bb:
2166
2167   This function creates PHI nodes at merge_bb and replaces the use of i_5
2168   in the update_loop's PHI node with the result of the new PHI node.  */
2169
2170static void
2171slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2172				    struct loop *update_loop,
2173				    edge guard_edge, edge merge_edge)
2174{
2175  location_t merge_loc, guard_loc;
2176  edge orig_e = loop_preheader_edge (skip_loop);
2177  edge update_e = loop_preheader_edge (update_loop);
2178  gphi_iterator gsi_orig, gsi_update;
2179
2180  for ((gsi_orig = gsi_start_phis (skip_loop->header),
2181	gsi_update = gsi_start_phis (update_loop->header));
2182       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2183       gsi_next (&gsi_orig), gsi_next (&gsi_update))
2184    {
2185      gphi *orig_phi = gsi_orig.phi ();
2186      gphi *update_phi = gsi_update.phi ();
2187
2188      /* Generate new phi node at merge bb of the guard.  */
2189      tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2190      gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2191
2192      /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2193	 args in NEW_PHI for these edges.  */
2194      tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2195      tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2196      merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2197      guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2198      add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2199      add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2200
2201      /* Update phi in UPDATE_PHI.  */
2202      adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2203    }
2204}
2205
2206/* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP;
2207   this function searches for the corresponding lcssa phi node in the exit
2208   bb of LOOP.  If it is found, return the phi result; otherwise return
2209   NULL.  */
2210
2211static tree
2212find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2213		gphi *lcssa_phi)
2214{
2215  gphi_iterator gsi;
2216  edge e = single_exit (loop);
2217
2218  gcc_assert (single_pred_p (e->dest));
2219  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2220    {
2221      gphi *phi = gsi.phi ();
2222      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2223			   PHI_ARG_DEF (lcssa_phi, 0), 0))
2224	return PHI_RESULT (phi);
2225    }
2226  return NULL_TREE;
2227}
2228
2229/* LOOP and EPILOG are two consecutive loops in the CFG and EPILOG is copied
2230   from LOOP.  Function slpeel_add_loop_guard adds a guard that skips from a
2231   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2232   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2233   The CFG looks like:
2234
2235     loop:
2236       header_a:
2237	 i_1 = PHI<i_0, i_2>;
2238	 ...
2239	 i_2 = i_1 + 1;
2240	 if (cond_a)
2241	   goto latch_a;
2242	 else
2243	   goto exit_a;
2244       latch_a:
2245	 goto header_a;
2246
2247       exit_a:
2248
2249       guard_bb:
2250	 if (cond)
2251	   goto merge_bb;
2252	 else
2253	   goto epilog_loop;
2254
2255       ;; fall_through_bb
2256
2257     epilog_loop:
2258       header_b:
2259	 i_3 = PHI<i_2, i_4>;
2260	 ...
2261	 i_4 = i_3 + 1;
2262	 if (cond_b)
2263	   goto latch_b;
2264	 else
2265	   goto merge_bb;
2266       latch_b:
2267	 goto header_b;
2268
2269       merge_bb:
2270	 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2271
2272       exit_bb:
2273	 i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2274
2275   For each name used outside EPILOG (i.e. for each name that has an lcssa
2276   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2277   args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2278   the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2279   by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2280   in exit_bb will also be updated.  */
2281
2282static void
2283slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2284				    edge guard_edge, edge merge_edge)
2285{
2286  gphi_iterator gsi;
2287  basic_block merge_bb = guard_edge->dest;
2288
2289  gcc_assert (single_succ_p (merge_bb));
2290  edge e = single_succ_edge (merge_bb);
2291  basic_block exit_bb = e->dest;
2292  gcc_assert (single_pred_p (exit_bb));
2293  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2294
2295  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2296    {
2297      gphi *update_phi = gsi.phi ();
2298      tree old_arg = PHI_ARG_DEF (update_phi, 0);
2299      /* This loop-closed-phi actually doesn't represent a use out of the
2300	 loop - the phi arg is a constant.  */
2301      if (TREE_CODE (old_arg) != SSA_NAME)
2302	continue;
2303
2304      tree merge_arg = get_current_def (old_arg);
2305      if (!merge_arg)
2306	merge_arg = old_arg;
2307
2308      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2309      /* If the var is live after loop but not a reduction, we simply
2310	 use the old arg.  */
2311      if (!guard_arg)
2312	guard_arg = old_arg;
2313
2314      /* Create new phi node in MERGE_BB:  */
2315      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2316      gphi *merge_phi = create_phi_node (new_res, merge_bb);
2317
2318      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2319	 the two PHI args in merge_phi for these edges.  */
2320      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2321      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2322
2323      /* Update the original phi in exit_bb.  */
2324      adjust_phi_and_debug_stmts (update_phi, e, new_res);
2325    }
2326}
2327
2328/* The EPILOG loop is duplicated from the original loop for vectorizing;
2329   the args of its loop-closed ssa PHIs need to be updated.  */
2330
2331static void
2332slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2333{
2334  gphi_iterator gsi;
2335  basic_block exit_bb = single_exit (epilog)->dest;
2336
2337  gcc_assert (single_pred_p (exit_bb));
2338  edge e = EDGE_PRED (exit_bb, 0);
2339  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2340    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2341}
2342
2343/* Function vect_do_peeling.
2344
2345   Input:
2346   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2347
2348       preheader:
2349     LOOP:
2350       header_bb:
2351	 loop_body
2352	 if (exit_loop_cond) goto exit_bb
2353	 else                goto header_bb
2354       exit_bb:
2355
2356   - NITERS: The number of iterations of the loop.
2357   - NITERSM1: The number of iterations of the loop's latch.
2358   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2359   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2360			      CHECK_PROFITABILITY is true.
2361   Output:
2362   - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2363     iterate after vectorization; see vect_set_loop_condition for details.
2364   - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2365     should be set to the number of scalar iterations handled by the
2366     vector loop.  The SSA name is only used on exit from the loop.
2367
2368   This function peels prolog and epilog from the loop, adds guards skipping
2369   PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2370   would look like:
2371
2372       guard_bb_1:
2373	 if (prefer_scalar_loop) goto merge_bb_1
2374	 else                    goto guard_bb_2
2375
2376       guard_bb_2:
2377         if (skip_prolog) goto merge_bb_2
2378         else             goto prolog_preheader
2379
2380       prolog_preheader:
2381     PROLOG:
2382       prolog_header_bb:
2383	 prolog_body
2384	 if (exit_prolog_cond) goto prolog_exit_bb
2385	 else                  goto prolog_header_bb
2386       prolog_exit_bb:
2387
2388       merge_bb_2:
2389
2390       vector_preheader:
2391     VECTOR LOOP:
2392       vector_header_bb:
2393	 vector_body
2394	 if (exit_vector_cond) goto vector_exit_bb
2395	 else                  goto vector_header_bb
2396       vector_exit_bb:
2397
2398       guard_bb_3:
2399	 if (skip_epilog) goto merge_bb_3
2400	 else             goto epilog_preheader
2401
2402       merge_bb_1:
2403
2404       epilog_preheader:
2405     EPILOG:
2406       epilog_header_bb:
2407	 epilog_body
2408	 if (exit_epilog_cond) goto merge_bb_3
2409	 else                  goto epilog_header_bb
2410
2411       merge_bb_3:
2412
2413   Note this function peels the prolog and epilog, and adds the corresponding
2414   guards, only when necessary.
2415   Returns the created epilogue loop, or NULL.
2416
2417   TODO: Guard for prefer_scalar_loop should be emitted along with
2418   versioning conditions if loop versioning is needed.  */
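/* Illustrative scenario (assuming peeling for niters and no gaps): with
   VF = 4, a known prolog peel of 2 iterations and an unknown total iteration
   count, both PROLOG and EPILOG are created, the vector loop iterates on
   NITERS - 2 and BOUND_EPILOG is VF - 1 = 3; guard_bb_2 skips the prolog when
   its iteration count is zero and guard_bb_3 skips the epilog when the vector
   loop has already consumed all remaining iterations.  */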
2419
2420
2421struct loop *
2422vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2423		 tree *niters_vector, tree *step_vector,
2424		 tree *niters_vector_mult_vf_var, int th,
2425		 bool check_profitability, bool niters_no_overflow)
2426{
2427  edge e, guard_e;
2428  tree type = TREE_TYPE (niters), guard_cond;
2429  basic_block guard_bb, guard_to;
2430  profile_probability prob_prolog, prob_vector, prob_epilog;
2431  int estimated_vf;
2432  int prolog_peeling = 0;
2433  /* We currently do not support prolog peeling if the target alignment is not
2434     known at compile time.  'vect_gen_prolog_loop_niters' depends on the
2435     target alignment being constant.  */
2436  dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2437  if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2438    return NULL;
2439
2440  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2441    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2442
2443  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2444  poly_uint64 bound_epilog = 0;
2445  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2446      && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2447    bound_epilog += vf - 1;
2448  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2449    bound_epilog += 1;
2450  bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2451  poly_uint64 bound_scalar = bound_epilog;
2452
2453  if (!prolog_peeling && !epilog_peeling)
2454    return NULL;
2455
2456  prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2457  estimated_vf = vect_vf_for_cost (loop_vinfo);
2458  if (estimated_vf == 2)
2459    estimated_vf = 3;
2460  prob_prolog = prob_epilog = profile_probability::guessed_always ()
2461			.apply_scale (estimated_vf - 1, estimated_vf);
2462
2463  struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2464  struct loop *first_loop = loop;
2465  bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2466  create_lcssa_for_virtual_phi (loop);
2467  update_ssa (TODO_update_ssa_only_virtuals);
2468
2469  if (MAY_HAVE_DEBUG_BIND_STMTS)
2470    {
2471      gcc_assert (!adjust_vec.exists ());
2472      adjust_vec.create (32);
2473    }
2474  initialize_original_copy_tables ();
2475
2476  /* Record the anchor bb at which the guard should be placed if the scalar
2477     loop might be preferred.  */
2478  basic_block anchor = loop_preheader_edge (loop)->src;
2479
2480  /* Generate the number of iterations for the prolog loop.  We do this here
2481     so that we can also get the upper bound on the number of iterations.  */
2482  tree niters_prolog;
2483  int bound_prolog = 0;
2484  if (prolog_peeling)
2485    niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2486						 &bound_prolog);
2487  else
2488    niters_prolog = build_int_cst (type, 0);
2489
2490  /* Prolog loop may be skipped.  */
2491  bool skip_prolog = (prolog_peeling != 0);
2492  /* Skip to the epilog if the scalar loop may be preferred.  This is only
2493     needed when we peel for the epilog loop and the check hasn't already
2494     been done by loop versioning.  */
2495  bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2496		      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2497				  bound_prolog + bound_epilog)
2498		      : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2499  /* If the number of iterations of the epilog loop is known at compile time,
2500     the epilog loop must be executed and needs no guard; otherwise we add a
2501     check at the end of the vector loop that skips to the end of the epilog loop.  */
2502  bool skip_epilog = (prolog_peeling < 0
2503		      || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2504		      || !vf.is_constant ());
2505  /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2506  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2507    skip_epilog = false;
2508
2509  if (skip_vector)
2510    {
2511      split_edge (loop_preheader_edge (loop));
2512
2513      /* Due to the order in which we peel prolog and epilog, we first
2514	 propagate probability to the whole loop.  The purpose is to
2515	 avoid adjusting probabilities of both prolog and vector loops
2516	 separately.  Note in this case, the probability of epilog loop
2517	 needs to be scaled back later.  */
2518      basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2519      if (prob_vector.initialized_p ())
2520	{
2521	  scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2522	  scale_loop_profile (loop, prob_vector, 0);
2523	}
2524    }
2525
2526  dump_user_location_t loop_loc = find_loop_location (loop);
2527  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2528  if (prolog_peeling)
2529    {
2530      e = loop_preheader_edge (loop);
2531      if (!slpeel_can_duplicate_loop_p (loop, e))
2532	{
2533	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2534			   "loop can't be duplicated to preheader edge.\n");
2535	  gcc_unreachable ();
2536	}
2537      /* Peel prolog and put it on preheader edge of loop.  */
2538      prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2539      if (!prolog)
2540	{
2541	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2542			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2543	  gcc_unreachable ();
2544	}
2545      prolog->force_vectorize = false;
2546      slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2547      first_loop = prolog;
2548      reset_original_copy_tables ();
2549
2550      /* Update the number of iterations for prolog loop.  */
2551      tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2552      vect_set_loop_condition (prolog, NULL, niters_prolog,
2553			       step_prolog, NULL_TREE, false);
2554
2555      /* Skip the prolog loop.  */
2556      if (skip_prolog)
2557	{
2558	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2559				    niters_prolog, build_int_cst (type, 0));
2560	  guard_bb = loop_preheader_edge (prolog)->src;
2561	  basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2562	  guard_to = split_edge (loop_preheader_edge (loop));
2563	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2564					   guard_to, guard_bb,
2565					   prob_prolog.invert (),
2566					   irred_flag);
2567	  e = EDGE_PRED (guard_to, 0);
2568	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2569	  slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2570
2571	  scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2572	  scale_loop_profile (prolog, prob_prolog, bound_prolog);
2573	}
2574      /* Update init address of DRs.  */
2575      vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2576      /* Update niters for vector loop.  */
2577      LOOP_VINFO_NITERS (loop_vinfo)
2578	= fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2579      LOOP_VINFO_NITERSM1 (loop_vinfo)
2580	= fold_build2 (MINUS_EXPR, type,
2581		       LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2582      bool new_var_p = false;
2583      niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2584      /* It's guaranteed that vector loop bound before vectorization is at
2585	 least VF, so set range information for newly generated var.  */
2586      if (new_var_p)
2587	set_range_info (niters, VR_RANGE,
2588			wi::to_wide (build_int_cst (type, vf)),
2589			wi::to_wide (TYPE_MAX_VALUE (type)));
2590
2591      /* Prolog iterates at most bound_prolog times, latch iterates at
2592	 most bound_prolog - 1 times.  */
2593      record_niter_bound (prolog, bound_prolog - 1, false, true);
2594      delete_update_ssa ();
2595      adjust_vec_debug_stmts ();
2596      scev_reset ();
2597    }
2598
2599  if (epilog_peeling)
2600    {
2601      e = single_exit (loop);
2602      if (!slpeel_can_duplicate_loop_p (loop, e))
2603	{
2604	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2605			   "loop can't be duplicated to exit edge.\n");
2606	  gcc_unreachable ();
2607	}
2608      /* Peel epilog and put it on exit edge of loop.  */
2609      epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2610      if (!epilog)
2611	{
2612	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2613			   "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2614	  gcc_unreachable ();
2615	}
2616      epilog->force_vectorize = false;
2617      slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2618
2619      /* The scalar version of the loop may be preferred.  In this case, add
2620	 a guard and skip to the epilog.  Note this only happens when the
2621	 number of iterations of the loop is unknown at compile time; otherwise
2622	 the loop wouldn't be vectorized.  */
2623      if (skip_vector)
2624	{
2625	  /* Additional epilogue iteration is peeled if gap exists.  */
2626	  tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2627						bound_prolog, bound_epilog,
2628						th, &bound_scalar,
2629						check_profitability);
2630	  /* Build guard against NITERSM1 since NITERS may overflow.  */
2631	  guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2632	  guard_bb = anchor;
2633	  guard_to = split_edge (loop_preheader_edge (epilog));
2634	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2635					   guard_to, guard_bb,
2636					   prob_vector.invert (),
2637					   irred_flag);
2638	  e = EDGE_PRED (guard_to, 0);
2639	  e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2640	  slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2641
2642	  /* Simply propagate profile info from guard_bb to guard_to which is
2643	     a merge point of control flow.  */
2644	  guard_to->count = guard_bb->count;
2645
2646	  /* Scale probability of epilog loop back.
2647	     FIXME: We should avoid scaling down and back up.  Profile may
2648	     get lost if we scale down to 0.  */
2649	  basic_block *bbs = get_loop_body (epilog);
2650	  for (unsigned int i = 0; i < epilog->num_nodes; i++)
2651	    bbs[i]->count = bbs[i]->count.apply_scale
2652				 (bbs[i]->count,
2653				  bbs[i]->count.apply_probability
2654				    (prob_vector));
2655	  free (bbs);
2656	}
2657
2658      basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2659      tree niters_vector_mult_vf;
2660      /* If the loop is peeled a non-zero constant number of times, niters now
2661	 refers to orig_niters - prolog_peeling; it won't overflow even if
2662	 orig_niters overflows.  */
2663      niters_no_overflow |= (prolog_peeling > 0);
2664      vect_gen_vector_loop_niters (loop_vinfo, niters,
2665				   niters_vector, step_vector,
2666				   niters_no_overflow);
2667      if (!integer_onep (*step_vector))
2668	{
2669	  /* On exit from the loop we will have an easy way of calculating
2670	     NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2671	     until then.  */
2672	  niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2673	  SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2674	  *niters_vector_mult_vf_var = niters_vector_mult_vf;
2675	}
2676      else
2677	vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2678					     &niters_vector_mult_vf);
2679      /* Update IVs of original loop as if they were advanced by
2680	 niters_vector_mult_vf steps.  */
2681      gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2682      edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2683      vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2684					update_e);
2685
2686      if (skip_epilog)
2687	{
2688	  guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2689				    niters, niters_vector_mult_vf);
2690	  guard_bb = single_exit (loop)->dest;
2691	  guard_to = split_edge (single_exit (epilog));
2692	  guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2693					   skip_vector ? anchor : guard_bb,
2694					   prob_epilog.invert (),
2695					   irred_flag);
2696	  slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2697					      single_exit (epilog));
2698	  /* Only need to handle basic block before epilog loop if it's not
2699	     the guard_bb, which is the case when skip_vector is true.  */
2700	  if (guard_bb != bb_before_epilog)
2701	    {
2702	      prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2703
2704	      scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2705	    }
2706	  scale_loop_profile (epilog, prob_epilog, 0);
2707	}
2708      else
2709	slpeel_update_phi_nodes_for_lcssa (epilog);
2710
2711      unsigned HOST_WIDE_INT bound;
2712      if (bound_scalar.is_constant (&bound))
2713	{
2714	  gcc_assert (bound != 0);
2715	  /* -1 to convert loop iterations to latch iterations.  */
2716	  record_niter_bound (epilog, bound - 1, false, true);
2717	}
2718
2719      delete_update_ssa ();
2720      adjust_vec_debug_stmts ();
2721      scev_reset ();
2722    }
2723  adjust_vec.release ();
2724  free_original_copy_tables ();
2725
2726  return epilog;
2727}
2728
2729/* Function vect_create_cond_for_niters_checks.
2730
2731   Create a conditional expression that represents the run-time checks for
2732   loop's niter.  The loop is guaranteed to terminate if the run-time
2733   checks hold.
2734
2735   Input:
2736   COND_EXPR  - input conditional expression.  New conditions will be chained
2737		with logical AND operation.  If it is NULL, then the function
2738		with logical AND operation.  If it is NULL, the niters check
2739		becomes the whole condition.
2740   LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the assumptions
2741		under which the niter analysis of the loop is valid.
2742   Output:
2743   COND_EXPR - conditional expression.
2744
2745   The returned COND_EXPR is the conditional expression to be used in the
2746   if statement that controls which version of the loop gets executed at
2747   runtime.  */
2748
2749static void
2750vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2751{
2752  tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2753
2754  if (*cond_expr)
2755    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2756			      *cond_expr, part_cond_expr);
2757  else
2758    *cond_expr = part_cond_expr;
2759}
2760
2761/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2762   and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2763
2764static void
2765chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2766{
2767  if (*cond_expr)
2768    *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2769			      *cond_expr, part_cond_expr);
2770  else
2771    *cond_expr = part_cond_expr;
2772}
2773
2774/* Function vect_create_cond_for_align_checks.
2775
2776   Create a conditional expression that represents the alignment checks for
2777   all of data references (array element references) whose alignment must be
2778   checked at runtime.
2779
2780   Input:
2781   COND_EXPR  - input conditional expression.  New conditions will be chained
2782                with logical AND operation.
2783   LOOP_VINFO - two fields of the loop information are used.
2784                LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2785                LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2786
2787   Output:
2788   COND_EXPR_STMT_LIST - statements needed to construct the conditional
2789                         expression.
2790   The returned value is the conditional expression to be used in the if
2791   statement that controls which version of the loop gets executed at runtime.
2792
2793   The algorithm makes two assumptions:
2794     1) The number of bytes "n" in a vector is a power of 2.
2795     2) An address "a" is aligned if a%n is zero, and this test
2796        can be done as a&(n-1) == 0.  For example, for 16-byte
2797        vectors the test is a&0xf == 0.  */
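/* For example, with two data references whose first-vector addresses are
   a1 and a2 and with 16-byte vectors (mask = 0xf), the code below builds

     orptrs = (ptrsize int) a1 | (ptrsize int) a2;
     andmask = orptrs & 0xf;

   and the resulting condition is andmask == 0, i.e. both addresses are
   16-byte aligned.  */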
2798
2799static void
2800vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2801                                   tree *cond_expr,
2802				   gimple_seq *cond_expr_stmt_list)
2803{
2804  vec<stmt_vec_info> may_misalign_stmts
2805    = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2806  stmt_vec_info stmt_info;
2807  int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2808  tree mask_cst;
2809  unsigned int i;
2810  tree int_ptrsize_type;
2811  char tmp_name[20];
2812  tree or_tmp_name = NULL_TREE;
2813  tree and_tmp_name;
2814  gimple *and_stmt;
2815  tree ptrsize_zero;
2816  tree part_cond_expr;
2817
2818  /* Check that mask is one less than a power of 2, i.e., mask is
2819     all zeros followed by all ones.  */
2820  gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2821
2822  int_ptrsize_type = signed_type_for (ptr_type_node);
2823
2824  /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2825     of the first vector of the i'th data reference.  */
2826
2827  FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2828    {
2829      gimple_seq new_stmt_list = NULL;
2830      tree addr_base;
2831      tree addr_tmp_name;
2832      tree new_or_tmp_name;
2833      gimple *addr_stmt, *or_stmt;
2834      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2835      bool negative = tree_int_cst_compare
2836	(DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2837      tree offset = negative
2838	? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2839
2840      /* create: addr_tmp = (int)(address_of_first_vector) */
2841      addr_base =
2842	vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2843					      offset);
2844      if (new_stmt_list != NULL)
2845	gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2846
2847      sprintf (tmp_name, "addr2int%d", i);
2848      addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2849      addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2850      gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2851
2852      /* The addresses are ORed together.  */
2853
2854      if (or_tmp_name != NULL_TREE)
2855        {
2856          /* create: or_tmp = or_tmp | addr_tmp */
2857          sprintf (tmp_name, "orptrs%d", i);
2858	  new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2859	  or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2860					 or_tmp_name, addr_tmp_name);
2861	  gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2862          or_tmp_name = new_or_tmp_name;
2863        }
2864      else
2865        or_tmp_name = addr_tmp_name;
2866
2867    } /* end for i */
2868
2869  mask_cst = build_int_cst (int_ptrsize_type, mask);
2870
2871  /* create: and_tmp = or_tmp & mask  */
2872  and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2873
2874  and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2875				  or_tmp_name, mask_cst);
2876  gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2877
2878  /* Make and_tmp the left operand of the conditional test against zero.
2879     if and_tmp has a nonzero bit then some address is unaligned.  */
2880  ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2881  part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2882				and_tmp_name, ptrsize_zero);
2883  chain_cond_expr (cond_expr, part_cond_expr);
2884}
2885
2886/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2887   create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2888   Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2889   and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2890
2891static void
2892vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2893{
2894  vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2895  unsigned int i;
2896  vec_object_pair *pair;
2897  FOR_EACH_VEC_ELT (pairs, i, pair)
2898    {
2899      tree addr1 = build_fold_addr_expr (pair->first);
2900      tree addr2 = build_fold_addr_expr (pair->second);
2901      tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2902					 addr1, addr2);
2903      chain_cond_expr (cond_expr, part_cond_expr);
2904    }
2905}
2906
2907/* Create an expression that is true when all lower-bound conditions for
2908   the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
2909
2910static void
2911vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2912{
2913  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2914  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2915    {
2916      tree expr = lower_bounds[i].expr;
2917      tree type = unsigned_type_for (TREE_TYPE (expr));
2918      expr = fold_convert (type, expr);
2919      poly_uint64 bound = lower_bounds[i].min_value;
2920      if (!lower_bounds[i].unsigned_p)
2921	{
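	  /* The test works in the unsigned TYPE: adding BOUND - 1 maps the
	     signed values in (-BOUND, BOUND) into [0, 2 * BOUND - 2] and
	     every value of larger magnitude to something >= 2 * BOUND - 1,
	     so the GE_EXPR built afterwards, which compares against
	     2 * BOUND - 1, accepts exactly the EXPRs whose magnitude is at
	     least BOUND.  */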
2922	  expr = fold_build2 (PLUS_EXPR, type, expr,
2923			      build_int_cstu (type, bound - 1));
2924	  bound += bound - 1;
2925	}
2926      tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2927					 build_int_cstu (type, bound));
2928      chain_cond_expr (cond_expr, part_cond_expr);
2929    }
2930}
2931
2932/* Function vect_create_cond_for_alias_checks.
2933
2934   Create a conditional expression that represents the run-time checks for
2935   overlapping of address ranges represented by a list of data references
2936   relations passed as input.
2937
2938   Input:
2939   COND_EXPR  - input conditional expression.  New conditions will be chained
2940                with logical AND operation.  If it is NULL, then the function
2941                is used to return the number of alias checks.
2942   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2943	        to be checked.
2944
2945   Output:
2946   COND_EXPR - conditional expression.
2947
2948   The returned COND_EXPR is the conditional expression to be used in the if
2949   statement that controls which version of the loop gets executed at runtime.
2950*/
2951
2952void
2953vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2954{
2955  vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2956    LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2957
2958  if (comp_alias_ddrs.is_empty ())
2959    return;
2960
2961  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2962			       &comp_alias_ddrs, cond_expr);
2963  if (dump_enabled_p ())
2964    dump_printf_loc (MSG_NOTE, vect_location,
2965		     "created %u versioning for alias checks.\n",
2966		     comp_alias_ddrs.length ());
2967}
2968
2969
2970/* Function vect_loop_versioning.
2971
2972   If the loop has data references that may or may not be aligned and/or
2973   has data reference relations whose independence was not proven, then
2974   two versions of the loop need to be generated, one which is vectorized
2975   and one which isn't.  A test is then generated to control which of the
2976   loops is executed.  The test checks for the alignment of all of the
2977   data references that may or may not be aligned.  An additional
2978   sequence of runtime tests is generated for each pair of DDRs whose
2979   independence was not proven.  The vectorized version of the loop is
2980   executed only if both the alias and alignment tests pass.
2981
2982   The test generated to check which version of loop is executed
2983   is modified to also check for profitability as indicated by the
2984   cost model threshold TH.
2985
2986   The versioning precondition(s) are placed in *COND_EXPR and
2987   *COND_EXPR_STMT_LIST.  */
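/* A rough sketch of the resulting guard (illustrative; the last conjunct
   stands for the range-overlap checks built by create_runtime_alias_checks):

     if (niters_m1 >= th - 1
	 && ((addr1 | addr2) & ptr_mask) == 0
	 && &A != &B
	 && <segments of A and B do not overlap>)
       vectorized loop
     else
       scalar loop  */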
2988
2989struct loop *
2990vect_loop_versioning (loop_vec_info loop_vinfo,
2991		      unsigned int th, bool check_profitability,
2992		      poly_uint64 versioning_threshold)
2993{
2994  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2995  struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2996  basic_block condition_bb;
2997  gphi_iterator gsi;
2998  gimple_stmt_iterator cond_exp_gsi;
2999  basic_block merge_bb;
3000  basic_block new_exit_bb;
3001  edge new_exit_e, e;
3002  gphi *orig_phi, *new_phi;
3003  tree cond_expr = NULL_TREE;
3004  gimple_seq cond_expr_stmt_list = NULL;
3005  tree arg;
3006  profile_probability prob = profile_probability::likely ();
3007  gimple_seq gimplify_stmt_list = NULL;
3008  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
3009  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
3010  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
3011  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
3012  tree version_simd_if_cond
3013    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
3014
3015  if (check_profitability)
3016    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3017			     build_int_cst (TREE_TYPE (scalar_loop_iters),
3018					    th - 1));
3019  if (maybe_ne (versioning_threshold, 0U))
3020    {
3021      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3022			       build_int_cst (TREE_TYPE (scalar_loop_iters),
3023					      versioning_threshold - 1));
3024      if (cond_expr)
3025	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3026				 expr, cond_expr);
3027      else
3028	cond_expr = expr;
3029    }
3030
3031  if (version_niter)
3032    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3033
3034  if (cond_expr)
3035    cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3036					is_gimple_condexpr, NULL_TREE);
3037
3038  if (version_align)
3039    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3040				       &cond_expr_stmt_list);
3041
3042  if (version_alias)
3043    {
3044      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3045      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3046      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3047    }
3048
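  /* VERSION_SIMD_IF_COND, when set, presumably comes from the if () clause
     of an OpenMP simd construct: the vectorized version may only run when
     that value is nonzero at runtime, so the test is ANDed into COND_EXPR
     below.  */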
3049  if (version_simd_if_cond)
3050    {
3051      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3052      if (flag_checking)
3053	if (basic_block bb
3054	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3055	  gcc_assert (bb != loop->header
3056		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3057		      && (scalar_loop == NULL
3058			  || (bb != scalar_loop->header
3059			      && dominated_by_p (CDI_DOMINATORS,
3060						 scalar_loop->header, bb))));
3061      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3062      tree c = fold_build2 (NE_EXPR, boolean_type_node,
3063			    version_simd_if_cond, zero);
3064      if (cond_expr)
3065        cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3066				 c, cond_expr);
3067      else
3068        cond_expr = c;
3069      if (dump_enabled_p ())
3070	dump_printf_loc (MSG_NOTE, vect_location,
3071			 "created versioning for simd if condition check.\n");
3072    }
3073
3074  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3075				      &gimplify_stmt_list,
3076				      is_gimple_condexpr, NULL_TREE);
3077  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3078
3079  initialize_original_copy_tables ();
3080  if (scalar_loop)
3081    {
3082      edge scalar_e;
3083      basic_block preheader, scalar_preheader;
3084
3085      /* We don't want to scale SCALAR_LOOP's frequencies, we need to
3086	 scale LOOP's frequencies instead.  */
3087      nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3088			    prob, prob.invert (), prob, prob.invert (), true);
3089      scale_loop_frequencies (loop, prob);
3090      /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3091	 while we need to move it above LOOP's preheader.  */
3092      e = loop_preheader_edge (loop);
3093      scalar_e = loop_preheader_edge (scalar_loop);
3094      /* The vector loop preheader might not be empty, since new
3095	 invariants could have been created while analyzing the loop.  */
3096      gcc_assert (single_pred_p (e->src));
3097      gcc_assert (empty_block_p (scalar_e->src)
3098		  && single_pred_p (scalar_e->src));
3099      gcc_assert (single_pred_p (condition_bb));
3100      preheader = e->src;
3101      scalar_preheader = scalar_e->src;
3102      scalar_e = find_edge (condition_bb, scalar_preheader);
3103      e = single_pred_edge (preheader);
3104      redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3105				      scalar_preheader);
3106      redirect_edge_and_branch_force (scalar_e, preheader);
3107      redirect_edge_and_branch_force (e, condition_bb);
3108      set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3109			       single_pred (condition_bb));
3110      set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3111			       single_pred (scalar_preheader));
3112      set_immediate_dominator (CDI_DOMINATORS, preheader,
3113			       condition_bb);
3114    }
3115  else
3116    nloop = loop_version (loop, cond_expr, &condition_bb,
3117			  prob, prob.invert (), prob, prob.invert (), true);
3118
3119  if (version_niter)
3120    {
3121      /* The versioned loop could be infinite, so we need to clear the
3122	 existing niter information, which was copied from the original loop.  */
3123      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3124      vect_free_loop_info_assumptions (nloop);
3125      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3126      loop_constraint_set (loop, LOOP_C_INFINITE);
3127    }
3128
3129  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3130      && dump_enabled_p ())
3131    {
3132      if (version_alias)
3133        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3134			 vect_location,
3135                         "loop versioned for vectorization because of "
3136			 "possible aliasing\n");
3137      if (version_align)
3138        dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3139			 vect_location,
3140                         "loop versioned for vectorization to enhance "
3141			 "alignment\n");
3143    }
3144  free_original_copy_tables ();
3145
3146  /* Loop versioning violates an assumption we try to maintain during
3147     vectorization - that the loop exit block has a single predecessor.
3148     After versioning, the exit block of both loop versions is the same
3149     basic block (i.e. it has two predecessors).  In order to simplify the
3150     following transformations in the vectorizer, we fix this situation
3151     here by adding a new (empty) block on the exit-edge of the loop,
3152     with the proper loop-exit phis to maintain loop-closed-form.
3153     If loop versioning wasn't done from LOOP but from SCALAR_LOOP instead,
3154     merge_bb will already have just a single predecessor.  */
3155
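  /* Illustrative sketch of the fix-up below for the scalar_loop == NULL
     case (NLOOP is the scalar copy created by loop_version):

         before                       after

       LOOP exit  --\            LOOP exit --> new_exit_bb --\
                     merge_bb                                 merge_bb
       NLOOP exit --/            NLOOP exit -----------------/

     The PHIs created in new_exit_bb keep LOOP in loop-closed SSA form.  */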
3156  merge_bb = single_exit (loop)->dest;
3157  if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3158    {
3159      gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3160      new_exit_bb = split_edge (single_exit (loop));
3161      new_exit_e = single_exit (loop);
3162      e = EDGE_SUCC (new_exit_bb, 0);
3163
3164      for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3165	{
3166	  tree new_res;
3167	  orig_phi = gsi.phi ();
3168	  new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3169	  new_phi = create_phi_node (new_res, new_exit_bb);
3170	  arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3171	  add_phi_arg (new_phi, arg, new_exit_e,
3172		       gimple_phi_arg_location_from_edge (orig_phi, e));
3173	  adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3174	}
3175    }
3176
3177  /* End loop-exit-fixes after versioning.  */
3178
3179  if (cond_expr_stmt_list)
3180    {
3181      cond_exp_gsi = gsi_last_bb (condition_bb);
3182      gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3183			     GSI_SAME_STMT);
3184    }
3185  update_ssa (TODO_update_ssa);
3186
3187  return nloop;
3188}
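
/* A rough, hedged source-level sketch of the structure vect_loop_versioning
   produces, as referenced in its comment above.  This function is
   illustrative only and not part of GCC: alias_ok, align_ok, th and the
   other names are made up, standing in for the runtime checks accumulated
   in COND_EXPR and for the cost model threshold.  */

static void
illustrative_versioned_loop (float *a, const float *b, float x,
			     unsigned long n, int alias_ok, int align_ok,
			     unsigned long th)
{
  if (alias_ok && align_ok && n >= th)
    {
      /* The version that the vectorizer will transform (LOOP).  */
      for (unsigned long i = 0; i < n; i++)
	a[i] += x * b[i];
    }
  else
    {
      /* The scalar fallback created by loop_version (NLOOP).  */
      for (unsigned long i = 0; i < n; i++)
	a[i] += x * b[i];
    }
}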
3189