1/* Loop Vectorization
2   Copyright (C) 2003-2020 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4   Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "rtl.h"
28#include "tree.h"
29#include "gimple.h"
30#include "cfghooks.h"
31#include "tree-pass.h"
32#include "ssa.h"
33#include "optabs-tree.h"
34#include "diagnostic-core.h"
35#include "fold-const.h"
36#include "stor-layout.h"
37#include "cfganal.h"
38#include "gimplify.h"
39#include "gimple-iterator.h"
40#include "gimplify-me.h"
41#include "tree-ssa-loop-ivopts.h"
42#include "tree-ssa-loop-manip.h"
43#include "tree-ssa-loop-niter.h"
44#include "tree-ssa-loop.h"
45#include "cfgloop.h"
46#include "tree-scalar-evolution.h"
47#include "tree-vectorizer.h"
48#include "gimple-fold.h"
49#include "cgraph.h"
50#include "tree-cfg.h"
51#include "tree-if-conv.h"
52#include "internal-fn.h"
53#include "tree-vector-builder.h"
54#include "vec-perm-indices.h"
55#include "tree-eh.h"
56
57/* Loop Vectorization Pass.
58
59   This pass tries to vectorize loops.
60
61   For example, the vectorizer transforms the following simple loop:
62
63        short a[N]; short b[N]; short c[N]; int i;
64
65        for (i=0; i<N; i++){
66          a[i] = b[i] + c[i];
67        }
68
69   as if it were manually vectorized by rewriting the source code into:
70
71        typedef int __attribute__((mode(V8HI))) v8hi;
72        short a[N];  short b[N]; short c[N];   int i;
73        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74        v8hi va, vb, vc;
75
76        for (i=0; i<N/8; i++){
77          vb = pb[i];
78          vc = pc[i];
79          va = vb + vc;
80          pa[i] = va;
81        }
82
83        The main entry to this pass is vectorize_loops(), in which
84   the vectorizer applies a set of analyses on a given set of loops,
85   followed by the actual vectorization transformation for the loops that
86   had successfully passed the analysis phase.
87        Throughout this pass we make a distinction between two types of
88   data: scalars (which are represented by SSA_NAMES), and memory references
89   ("data-refs").  These two types of data require different handling both
90   during analysis and transformation. The types of data-refs that the
91   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93   accesses are required to have a simple (consecutive) access pattern.
94
95   Analysis phase:
96   ===============
97        The driver for the analysis phase is vect_analyze_loop().
98   It applies a set of analyses, some of which rely on the scalar evolution
99   analyzer (scev) developed by Sebastian Pop.
100
101        During the analysis phase the vectorizer records some information
102   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103   loop, as well as general information about the loop as a whole, which is
104   recorded in a "loop_vec_info" struct attached to each loop.
105
106   Transformation phase:
107   =====================
108        The loop transformation phase scans all the stmts in the loop, and
109   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110   the loop that needs to be vectorized.  It inserts the vector code sequence
111   just before the scalar stmt S, and records a pointer to the vector code
112   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113   attached to S).  This pointer will be used for the vectorization of subsequent
114   stmts that use the def of stmt S.  Stmt S is removed if it writes to memory;
115   otherwise, we rely on dead code elimination for removing it.
116
117        For example, say stmt S1 was vectorized into stmt VS1:
118
119   VS1: vb = px[i];
120   S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121   S2:  a = b;
122
123   To vectorize stmt S2, the vectorizer first finds the stmt that defines
124   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
126   resulting sequence would be:
127
128   VS1: vb = px[i];
129   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130   VS2: va = vb;
131   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133        Operands that are not SSA_NAMEs are data-refs that appear in
134   load/store operations (like 'x[i]' in S1), and are handled differently.
135
136   Target modeling:
137   =================
138        Currently the only target specific information that is used is the
139   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140   Targets that can support different sizes of vectors, for now will need
141   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
142   flexibility will be added in the future.
143
144        Since we only vectorize operations whose vector form can be
145   expressed using existing tree codes, to verify that an operation is
146   supported, the vectorizer checks the relevant optab at the relevant
147   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
148   the value found is CODE_FOR_nothing, then there's no target support, and
149   we can't vectorize the stmt.
150
151   For additional information on this project see:
152   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153*/
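
/* As a concrete illustration of the optab check described above, here is a
   minimal sketch (the function name is hypothetical; optab_handler,
   add_optab, V8HImode and CODE_FOR_nothing are the standard interfaces):

     static bool
     example_v8hi_add_supported_p (void)
     {
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   A CODE_FOR_nothing result means the target has no instruction for the
   operation in that mode, so a stmt needing it cannot be vectorized.  */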
154
155static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157					       bool *, bool *);
158
159/* Subroutine of vect_determine_vf_for_stmt that handles only one
160   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161   may already be set for general statements (not just data refs).  */
162
163static opt_result
164vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
165			      bool vectype_maybe_set_p,
166			      poly_uint64 *vf)
167{
168  gimple *stmt = stmt_info->stmt;
169
170  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171       && !STMT_VINFO_LIVE_P (stmt_info))
172      || gimple_clobber_p (stmt))
173    {
174      if (dump_enabled_p ())
175	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176      return opt_result::success ();
177    }
178
179  tree stmt_vectype, nunits_vectype;
180  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181						   &nunits_vectype);
182  if (!res)
183    return res;
184
185  if (stmt_vectype)
186    {
187      if (STMT_VINFO_VECTYPE (stmt_info))
188	/* The only case in which a vectype has already been set is for stmts
189	   that contain a data ref, or for "pattern-stmts" (stmts generated
190	   by the vectorizer to represent/replace a certain idiom).  */
191	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192		     || vectype_maybe_set_p)
193		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194      else
195	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
196    }
197
198  if (nunits_vectype)
199    vect_update_max_nunits (vf, nunits_vectype);
200
201  return opt_result::success ();
202}
203
204/* Subroutine of vect_determine_vectorization_factor.  Set the vector
205   types of STMT_INFO and all attached pattern statements and update
206   the vectorization factor VF accordingly.  Return true on success
207   or false if something prevented vectorization.  */
208
209static opt_result
210vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
211{
212  vec_info *vinfo = stmt_info->vinfo;
213  if (dump_enabled_p ())
214    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
215		     stmt_info->stmt);
216  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
217  if (!res)
218    return res;
219
220  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
221      && STMT_VINFO_RELATED_STMT (stmt_info))
222    {
223      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
224      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
225
226      /* If a pattern statement has def stmts, analyze them too.  */
227      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
228	   !gsi_end_p (si); gsi_next (&si))
229	{
230	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
231	  if (dump_enabled_p ())
232	    dump_printf_loc (MSG_NOTE, vect_location,
233			     "==> examining pattern def stmt: %G",
234			     def_stmt_info->stmt);
235	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
236	  if (!res)
237	    return res;
238	}
239
240      if (dump_enabled_p ())
241	dump_printf_loc (MSG_NOTE, vect_location,
242			 "==> examining pattern statement: %G",
243			 stmt_info->stmt);
244      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
245      if (!res)
246	return res;
247    }
248
249  return opt_result::success ();
250}
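
/* For example (illustrative, simplified GIMPLE): the pattern recognizer may
   replace a widening multiplication such as

     S1: a_t = (int) a_s;
     S2: b_t = (int) b_s;
     S3: prod = a_t * b_t;

   with a single pattern stmt

     S3': prod' = WIDEN_MULT_EXPR <a_s, b_s>;

   in which case the function above examines the pattern stmt and any stmts
   in its pattern def sequence, in addition to the original stmt, so that
   their vector types contribute to the vectorization factor.  */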
251
252/* Function vect_determine_vectorization_factor
253
254   Determine the vectorization factor (VF).  VF is the number of data elements
255   that are operated upon in parallel in a single iteration of the vectorized
256   loop.  For example, when vectorizing a loop that operates on 4-byte elements,
257   on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
258   elements can fit in a single vector register.
259
260   We currently support vectorization of loops in which all types operated upon
261   are of the same size.  Therefore this function currently sets VF according to
262   the size of the types operated upon, and fails if there are multiple sizes
263   in the loop.
264
265   VF is also the factor by which the loop iterations are strip-mined, e.g.:
266   original loop:
267        for (i=0; i<N; i++){
268          a[i] = b[i] + c[i];
269        }
270
271   vectorized loop:
272        for (i=0; i<N; i+=VF){
273          a[i:VF] = b[i:VF] + c[i:VF];
274        }
275*/
276
277static opt_result
278vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
279{
280  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
281  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
282  unsigned nbbs = loop->num_nodes;
283  poly_uint64 vectorization_factor = 1;
284  tree scalar_type = NULL_TREE;
285  gphi *phi;
286  tree vectype;
287  stmt_vec_info stmt_info;
288  unsigned i;
289
290  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
291
292  for (i = 0; i < nbbs; i++)
293    {
294      basic_block bb = bbs[i];
295
296      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
297	   gsi_next (&si))
298	{
299	  phi = si.phi ();
300	  stmt_info = loop_vinfo->lookup_stmt (phi);
301	  if (dump_enabled_p ())
302	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
303			     phi);
304
305	  gcc_assert (stmt_info);
306
307	  if (STMT_VINFO_RELEVANT_P (stmt_info)
308	      || STMT_VINFO_LIVE_P (stmt_info))
309            {
310	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
311              scalar_type = TREE_TYPE (PHI_RESULT (phi));
312
313	      if (dump_enabled_p ())
314		dump_printf_loc (MSG_NOTE, vect_location,
315				 "get vectype for scalar type:  %T\n",
316				 scalar_type);
317
318	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
319	      if (!vectype)
320		return opt_result::failure_at (phi,
321					       "not vectorized: unsupported "
322					       "data-type %T\n",
323					       scalar_type);
324	      STMT_VINFO_VECTYPE (stmt_info) = vectype;
325
326	      if (dump_enabled_p ())
327		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
328				 vectype);
329
330	      if (dump_enabled_p ())
331		{
332		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
333		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
334		  dump_printf (MSG_NOTE, "\n");
335		}
336
337	      vect_update_max_nunits (&vectorization_factor, vectype);
338	    }
339	}
340
341      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
342	   gsi_next (&si))
343	{
344	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
345	  opt_result res
346	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
347	  if (!res)
348	    return res;
349        }
350    }
351
352  /* TODO: Analyze cost. Decide if worth while to vectorize.  */
353  if (dump_enabled_p ())
354    {
355      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
356      dump_dec (MSG_NOTE, vectorization_factor);
357      dump_printf (MSG_NOTE, "\n");
358    }
359
360  if (known_le (vectorization_factor, 1U))
361    return opt_result::failure_at (vect_location,
362				   "not vectorized: unsupported data-type\n");
363  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
364  return opt_result::success ();
365}
366
367
368/* Function vect_is_simple_iv_evolution.
369
370   FORNOW: A simple evolution of an induction variable in the loop is
371   considered a polynomial evolution.  */
372
373static bool
374vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
375                             tree * step)
376{
377  tree init_expr;
378  tree step_expr;
379  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
380  basic_block bb;
381
382  /* When there is no evolution in this loop, the evolution function
383     is not "simple".  */
384  if (evolution_part == NULL_TREE)
385    return false;
386
387  /* When the evolution is a polynomial of degree >= 2
388     the evolution function is not "simple".  */
389  if (tree_is_chrec (evolution_part))
390    return false;
391
392  step_expr = evolution_part;
393  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
394
395  if (dump_enabled_p ())
396    dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
397		     step_expr, init_expr);
398
399  *init = init_expr;
400  *step = step_expr;
401
402  if (TREE_CODE (step_expr) != INTEGER_CST
403      && (TREE_CODE (step_expr) != SSA_NAME
404	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
405	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
406	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
407	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
408		  || !flag_associative_math)))
409      && (TREE_CODE (step_expr) != REAL_CST
410	  || !flag_associative_math))
411    {
412      if (dump_enabled_p ())
413        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414                         "step unknown.\n");
415      return false;
416    }
417
418  return true;
419}
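
/* For example (illustrative): for the canonical counter

     i_1 = PHI <0(preheader), i_2(latch)>;
     ...
     i_2 = i_1 + 1;

   the access function computed by scev is the chrec {0, +, 1}_loop, so the
   function above returns INIT == 0 and STEP == 1.  A step that is itself
   defined inside the loop, or an evolution of degree >= 2 such as
   {0, +, {1, +, 1}_loop}_loop, is rejected as "not simple".  */
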
420
421/* Return true if PHI, described by STMT_INFO, is the inner PHI in
422   what we are assuming is a double reduction.  For example, given
423   a structure like this:
424
425      outer1:
426	x_1 = PHI <x_4(outer2), ...>;
427	...
428
429      inner:
430	x_2 = PHI <x_1(outer1), ...>;
431	...
432	x_3 = ...;
433	...
434
435      outer2:
436	x_4 = PHI <x_3(inner)>;
437	...
438
439   outer loop analysis would treat x_1 as a double reduction phi and
440   this function would then return true for x_2.  */
441
442static bool
443vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
444{
445  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
446  use_operand_p use_p;
447  ssa_op_iter op_iter;
448  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
449    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
450      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
451	return true;
452  return false;
453}
454
455/* Function vect_analyze_scalar_cycles_1.
456
457   Examine the cross iteration def-use cycles of scalar variables
458   in LOOP.  LOOP_VINFO represents the loop that is now being
459   considered for vectorization (can be LOOP, or an outer-loop
460   enclosing LOOP).  */
461
462static void
463vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
464{
465  basic_block bb = loop->header;
466  tree init, step;
467  auto_vec<stmt_vec_info, 64> worklist;
468  gphi_iterator gsi;
469  bool double_reduc, reduc_chain;
470
471  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
472
473  /* First - identify all inductions.  Reduction detection assumes that all the
474     inductions have been identified; therefore, this order must not be
475     changed.  */
476  for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
477    {
478      gphi *phi = gsi.phi ();
479      tree access_fn = NULL;
480      tree def = PHI_RESULT (phi);
481      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
482
483      if (dump_enabled_p ())
484	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
485
486      /* Skip virtual phi's.  The data dependences that are associated with
487         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
488      if (virtual_operand_p (def))
489	continue;
490
491      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
492
493      /* Analyze the evolution function.  */
494      access_fn = analyze_scalar_evolution (loop, def);
495      if (access_fn)
496	{
497	  STRIP_NOPS (access_fn);
498	  if (dump_enabled_p ())
499	    dump_printf_loc (MSG_NOTE, vect_location,
500			     "Access function of PHI: %T\n", access_fn);
501	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
502	    = initial_condition_in_loop_num (access_fn, loop->num);
503	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
504	    = evolution_part_in_loop_num (access_fn, loop->num);
505	}
506
507      if (!access_fn
508	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
509	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
510	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
511	      && TREE_CODE (step) != INTEGER_CST))
512	{
513	  worklist.safe_push (stmt_vinfo);
514	  continue;
515	}
516
517      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
518		  != NULL_TREE);
519      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
520
521      if (dump_enabled_p ())
522	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
523      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
524    }
525
526
527  /* Second - identify all reductions and nested cycles.  */
528  while (worklist.length () > 0)
529    {
530      stmt_vec_info stmt_vinfo = worklist.pop ();
531      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
532      tree def = PHI_RESULT (phi);
533
534      if (dump_enabled_p ())
535	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
536
537      gcc_assert (!virtual_operand_p (def)
538		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
539
540      stmt_vec_info reduc_stmt_info
541	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
542				    &reduc_chain);
543      if (reduc_stmt_info)
544        {
545	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
546	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
547	  if (double_reduc)
548	    {
549	      if (dump_enabled_p ())
550		dump_printf_loc (MSG_NOTE, vect_location,
551				 "Detected double reduction.\n");
552
553              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
554	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
555            }
556          else
557            {
558              if (loop != LOOP_VINFO_LOOP (loop_vinfo))
559                {
560                  if (dump_enabled_p ())
561                    dump_printf_loc (MSG_NOTE, vect_location,
562				     "Detected vectorizable nested cycle.\n");
563
564                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
565                }
566              else
567                {
568                  if (dump_enabled_p ())
569                    dump_printf_loc (MSG_NOTE, vect_location,
570				     "Detected reduction.\n");
571
572                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
573		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
574                  /* Store the reduction cycles for possible vectorization in
575                     loop-aware SLP if it was not detected as a reduction
576		     chain.  */
577		  if (! reduc_chain)
578		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
579		      (reduc_stmt_info);
580                }
581            }
582        }
583      else
584        if (dump_enabled_p ())
585          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
586			   "Unknown def-use cycle pattern.\n");
587    }
588}
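
/* For example (illustrative GIMPLE): a summation

     # sum_1 = PHI <0(preheader), sum_2(latch)>
     ...
     sum_2 = _5 + sum_1;

   has no simple induction evolution, so the phi lands on the worklist above
   and is then classified by vect_is_simple_reduction as a
   vect_reduction_def, with the stmt defining sum_2 recorded as the
   reduction statement.  */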
589
590
591/* Function vect_analyze_scalar_cycles.
592
593   Examine the cross iteration def-use cycles of scalar variables, by
594   analyzing the loop-header PHIs of scalar variables.  Classify each
595   cycle as one of the following: invariant, induction, reduction, unknown.
596   We do that for the loop represented by LOOP_VINFO, and also for its
597   inner-loop, if it exists.
598   Examples for scalar cycles:
599
600   Example1: reduction:
601
602              loop1:
603              for (i=0; i<N; i++)
604                 sum += a[i];
605
606   Example2: induction:
607
608              loop2:
609              for (i=0; i<N; i++)
610                 a[i] = i;  */
611
612static void
613vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
614{
615  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
616
617  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
618
619  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
620     Reductions in such an inner-loop therefore have different properties than
621     the reductions in the nest that gets vectorized:
622     1. When vectorized, they are executed in the same order as in the original
623        scalar loop, so we can't change the order of computation when
624        vectorizing them.
625     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
626        current checks are too strict.  */
627
628  if (loop->inner)
629    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
630}
631
632/* Transfer group and reduction information from STMT_INFO to its
633   pattern stmt.  */
634
635static void
636vect_fixup_reduc_chain (stmt_vec_info stmt_info)
637{
638  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
639  stmt_vec_info stmtp;
640  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
641	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
642  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
643  do
644    {
645      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
646      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
647			   == STMT_VINFO_DEF_TYPE (stmt_info));
648      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
649      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
650      if (stmt_info)
651	REDUC_GROUP_NEXT_ELEMENT (stmtp)
652	  = STMT_VINFO_RELATED_STMT (stmt_info);
653    }
654  while (stmt_info);
655}
656
657/* Fixup scalar cycles that now have their stmts detected as patterns.  */
658
659static void
660vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
661{
662  stmt_vec_info first;
663  unsigned i;
664
665  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
666    if (STMT_VINFO_IN_PATTERN_P (first))
667      {
668	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
669	while (next)
670	  {
671	    if (! STMT_VINFO_IN_PATTERN_P (next)
672		|| STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
673	      break;
674	    next = REDUC_GROUP_NEXT_ELEMENT (next);
675	  }
676	/* If not all stmts in the chain are patterns or if we failed
677	   to update STMT_VINFO_REDUC_IDX, try to handle the chain
678	   without patterns.  */
679	if (! next
680	    && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
681	  {
682	    vect_fixup_reduc_chain (first);
683	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
684	      = STMT_VINFO_RELATED_STMT (first);
685	  }
686      }
687}
688
689/* Function vect_get_loop_niters.
690
691   Determine how many iterations the loop executes and place it
692   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
693   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
694   niter information holds in ASSUMPTIONS.
695
696   Return the loop exit condition.  */
697
698
699static gcond *
700vect_get_loop_niters (class loop *loop, tree *assumptions,
701		      tree *number_of_iterations, tree *number_of_iterationsm1)
702{
703  edge exit = single_exit (loop);
704  class tree_niter_desc niter_desc;
705  tree niter_assumptions, niter, may_be_zero;
706  gcond *cond = get_loop_exit_condition (loop);
707
708  *assumptions = boolean_true_node;
709  *number_of_iterationsm1 = chrec_dont_know;
710  *number_of_iterations = chrec_dont_know;
711  DUMP_VECT_SCOPE ("get_loop_niters");
712
713  if (!exit)
714    return cond;
715
716  may_be_zero = NULL_TREE;
717  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
718      || chrec_contains_undetermined (niter_desc.niter))
719    return cond;
720
721  niter_assumptions = niter_desc.assumptions;
722  may_be_zero = niter_desc.may_be_zero;
723  niter = niter_desc.niter;
724
725  if (may_be_zero && integer_zerop (may_be_zero))
726    may_be_zero = NULL_TREE;
727
728  if (may_be_zero)
729    {
730      if (COMPARISON_CLASS_P (may_be_zero))
731	{
732	  /* Try to combine may_be_zero with assumptions, this can simplify
733	     computation of niter expression.  */
734	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
735	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
736					     niter_assumptions,
737					     fold_build1 (TRUTH_NOT_EXPR,
738							  boolean_type_node,
739							  may_be_zero));
740	  else
741	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
742				 build_int_cst (TREE_TYPE (niter), 0),
743				 rewrite_to_non_trapping_overflow (niter));
744
745	  may_be_zero = NULL_TREE;
746	}
747      else if (integer_nonzerop (may_be_zero))
748	{
749	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
750	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
751	  return cond;
752	}
753      else
754	return cond;
755    }
756
757  *assumptions = niter_assumptions;
758  *number_of_iterationsm1 = niter;
759
760  /* We want the number of loop header executions, which is the number
761     of latch executions plus one.
762     ???  For UINT_MAX latch executions this number overflows to zero
763     for loops like do { n++; } while (n != 0);  */
764  if (niter && !chrec_contains_undetermined (niter))
765    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
766			  build_int_cst (TREE_TYPE (niter), 1));
767  *number_of_iterations = niter;
768
769  return cond;
770}
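
/* For example (illustrative): in the do-while form required by the
   vectorizer, a loop whose body executes N times runs its latch N - 1
   times, so this function returns NUMBER_OF_ITERATIONSM1 == N - 1 and
   NUMBER_OF_ITERATIONS == N (the number of header executions).  For
   "for (i = 0; i < n; i++)" with n known to be nonzero that is n - 1 and n
   respectively.  */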
771
772/* Function bb_in_loop_p
773
774   Used as predicate for dfs order traversal of the loop bbs.  */
775
776static bool
777bb_in_loop_p (const_basic_block bb, const void *data)
778{
779  const class loop *const loop = (const class loop *)data;
780  if (flow_bb_inside_loop_p (loop, bb))
781    return true;
782  return false;
783}
784
785
786/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
787   stmt_vec_info structs for all the stmts in LOOP_IN.  */
788
789_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
790  : vec_info (vec_info::loop, init_cost (loop_in), shared),
791    loop (loop_in),
792    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
793    num_itersm1 (NULL_TREE),
794    num_iters (NULL_TREE),
795    num_iters_unchanged (NULL_TREE),
796    num_iters_assumptions (NULL_TREE),
797    th (0),
798    versioning_threshold (0),
799    vectorization_factor (0),
800    max_vectorization_factor (0),
801    mask_skip_niters (NULL_TREE),
802    mask_compare_type (NULL_TREE),
803    simd_if_cond (NULL_TREE),
804    unaligned_dr (NULL),
805    peeling_for_alignment (0),
806    ptr_mask (0),
807    ivexpr_map (NULL),
808    scan_map (NULL),
809    slp_unrolling_factor (1),
810    single_scalar_iteration_cost (0),
811    vec_outside_cost (0),
812    vec_inside_cost (0),
813    vectorizable (false),
814    can_fully_mask_p (true),
815    fully_masked_p (false),
816    peeling_for_gaps (false),
817    peeling_for_niter (false),
818    no_data_dependencies (false),
819    has_mask_store (false),
820    scalar_loop_scaling (profile_probability::uninitialized ()),
821    scalar_loop (NULL),
822    orig_loop_info (NULL)
823{
824  /* CHECKME: We want to visit all BBs before their successors (except for
825     latch blocks, for which this assertion wouldn't hold).  In the simple
826     case of the loop forms we allow, a dfs order of the BBs would be the same
827     as reversed postorder traversal, so we are safe.  */
828
829  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
830					  bbs, loop->num_nodes, loop);
831  gcc_assert (nbbs == loop->num_nodes);
832
833  for (unsigned int i = 0; i < nbbs; i++)
834    {
835      basic_block bb = bbs[i];
836      gimple_stmt_iterator si;
837
838      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
839	{
840	  gimple *phi = gsi_stmt (si);
841	  gimple_set_uid (phi, 0);
842	  add_stmt (phi);
843	}
844
845      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
846	{
847	  gimple *stmt = gsi_stmt (si);
848	  gimple_set_uid (stmt, 0);
849	  add_stmt (stmt);
850	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
851	     third argument is the #pragma omp simd if (x) condition: when it is 0,
852	     the loop shouldn't be vectorized; when it is a non-zero constant, it
853	     should be vectorized normally; otherwise the loop is versioned, with the
854	     vectorized version used if the condition is non-zero at runtime.  */
855	  if (loop_in->simduid
856	      && is_gimple_call (stmt)
857	      && gimple_call_internal_p (stmt)
858	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
859	      && gimple_call_num_args (stmt) >= 3
860	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
861	      && (loop_in->simduid
862		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
863	    {
864	      tree arg = gimple_call_arg (stmt, 2);
865	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
866		simd_if_cond = arg;
867	      else
868		gcc_assert (integer_nonzerop (arg));
869	    }
870	}
871    }
872
873  epilogue_vinfos.create (6);
874}
875
876/* Free all levels of MASKS.  */
877
878void
879release_vec_loop_masks (vec_loop_masks *masks)
880{
881  rgroup_masks *rgm;
882  unsigned int i;
883  FOR_EACH_VEC_ELT (*masks, i, rgm)
884    rgm->masks.release ();
885  masks->release ();
886}
887
888/* Free all memory used by the _loop_vec_info, as well as all the
889   stmt_vec_info structs of all the stmts in the loop.  */
890
891_loop_vec_info::~_loop_vec_info ()
892{
893  free (bbs);
894
895  release_vec_loop_masks (&masks);
896  delete ivexpr_map;
897  delete scan_map;
898  epilogue_vinfos.release ();
899
900  loop->aux = NULL;
901}
902
903/* Return an invariant or register for EXPR and emit necessary
904   computations in the LOOP_VINFO loop preheader.  */
905
906tree
907cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
908{
909  if (is_gimple_reg (expr)
910      || is_gimple_min_invariant (expr))
911    return expr;
912
913  if (! loop_vinfo->ivexpr_map)
914    loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
915  tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
916  if (! cached)
917    {
918      gimple_seq stmts = NULL;
919      cached = force_gimple_operand (unshare_expr (expr),
920				     &stmts, true, NULL_TREE);
921      if (stmts)
922	{
923	  edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
924	  gsi_insert_seq_on_edge_immediate (e, stmts);
925	}
926    }
927  return cached;
928}
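
/* Usage sketch (illustrative): callers typically hand this function an
   invariant address or step expression, e.g.

     tree t = cse_and_gimplify_to_preheader (loop_vinfo, expr);

   The computation of EXPR is gimplified onto the preheader edge the first
   time it is seen; later calls with an equal expression return the cached
   SSA name instead of emitting the computation again.  */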
929
930/* Return true if we can use CMP_TYPE as the comparison type to produce
931   all masks required to mask LOOP_VINFO.  */
932
933static bool
934can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
935{
936  rgroup_masks *rgm;
937  unsigned int i;
938  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
939    if (rgm->mask_type != NULL_TREE
940	&& !direct_internal_fn_supported_p (IFN_WHILE_ULT,
941					    cmp_type, rgm->mask_type,
942					    OPTIMIZE_FOR_SPEED))
943      return false;
944  return true;
945}
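
/* For example (illustrative, assuming a target such as SVE that provides
   IFN_WHILE_ULT): with CMP_TYPE "unsigned int" and an rgroup mask type of
   VNx4BI, each mask is computed roughly as

     mask = .WHILE_ULT (index, limit);

   where bit I of MASK is set iff INDEX + I < LIMIT.  If any rgroup's mask
   type lacks WHILE_ULT support for CMP_TYPE, the function above rejects
   that comparison type.  */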
946
947/* Calculate the maximum number of scalars per iteration over all the
948   rgroups in LOOP_VINFO.  */
949
950static unsigned int
951vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
952{
953  unsigned int res = 1;
954  unsigned int i;
955  rgroup_masks *rgm;
956  FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
957    res = MAX (res, rgm->max_nscalars_per_iter);
958  return res;
959}
960
961/* Each statement in LOOP_VINFO can be masked where necessary.  Check
962   whether we can actually generate the masks required.  Return true if so,
963   storing the chosen comparison type in LOOP_VINFO_MASK_COMPARE_TYPE.  */
964
965static bool
966vect_verify_full_masking (loop_vec_info loop_vinfo)
967{
968  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
969  unsigned int min_ni_width;
970  unsigned int max_nscalars_per_iter
971    = vect_get_max_nscalars_per_iter (loop_vinfo);
972
973  /* Use a normal loop if there are no statements that need masking.
974     This only happens in rare degenerate cases: it means that the loop
975     has no loads, no stores, and no live-out values.  */
976  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
977    return false;
978
979  /* Get the maximum number of iterations that is representable
980     in the counter type.  */
981  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
982  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
983
984  /* Get a more refined estimate for the number of iterations.  */
985  widest_int max_back_edges;
986  if (max_loop_iterations (loop, &max_back_edges))
987    max_ni = wi::smin (max_ni, max_back_edges + 1);
988
989  /* Account for rgroup masks, in which each bit is replicated N times.  */
990  max_ni *= max_nscalars_per_iter;
991
992  /* Work out how many bits we need to represent the limit.  */
993  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
994
995  /* Find a scalar mode for which WHILE_ULT is supported.  */
996  opt_scalar_int_mode cmp_mode_iter;
997  tree cmp_type = NULL_TREE;
998  tree iv_type = NULL_TREE;
999  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1000  unsigned int iv_precision = UINT_MAX;
1001
1002  if (iv_limit != -1)
1003    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1004				      UNSIGNED);
1005
1006  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1007    {
1008      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1009      if (cmp_bits >= min_ni_width
1010	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1011	{
1012	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1013	  if (this_type
1014	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1015	    {
1016	      /* Although we could stop as soon as we find a valid mode,
1017		 there are at least two reasons why that's not always the
1018		 best choice:
1019
1020		 - An IV that's Pmode or wider is more likely to be reusable
1021		   in address calculations than an IV that's narrower than
1022		   Pmode.
1023
1024		 - Doing the comparison in IV_PRECISION or wider allows
1025		   a natural 0-based IV, whereas using a narrower comparison
1026		   type requires mitigations against wrap-around.
1027
1028		 Conversely, if the IV limit is variable, doing the comparison
1029		 in a wider type than the original type can introduce
1030		 unnecessary extensions, so picking the widest valid mode
1031		 is not always a good choice either.
1032
1033		 Here we prefer the first IV type that's Pmode or wider,
1034		 and the first comparison type that's IV_PRECISION or wider.
1035		 (The comparison type must be no wider than the IV type,
1036		 to avoid extensions in the vector loop.)
1037
1038		 ??? We might want to try continuing beyond Pmode for ILP32
1039		 targets if CMP_BITS < IV_PRECISION.  */
1040	      iv_type = this_type;
1041	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1042		cmp_type = this_type;
1043	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1044		break;
1045	    }
1046	}
1047    }
1048
1049  if (!cmp_type)
1050    return false;
1051
1052  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1053  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1054  return true;
1055}
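
/* Worked example (illustrative): if NITERSM1 has a 32-bit unsigned type,
   MAX_NI starts as 2^32; with MAX_NSCALARS_PER_ITER == 1 that value needs
   33 bits, so MIN_NI_WIDTH is 33 and only comparison modes of at least
   33 bits (in practice DImode) are considered, unless max_loop_iterations
   provides a smaller bound.  */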
1056
1057/* Calculate the cost of one scalar iteration of the loop.  */
1058static void
1059vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1060{
1061  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1062  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1063  int nbbs = loop->num_nodes, factor;
1064  int innerloop_iters, i;
1065
1066  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1067
1068  /* Gather costs for statements in the scalar loop.  */
1069
1070  /* FORNOW.  */
1071  innerloop_iters = 1;
1072  if (loop->inner)
1073    innerloop_iters = 50; /* FIXME */
1074
1075  for (i = 0; i < nbbs; i++)
1076    {
1077      gimple_stmt_iterator si;
1078      basic_block bb = bbs[i];
1079
1080      if (bb->loop_father == loop->inner)
1081        factor = innerloop_iters;
1082      else
1083        factor = 1;
1084
1085      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1086        {
1087	  gimple *stmt = gsi_stmt (si);
1088	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1089
1090          if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1091            continue;
1092
1093          /* Skip stmts that are not vectorized inside the loop.  */
1094	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1095          if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1096              && (!STMT_VINFO_LIVE_P (vstmt_info)
1097                  || !VECTORIZABLE_CYCLE_DEF
1098			(STMT_VINFO_DEF_TYPE (vstmt_info))))
1099            continue;
1100
1101	  vect_cost_for_stmt kind;
1102          if (STMT_VINFO_DATA_REF (stmt_info))
1103            {
1104              if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1105               kind = scalar_load;
1106             else
1107               kind = scalar_store;
1108            }
1109	  else if (vect_nop_conversion_p (stmt_info))
1110	    continue;
1111	  else
1112            kind = scalar_stmt;
1113
1114	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1115			    factor, kind, stmt_info, 0, vect_prologue);
1116        }
1117    }
1118
1119  /* Now accumulate cost.  */
1120  void *target_cost_data = init_cost (loop);
1121  stmt_info_for_cost *si;
1122  int j;
1123  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1124		    j, si)
1125    (void) add_stmt_cost (target_cost_data, si->count,
1126			  si->kind, si->stmt_info, si->misalign,
1127			  vect_body);
1128  unsigned dummy, body_cost = 0;
1129  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1130  destroy_cost_data (target_cost_data);
1131  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1132}
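
/* For example (illustrative, with made-up per-stmt costs of 1): a body
   containing one scalar_load, one scalar_store and one scalar_stmt yields a
   single-scalar-iteration cost of 3; stmts in an inner loop are weighted by
   the FORNOW factor of 50 above.  Real costs come from the target's
   add_stmt_cost hook.  */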
1133
1134
1135/* Function vect_analyze_loop_form_1.
1136
1137   Verify that certain CFG restrictions hold, including:
1138   - the loop has a pre-header
1139   - the loop has a single entry and exit
1140   - the loop exit condition is simple enough
1141   - the number of iterations can be analyzed, i.e., a countable loop.  The
1142     niter could be analyzed under some assumptions.  */
1143
1144opt_result
1145vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1146			  tree *assumptions, tree *number_of_iterationsm1,
1147			  tree *number_of_iterations, gcond **inner_loop_cond)
1148{
1149  DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1150
1151  /* Different restrictions apply when we are considering an inner-most loop,
1152     vs. an outer (nested) loop.
1153     (FORNOW. May want to relax some of these restrictions in the future).  */
1154
1155  if (!loop->inner)
1156    {
1157      /* Inner-most loop.  We currently require that the number of BBs is
1158	 exactly 2 (the header and latch).  Vectorizable inner-most loops
1159	 look like this:
1160
1161                        (pre-header)
1162                           |
1163                          header <--------+
1164                           | |            |
1165                           | +--> latch --+
1166                           |
1167                        (exit-bb)  */
1168
1169      if (loop->num_nodes != 2)
1170	return opt_result::failure_at (vect_location,
1171				       "not vectorized:"
1172				       " control flow in loop.\n");
1173
1174      if (empty_block_p (loop->header))
1175	return opt_result::failure_at (vect_location,
1176				       "not vectorized: empty loop.\n");
1177    }
1178  else
1179    {
1180      class loop *innerloop = loop->inner;
1181      edge entryedge;
1182
1183      /* Nested loop. We currently require that the loop is doubly-nested,
1184	 contains a single inner loop, and the number of BBs is exactly 5.
1185	 Vectorizable outer-loops look like this:
1186
1187			(pre-header)
1188			   |
1189			  header <---+
1190			   |         |
1191		          inner-loop |
1192			   |         |
1193			  tail ------+
1194			   |
1195		        (exit-bb)
1196
1197	 The inner-loop has the properties expected of inner-most loops
1198	 as described above.  */
1199
1200      if ((loop->inner)->inner || (loop->inner)->next)
1201	return opt_result::failure_at (vect_location,
1202				       "not vectorized:"
1203				       " multiple nested loops.\n");
1204
1205      if (loop->num_nodes != 5)
1206	return opt_result::failure_at (vect_location,
1207				       "not vectorized:"
1208				       " control flow in loop.\n");
1209
1210      entryedge = loop_preheader_edge (innerloop);
1211      if (entryedge->src != loop->header
1212	  || !single_exit (innerloop)
1213	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1214	return opt_result::failure_at (vect_location,
1215				       "not vectorized:"
1216				       " unsupported outerloop form.\n");
1217
1218      /* Analyze the inner-loop.  */
1219      tree inner_niterm1, inner_niter, inner_assumptions;
1220      opt_result res
1221	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1222				    &inner_assumptions, &inner_niterm1,
1223				    &inner_niter, NULL);
1224      if (!res)
1225	{
1226	  if (dump_enabled_p ())
1227	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228			     "not vectorized: Bad inner loop.\n");
1229	  return res;
1230	}
1231
1232      /* Don't support analyzing niter under assumptions for inner
1233	 loop.  */
1234      if (!integer_onep (inner_assumptions))
1235	return opt_result::failure_at (vect_location,
1236				       "not vectorized: Bad inner loop.\n");
1237
1238      if (!expr_invariant_in_loop_p (loop, inner_niter))
1239	return opt_result::failure_at (vect_location,
1240				       "not vectorized: inner-loop count not"
1241				       " invariant.\n");
1242
1243      if (dump_enabled_p ())
1244        dump_printf_loc (MSG_NOTE, vect_location,
1245			 "Considering outer-loop vectorization.\n");
1246    }
1247
1248  if (!single_exit (loop))
1249    return opt_result::failure_at (vect_location,
1250				   "not vectorized: multiple exits.\n");
1251  if (EDGE_COUNT (loop->header->preds) != 2)
1252    return opt_result::failure_at (vect_location,
1253				   "not vectorized:"
1254				   " too many incoming edges.\n");
1255
1256  /* We assume that the loop exit condition is at the end of the loop, i.e.,
1257     that the loop is represented as a do-while (with a proper if-guard
1258     before the loop if needed), where the loop header contains all the
1259     executable statements, and the latch is empty.  */
1260  if (!empty_block_p (loop->latch)
1261      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1262    return opt_result::failure_at (vect_location,
1263				   "not vectorized: latch block not empty.\n");
1264
1265  /* Make sure the exit is not abnormal.  */
1266  edge e = single_exit (loop);
1267  if (e->flags & EDGE_ABNORMAL)
1268    return opt_result::failure_at (vect_location,
1269				   "not vectorized:"
1270				   " abnormal loop exit edge.\n");
1271
1272  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1273				     number_of_iterationsm1);
1274  if (!*loop_cond)
1275    return opt_result::failure_at
1276      (vect_location,
1277       "not vectorized: complicated exit condition.\n");
1278
1279  if (integer_zerop (*assumptions)
1280      || !*number_of_iterations
1281      || chrec_contains_undetermined (*number_of_iterations))
1282    return opt_result::failure_at
1283      (*loop_cond,
1284       "not vectorized: number of iterations cannot be computed.\n");
1285
1286  if (integer_zerop (*number_of_iterations))
1287    return opt_result::failure_at
1288      (*loop_cond,
1289       "not vectorized: number of iterations = 0.\n");
1290
1291  return opt_result::success ();
1292}
1293
1294/* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1295
1296opt_loop_vec_info
1297vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1298{
1299  tree assumptions, number_of_iterations, number_of_iterationsm1;
1300  gcond *loop_cond, *inner_loop_cond = NULL;
1301
1302  opt_result res
1303    = vect_analyze_loop_form_1 (loop, &loop_cond,
1304				&assumptions, &number_of_iterationsm1,
1305				&number_of_iterations, &inner_loop_cond);
1306  if (!res)
1307    return opt_loop_vec_info::propagate_failure (res);
1308
1309  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1310  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313  if (!integer_onep (assumptions))
1314    {
1315      /* We consider vectorizing this loop by versioning it under
1316	 some assumptions.  In order to do this, we need to clear
1317	 existing information computed by scev and niter analyzer.  */
1318      scev_reset_htab ();
1319      free_numbers_of_iterations_estimates (loop);
1320      /* Also set a flag for this loop so that the following scev and niter
1321	 analyses are done under the assumptions.  */
1322      loop_constraint_set (loop, LOOP_C_FINITE);
1323      /* Also record the assumptions for versioning.  */
1324      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1325    }
1326
1327  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1328    {
1329      if (dump_enabled_p ())
1330        {
1331          dump_printf_loc (MSG_NOTE, vect_location,
1332			   "Symbolic number of iterations is ");
1333	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1334          dump_printf (MSG_NOTE, "\n");
1335        }
1336    }
1337
1338  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1339  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1340  if (inner_loop_cond)
1341    {
1342      stmt_vec_info inner_loop_cond_info
1343	= loop_vinfo->lookup_stmt (inner_loop_cond);
1344      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1345    }
1346
1347  gcc_assert (!loop->aux);
1348  loop->aux = loop_vinfo;
1349  return opt_loop_vec_info::success (loop_vinfo);
1350}
1351
1352
1353
1354/* Scan the loop stmts and, depending on whether there are any (non-)SLP
1355   statements, update the vectorization factor.  */
1356
1357static void
1358vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1359{
1360  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1361  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1362  int nbbs = loop->num_nodes;
1363  poly_uint64 vectorization_factor;
1364  int i;
1365
1366  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1367
1368  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1369  gcc_assert (known_ne (vectorization_factor, 0U));
1370
1371  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1372     vectorization factor of the loop is the unrolling factor required by
1373     the SLP instances.  If that unrolling factor is 1, we say that we
1374     perform pure SLP on the loop - cross-iteration parallelism is not
1375     exploited.  */
1376  bool only_slp_in_loop = true;
1377  for (i = 0; i < nbbs; i++)
1378    {
1379      basic_block bb = bbs[i];
1380      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1381	   gsi_next (&si))
1382	{
1383	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1384	  if (!stmt_info)
1385	    continue;
1386	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1387	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1388	      && !PURE_SLP_STMT (stmt_info))
1389	    /* STMT needs both SLP and loop-based vectorization.  */
1390	    only_slp_in_loop = false;
1391	}
1392      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1393	   gsi_next (&si))
1394	{
1395	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1396	  stmt_info = vect_stmt_to_vectorize (stmt_info);
1397	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
1398	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1399	      && !PURE_SLP_STMT (stmt_info))
1400	    /* STMT needs both SLP and loop-based vectorization.  */
1401	    only_slp_in_loop = false;
1402	}
1403    }
1404
1405  if (only_slp_in_loop)
1406    {
1407      if (dump_enabled_p ())
1408	dump_printf_loc (MSG_NOTE, vect_location,
1409			 "Loop contains only SLP stmts\n");
1410      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1411    }
1412  else
1413    {
1414      if (dump_enabled_p ())
1415	dump_printf_loc (MSG_NOTE, vect_location,
1416			 "Loop contains SLP and non-SLP stmts\n");
1417      /* Both the vectorization factor and unroll factor have the form
1418	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1419	 so they must have a common multiple.  */
1420      vectorization_factor
1421	= force_common_multiple (vectorization_factor,
1422				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1423    }
1424
1425  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1426  if (dump_enabled_p ())
1427    {
1428      dump_printf_loc (MSG_NOTE, vect_location,
1429		       "Updating vectorization factor to ");
1430      dump_dec (MSG_NOTE, vectorization_factor);
1431      dump_printf (MSG_NOTE, ".\n");
1432    }
1433}
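
/* For example (illustrative): if loop-based analysis chose a vectorization
   factor of 2 but an SLP instance requires an unrolling factor of 4, the
   combined factor is their least common multiple, 4; a pure-SLP loop simply
   takes the SLP unrolling factor directly.  */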
1434
1435/* Return true if STMT_INFO describes a double reduction phi and if
1436   the other phi in the reduction is also relevant for vectorization.
1437   This rejects cases such as:
1438
1439      outer1:
1440	x_1 = PHI <x_3(outer2), ...>;
1441	...
1442
1443      inner:
1444	x_2 = ...;
1445	...
1446
1447      outer2:
1448	x_3 = PHI <x_2(inner)>;
1449
1450   if nothing in x_2 or elsewhere makes x_1 relevant.  */
1451
1452static bool
1453vect_active_double_reduction_p (stmt_vec_info stmt_info)
1454{
1455  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1456    return false;
1457
1458  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1459}
1460
1461/* Function vect_analyze_loop_operations.
1462
1463   Scan the loop stmts and make sure they are all vectorizable.  */
1464
1465static opt_result
1466vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1467{
1468  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1469  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1470  int nbbs = loop->num_nodes;
1471  int i;
1472  stmt_vec_info stmt_info;
1473  bool need_to_vectorize = false;
1474  bool ok;
1475
1476  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1477
1478  auto_vec<stmt_info_for_cost> cost_vec;
1479
1480  for (i = 0; i < nbbs; i++)
1481    {
1482      basic_block bb = bbs[i];
1483
1484      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1485	   gsi_next (&si))
1486        {
1487          gphi *phi = si.phi ();
1488          ok = true;
1489
1490	  stmt_info = loop_vinfo->lookup_stmt (phi);
1491          if (dump_enabled_p ())
1492	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1493	  if (virtual_operand_p (gimple_phi_result (phi)))
1494	    continue;
1495
1496          /* Inner-loop loop-closed exit phi in outer-loop vectorization
1497             (i.e., a phi in the tail of the outer-loop).  */
1498          if (! is_loop_header_bb_p (bb))
1499            {
1500              /* FORNOW: we currently don't support the case that these phis
1501                 are not used in the outerloop (unless it is a double reduction,
1502                 i.e., this phi is vect_reduction_def), because this case
1503                 requires us to actually do something here.  */
1504              if (STMT_VINFO_LIVE_P (stmt_info)
1505		  && !vect_active_double_reduction_p (stmt_info))
1506		return opt_result::failure_at (phi,
1507					       "Unsupported loop-closed phi"
1508					       " in outer-loop.\n");
1509
1510              /* If PHI is used in the outer loop, we check that its operand
1511                 is defined in the inner loop.  */
1512              if (STMT_VINFO_RELEVANT_P (stmt_info))
1513                {
1514                  tree phi_op;
1515
1516                  if (gimple_phi_num_args (phi) != 1)
1517                    return opt_result::failure_at (phi, "unsupported phi");
1518
1519                  phi_op = PHI_ARG_DEF (phi, 0);
1520		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1521		  if (!op_def_info)
1522		    return opt_result::failure_at (phi, "unsupported phi\n");
1523
1524		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1525		      && (STMT_VINFO_RELEVANT (op_def_info)
1526			  != vect_used_in_outer_by_reduction))
1527		    return opt_result::failure_at (phi, "unsupported phi\n");
1528
1529		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1530		       || (STMT_VINFO_DEF_TYPE (stmt_info)
1531			   == vect_double_reduction_def))
1532		      && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1533		    return opt_result::failure_at (phi, "unsupported phi\n");
1534                }
1535
1536              continue;
1537            }
1538
1539          gcc_assert (stmt_info);
1540
1541          if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1542               || STMT_VINFO_LIVE_P (stmt_info))
1543              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1544	    /* A scalar-dependence cycle that we don't support.  */
1545	    return opt_result::failure_at (phi,
1546					   "not vectorized:"
1547					   " scalar dependence cycle.\n");
1548
1549          if (STMT_VINFO_RELEVANT_P (stmt_info))
1550            {
1551              need_to_vectorize = true;
1552              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1553		  && ! PURE_SLP_STMT (stmt_info))
1554		ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1555					     &cost_vec);
1556	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1557			|| (STMT_VINFO_DEF_TYPE (stmt_info)
1558			    == vect_double_reduction_def)
1559			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1560		       && ! PURE_SLP_STMT (stmt_info))
1561		ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1562            }
1563
1564	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1565	  if (ok
1566	      && STMT_VINFO_LIVE_P (stmt_info)
1567	      && !PURE_SLP_STMT (stmt_info))
1568	    ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1569					      -1, false, &cost_vec);
1570
1571          if (!ok)
1572	    return opt_result::failure_at (phi,
1573					   "not vectorized: relevant phi not "
1574					   "supported: %G",
1575					   static_cast <gimple *> (phi));
1576        }
1577
1578      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1579	   gsi_next (&si))
1580        {
1581	  gimple *stmt = gsi_stmt (si);
1582	  if (!gimple_clobber_p (stmt))
1583	    {
1584	      opt_result res
1585		= vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1586				     &need_to_vectorize,
1587				     NULL, NULL, &cost_vec);
1588	      if (!res)
1589		return res;
1590	    }
1591        }
1592    } /* bbs */
1593
1594  add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1595
  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
1601  if (!need_to_vectorize)
1602    {
1603      if (dump_enabled_p ())
1604        dump_printf_loc (MSG_NOTE, vect_location,
1605			 "All the computation can be taken out of the loop.\n");
1606      return opt_result::failure_at
1607	(vect_location,
1608	 "not vectorized: redundant loop. no profit to vectorize.\n");
1609    }
1610
1611  return opt_result::success ();
1612}
1613
1614/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1615   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1616   definitely no, or -1 if it's worth retrying.  */
1617
1618static int
1619vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1620{
1621  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1622  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1623
1624  /* Only fully-masked loops can have iteration counts less than the
1625     vectorization factor.  */
1626  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1627    {
1628      HOST_WIDE_INT max_niter;
1629
1630      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1631	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1632      else
1633	max_niter = max_stmt_executions_int (loop);
1634
1635      if (max_niter != -1
1636	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1637	{
1638	  if (dump_enabled_p ())
1639	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1640			     "not vectorized: iteration count smaller than "
1641			     "vectorization factor.\n");
1642	  return 0;
1643	}
1644    }
1645
1646  int min_profitable_iters, min_profitable_estimate;
1647  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1648				      &min_profitable_estimate);
1649
1650  if (min_profitable_iters < 0)
1651    {
1652      if (dump_enabled_p ())
1653	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1654			 "not vectorized: vectorization not profitable.\n");
1655      if (dump_enabled_p ())
1656	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657			 "not vectorized: vector version will never be "
1658			 "profitable.\n");
1659      return -1;
1660    }
1661
1662  int min_scalar_loop_bound = (param_min_vect_loop_bound
1663			       * assumed_vf);
1664
  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */
1667  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1668				    min_profitable_iters);
1669
1670  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1671
1672  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1673      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1674    {
1675      if (dump_enabled_p ())
1676	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677			 "not vectorized: vectorization not profitable.\n");
1678      if (dump_enabled_p ())
1679	dump_printf_loc (MSG_NOTE, vect_location,
1680			 "not vectorized: iteration count smaller than user "
1681			 "specified loop bound parameter or minimum profitable "
1682			 "iterations (whichever is more conservative).\n");
1683      return 0;
1684    }
1685
  /* The static profitability threshold min_profitable_estimate includes
1687     the cost of having to check at runtime whether the scalar loop
1688     should be used instead.  If it turns out that we don't need or want
1689     such a check, the threshold we should use for the static estimate
1690     is simply the point at which the vector loop becomes more profitable
1691     than the scalar loop.  */
1692  if (min_profitable_estimate > min_profitable_iters
1693      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1694      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1695      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1696      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1697    {
1698      if (dump_enabled_p ())
1699	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1700			 " choice between the scalar and vector loops\n");
1701      min_profitable_estimate = min_profitable_iters;
1702    }
1703
1704  HOST_WIDE_INT estimated_niter;
1705
1706  /* If we are vectorizing an epilogue then we know the maximum number of
1707     scalar iterations it will cover is at least one lower than the
1708     vectorization factor of the main loop.  */
1709  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1710    estimated_niter
1711      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1712  else
1713    {
1714      estimated_niter = estimated_stmt_executions_int (loop);
1715      if (estimated_niter == -1)
1716	estimated_niter = likely_max_stmt_executions_int (loop);
1717    }
1718  if (estimated_niter != -1
1719      && ((unsigned HOST_WIDE_INT) estimated_niter
1720	  < MAX (th, (unsigned) min_profitable_estimate)))
1721    {
1722      if (dump_enabled_p ())
1723	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1724			 "not vectorized: estimated iteration count too "
1725			 "small.\n");
1726      if (dump_enabled_p ())
1727	dump_printf_loc (MSG_NOTE, vect_location,
1728			 "not vectorized: estimated iteration count smaller "
1729			 "than specified loop bound parameter or minimum "
1730			 "profitable iterations (whichever is more "
1731			 "conservative).\n");
1732      return -1;
1733    }
1734
1735  return 1;
1736}
1737
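/* Walk all the statements in the basic blocks BBS of LOOP, counting the
   non-debug statements in *N_STMTS and collecting the data references of
   the loop into DATAREFS.  Calls to "#pragma omp declare simd" clones
   without data references in the call statement itself are tolerated when
   the loop has a safelen.  Fail if a data reference cannot be analyzed or
   if the number of data references exceeds
   param_loop_max_datarefs_for_datadeps.  */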
1738static opt_result
1739vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1740			   vec<data_reference_p> *datarefs,
1741			   unsigned int *n_stmts)
1742{
1743  *n_stmts = 0;
1744  for (unsigned i = 0; i < loop->num_nodes; i++)
1745    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1746	 !gsi_end_p (gsi); gsi_next (&gsi))
1747      {
1748	gimple *stmt = gsi_stmt (gsi);
1749	if (is_gimple_debug (stmt))
1750	  continue;
1751	++(*n_stmts);
1752	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1753	if (!res)
1754	  {
1755	    if (is_gimple_call (stmt) && loop->safelen)
1756	      {
1757		tree fndecl = gimple_call_fndecl (stmt), op;
1758		if (fndecl != NULL_TREE)
1759		  {
1760		    cgraph_node *node = cgraph_node::get (fndecl);
1761		    if (node != NULL && node->simd_clones != NULL)
1762		      {
1763			unsigned int j, n = gimple_call_num_args (stmt);
1764			for (j = 0; j < n; j++)
1765			  {
1766			    op = gimple_call_arg (stmt, j);
1767			    if (DECL_P (op)
1768				|| (REFERENCE_CLASS_P (op)
1769				    && get_base_address (op)))
1770			      break;
1771			  }
1772			op = gimple_call_lhs (stmt);
1773			/* Ignore #pragma omp declare simd functions
1774			   if they don't have data references in the
1775			   call stmt itself.  */
1776			if (j == n
1777			    && !(op
1778				 && (DECL_P (op)
1779				     || (REFERENCE_CLASS_P (op)
1780					 && get_base_address (op)))))
1781			  continue;
1782		      }
1783		  }
1784	      }
1785	    return res;
1786	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs, stop here and fail fatally.  */
1789	if (datarefs->length ()
1790	    > (unsigned)param_loop_max_datarefs_for_datadeps)
1791	  return opt_result::failure_at (stmt, "exceeded param "
1792					 "loop-max-datarefs-for-datadeps\n");
1793      }
1794  return opt_result::success ();
1795}
1796
1797/* Look for SLP-only access groups and turn each individual access into its own
1798   group.  */
1799static void
1800vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1801{
1802  unsigned int i;
1803  struct data_reference *dr;
1804
1805  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1806
1807  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1808  FOR_EACH_VEC_ELT (datarefs, i, dr)
1809    {
1810      gcc_assert (DR_REF (dr));
1811      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1812
      /* Check if the access is part of an interleaving chain.  */
1814      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1815	{
1816	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1817	  unsigned int group_size = DR_GROUP_SIZE (first_element);
1818
	  /* Check if this is an SLP-only group.  */
1820	  if (!STMT_SLP_TYPE (stmt_info)
1821	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
1822	    {
1823	      /* Dissolve the group.  */
1824	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1825
1826	      stmt_vec_info vinfo = first_element;
1827	      while (vinfo)
1828		{
1829		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1830		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1831		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1832		  DR_GROUP_SIZE (vinfo) = 1;
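		  /* Each access is now a group of its own; for non-strided
		     accesses record the remaining members of the original
		     group as the gap so the original access stride is
		     preserved.  */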
1833		  if (STMT_VINFO_STRIDED_P (first_element))
1834		    DR_GROUP_GAP (vinfo) = 0;
1835		  else
1836		    DR_GROUP_GAP (vinfo) = group_size - 1;
1837		  vinfo = next;
1838		}
1839	    }
1840	}
1841    }
1842}
1843
1844
/* Decides whether we need to create an epilogue loop to handle the
   remaining scalar iterations and sets LOOP_VINFO_PEELING_FOR_NITER
   accordingly.  */
1847
1848void
1849determine_peel_for_niter (loop_vec_info loop_vinfo)
1850{
1851  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1852
1853  unsigned HOST_WIDE_INT const_vf;
1854  HOST_WIDE_INT max_niter
1855    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1856
1857  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1858  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1859    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1860					  (loop_vinfo));
1861
1862  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1863    /* The main loop handles all iterations.  */
1864    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1866	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1867    {
1868      /* Work out the (constant) number of iterations that need to be
1869	 peeled for reasons other than niters.  */
1870      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1871      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1872	peel_niter += 1;
1873      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1874		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1875	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1876    }
1877  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1878	   /* ??? When peeling for gaps but not alignment, we could
1879	      try to check whether the (variable) niters is known to be
1880	      VF * N + 1.  That's something of a niche case though.  */
1881	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1882	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1883	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1884		< (unsigned) exact_log2 (const_vf))
1885	       /* In case of versioning, check if the maximum number of
1886		  iterations is greater than th.  If they are identical,
1887		  the epilogue is unnecessary.  */
1888	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1889		   || ((unsigned HOST_WIDE_INT) max_niter
1890		       > (th / const_vf) * const_vf))))
1891    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1892}
1893
1894
1895/* Function vect_analyze_loop_2.
1896
1897   Apply a set of analyses on LOOP, and create a loop_vec_info struct
1898   for it.  The different analyses will record information in the
1899   loop_vec_info struct.  */
1900static opt_result
1901vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1902{
1903  opt_result ok = opt_result::success ();
1904  int res;
1905  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1906  poly_uint64 min_vf = 2;
1907  loop_vec_info orig_loop_vinfo = NULL;
1908
1909  /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1910     loop_vec_info of the first vectorized loop.  */
1911  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1912    orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1913  else
1914    orig_loop_vinfo = loop_vinfo;
1915  gcc_assert (orig_loop_vinfo);
1916
1917  /* The first group of checks is independent of the vector size.  */
1918  fatal = true;
1919
1920  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1921      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1922    return opt_result::failure_at (vect_location,
1923				   "not vectorized: simd if(0)\n");
1924
1925  /* Find all data references in the loop (which correspond to vdefs/vuses)
1926     and analyze their evolution in the loop.  */
1927
1928  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1929
1930  /* Gather the data references and count stmts in the loop.  */
1931  if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1932    {
1933      opt_result res
1934	= vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1935				     &LOOP_VINFO_DATAREFS (loop_vinfo),
1936				     n_stmts);
1937      if (!res)
1938	{
1939	  if (dump_enabled_p ())
1940	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941			     "not vectorized: loop contains function "
1942			     "calls or data references that cannot "
1943			     "be analyzed\n");
1944	  return res;
1945	}
1946      loop_vinfo->shared->save_datarefs ();
1947    }
1948  else
1949    loop_vinfo->shared->check_datarefs ();
1950
1951  /* Analyze the data references and also adjust the minimal
1952     vectorization factor according to the loads and stores.  */
1953
1954  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1955  if (!ok)
1956    {
1957      if (dump_enabled_p ())
1958	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1959			 "bad data references.\n");
1960      return ok;
1961    }
1962
1963  /* Classify all cross-iteration scalar data-flow cycles.
1964     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1965  vect_analyze_scalar_cycles (loop_vinfo);
1966
1967  vect_pattern_recog (loop_vinfo);
1968
1969  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1970
1971  /* Analyze the access patterns of the data-refs in the loop (consecutive,
1972     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1973
1974  ok = vect_analyze_data_ref_accesses (loop_vinfo);
1975  if (!ok)
1976    {
1977      if (dump_enabled_p ())
1978	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1979			 "bad data access.\n");
1980      return ok;
1981    }
1982
1983  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1984
1985  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1986  if (!ok)
1987    {
1988      if (dump_enabled_p ())
1989	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990			 "unexpected pattern.\n");
1991      return ok;
1992    }
1993
  /* The rest of the analysis below depends on the vector size in some
     way, so failures from here on are no longer fatal and are worth
     retrying with a different vector mode.  */
1995  fatal = false;
1996
1997  /* Analyze data dependences between the data-refs in the loop
1998     and adjust the maximum vectorization factor according to
1999     the dependences.
2000     FORNOW: fail at the first data dependence that we encounter.  */
2001
2002  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2003  if (!ok)
2004    {
2005      if (dump_enabled_p ())
2006	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2007			 "bad data dependence.\n");
2008      return ok;
2009    }
2010  if (max_vf != MAX_VECTORIZATION_FACTOR
2011      && maybe_lt (max_vf, min_vf))
2012    return opt_result::failure_at (vect_location, "bad data dependence.\n");
2013  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2014
2015  ok = vect_determine_vectorization_factor (loop_vinfo);
2016  if (!ok)
2017    {
2018      if (dump_enabled_p ())
2019	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020			 "can't determine vectorization factor.\n");
2021      return ok;
2022    }
2023  if (max_vf != MAX_VECTORIZATION_FACTOR
2024      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2025    return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026
2027  /* Compute the scalar iteration cost.  */
2028  vect_compute_single_scalar_iteration_cost (loop_vinfo);
2029
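  /* Remember the vectorization factor determined so far; it is restored
     if we have to re-run the analysis with SLP disabled (see the "again"
     path below).  */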
2030  poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2031
2032  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2033  ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2034  if (!ok)
2035    return ok;
2036
2037  /* If there are any SLP instances mark them as pure_slp.  */
2038  bool slp = vect_make_slp_decision (loop_vinfo);
2039  if (slp)
2040    {
2041      /* Find stmts that need to be both vectorized and SLPed.  */
2042      vect_detect_hybrid_slp (loop_vinfo);
2043
2044      /* Update the vectorization factor based on the SLP decision.  */
2045      vect_update_vf_for_slp (loop_vinfo);
2046    }
2047
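  /* Likewise remember whether full masking was still a possibility, so
     that the flag can be restored when rolling back to a non-SLP
     analysis.  */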
2048  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2049
2050  /* We don't expect to have to roll back to anything other than an empty
2051     set of rgroups.  */
2052  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2053
2054  /* This is the point where we can re-start analysis with SLP forced off.  */
2055start_over:
2056
2057  /* Now the vectorization factor is final.  */
2058  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059  gcc_assert (known_ne (vectorization_factor, 0U));
2060
2061  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2062    {
2063      dump_printf_loc (MSG_NOTE, vect_location,
2064		       "vectorization_factor = ");
2065      dump_dec (MSG_NOTE, vectorization_factor);
2066      dump_printf (MSG_NOTE, ", niters = %wd\n",
2067		   LOOP_VINFO_INT_NITERS (loop_vinfo));
2068    }
2069
2070  /* Analyze the alignment of the data-refs in the loop.
2071     Fail if a data reference is found that cannot be vectorized.  */
2072
2073  ok = vect_analyze_data_refs_alignment (loop_vinfo);
2074  if (!ok)
2075    {
2076      if (dump_enabled_p ())
2077	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078			 "bad data alignment.\n");
2079      return ok;
2080    }
2081
2082  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2083     It is important to call pruning after vect_analyze_data_ref_accesses,
2084     since we use grouping information gathered by interleaving analysis.  */
2085  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2086  if (!ok)
2087    return ok;
2088
  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization, since we do not want to add extra peeling or
     versioning for alignment.  */
2092  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2093    /* This pass will decide on using loop versioning and/or loop peeling in
2094       order to enhance the alignment of data references in the loop.  */
2095    ok = vect_enhance_data_refs_alignment (loop_vinfo);
2096  else
2097    ok = vect_verify_datarefs_alignment (loop_vinfo);
2098  if (!ok)
2099    return ok;
2100
2101  if (slp)
2102    {
2103      /* Analyze operations in the SLP instances.  Note this may
2104	 remove unsupported SLP instances which makes the above
2105	 SLP kind detection invalid.  */
2106      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2107      vect_slp_analyze_operations (loop_vinfo);
2108      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2109	{
2110	  ok = opt_result::failure_at (vect_location,
2111				       "unsupported SLP instances\n");
2112	  goto again;
2113	}
2114    }
2115
2116  /* Dissolve SLP-only groups.  */
2117  vect_dissolve_slp_only_groups (loop_vinfo);
2118
2119  /* Scan all the remaining operations in the loop that are not subject
2120     to SLP and make sure they are vectorizable.  */
2121  ok = vect_analyze_loop_operations (loop_vinfo);
2122  if (!ok)
2123    {
2124      if (dump_enabled_p ())
2125	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2126			 "bad operation or unsupported loop bound.\n");
2127      return ok;
2128    }
2129
2130  /* Decide whether to use a fully-masked loop for this vectorization
2131     factor.  */
2132  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2133    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2134       && vect_verify_full_masking (loop_vinfo));
2135  if (dump_enabled_p ())
2136    {
2137      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2138	dump_printf_loc (MSG_NOTE, vect_location,
2139			 "using a fully-masked loop.\n");
2140      else
2141	dump_printf_loc (MSG_NOTE, vect_location,
2142			 "not using a fully-masked loop.\n");
2143    }
2144
  /* If an epilog loop is required because of data accesses with gaps,
     one additional iteration needs to be peeled.  Check if there are
     enough iterations for vectorization.  */
2148  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2149      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2150      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2151    {
2152      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2153      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2154
2155      if (known_lt (wi::to_widest (scalar_niters), vf))
2156	return opt_result::failure_at (vect_location,
				       "loop has not enough iterations to"
2158				       " support peeling for gaps.\n");
2159    }
2160
2161  /* If we're vectorizing an epilogue loop, we either need a fully-masked
2162     loop or a loop that has a lower VF than the main loop.  */
2163  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2164      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2165      && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2166		   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2167    return opt_result::failure_at (vect_location,
2168				   "Vectorization factor too high for"
2169				   " epilogue loop.\n");
2170
2171  /* Check the costings of the loop make vectorizing worthwhile.  */
2172  res = vect_analyze_loop_costing (loop_vinfo);
2173  if (res < 0)
2174    {
2175      ok = opt_result::failure_at (vect_location,
2176				   "Loop costings may not be worthwhile.\n");
2177      goto again;
2178    }
2179  if (!res)
2180    return opt_result::failure_at (vect_location,
2181				   "Loop costings not worthwhile.\n");
2182
2183  determine_peel_for_niter (loop_vinfo);
2184  /* If an epilogue loop is required make sure we can create one.  */
2185  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2186      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2187    {
2188      if (dump_enabled_p ())
2189        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2190      if (!vect_can_advance_ivs_p (loop_vinfo)
2191	  || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2192					   single_exit (LOOP_VINFO_LOOP
2193							 (loop_vinfo))))
2194        {
2195	  ok = opt_result::failure_at (vect_location,
2196				       "not vectorized: can't create required "
2197				       "epilog loop\n");
2198          goto again;
2199        }
2200    }
2201
  /* During peeling, we need to check that the number of loop iterations
     is enough for both the peeled prolog loop and the vector loop.  This
     check can be merged along with the threshold check of loop versioning,
     so increase the threshold for this case if necessary.
2206
2207     If we are analyzing an epilogue we still want to check what its
2208     versioning threshold would be.  If we decide to vectorize the epilogues we
2209     will want to use the lowest versioning threshold of all epilogues and main
2210     loop.  This will enable us to enter a vectorized epilogue even when
2211     versioning the loop.  We can't simply check whether the epilogue requires
2212     versioning though since we may have skipped some versioning checks when
2213     analyzing the epilogue.  For instance, checks for alias versioning will be
2214     skipped when dealing with epilogues as we assume we already checked them
2215     for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
2216  if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2217    {
2218      poly_uint64 niters_th = 0;
2219      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2220
2221      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2222	{
2223	  /* Niters for peeled prolog loop.  */
2224	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2225	    {
2226	      dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2227	      tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2228	      niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2229	    }
2230	  else
2231	    niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2232	}
2233
2234      /* Niters for at least one iteration of vectorized loop.  */
2235      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2236	niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2237      /* One additional iteration because of peeling for gap.  */
2238      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2239	niters_th += 1;
2240
2241      /*  Use the same condition as vect_transform_loop to decide when to use
2242	  the cost to determine a versioning threshold.  */
2243      if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2244	  && ordered_p (th, niters_th))
2245	niters_th = ordered_max (poly_uint64 (th), niters_th);
2246
2247      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2248    }
2249
2250  gcc_assert (known_eq (vectorization_factor,
2251			LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2252
2253  /* Ok to vectorize!  */
2254  return opt_result::success ();
2255
2256again:
2257  /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
2258  gcc_assert (!ok);
2259
2260  /* Try again with SLP forced off but if we didn't do any SLP there is
2261     no point in re-trying.  */
2262  if (!slp)
2263    return ok;
2264
2265  /* If there are reduction chains re-trying will fail anyway.  */
2266  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2267    return ok;
2268
2269  /* Likewise if the grouped loads or stores in the SLP cannot be handled
2270     via interleaving or lane instructions.  */
2271  slp_instance instance;
2272  slp_tree node;
2273  unsigned i, j;
2274  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2275    {
2276      stmt_vec_info vinfo;
2277      vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2278      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2279	continue;
2280      vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2281      unsigned int size = DR_GROUP_SIZE (vinfo);
2282      tree vectype = STMT_VINFO_VECTYPE (vinfo);
2283      if (! vect_store_lanes_supported (vectype, size, false)
2284	 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2285	 && ! vect_grouped_store_supported (vectype, size))
2286	return opt_result::failure_at (vinfo->stmt,
2287				       "unsupported grouped store\n");
2288      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2289	{
2290	  vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2291	  vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2292	  bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2293	  size = DR_GROUP_SIZE (vinfo);
2294	  vectype = STMT_VINFO_VECTYPE (vinfo);
2295	  if (! vect_load_lanes_supported (vectype, size, false)
2296	      && ! vect_grouped_load_supported (vectype, single_element_p,
2297						size))
2298	    return opt_result::failure_at (vinfo->stmt,
2299					   "unsupported grouped load\n");
2300	}
2301    }
2302
2303  if (dump_enabled_p ())
2304    dump_printf_loc (MSG_NOTE, vect_location,
2305		     "re-trying with SLP disabled\n");
2306
2307  /* Roll back state appropriately.  No SLP this time.  */
2308  slp = false;
2309  /* Restore vectorization factor as it were without SLP.  */
2310  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311  /* Free the SLP instances.  */
2312  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313    vect_free_slp_instance (instance, false);
2314  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315  /* Reset SLP type to loop_vect on all stmts.  */
2316  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2317    {
2318      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319      for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320	   !gsi_end_p (si); gsi_next (&si))
2321	{
2322	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2323	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2324	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2325	      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2326	    {
2327	      /* vectorizable_reduction adjusts reduction stmt def-types,
2328		 restore them to that of the PHI.  */
2329	      STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2330		= STMT_VINFO_DEF_TYPE (stmt_info);
2331	      STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2332					(STMT_VINFO_REDUC_DEF (stmt_info)))
2333		= STMT_VINFO_DEF_TYPE (stmt_info);
2334	    }
2335	}
2336      for (gimple_stmt_iterator si = gsi_start_bb (bb);
2337	   !gsi_end_p (si); gsi_next (&si))
2338	{
2339	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2340	  STMT_SLP_TYPE (stmt_info) = loop_vect;
2341	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2342	    {
2343	      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2344	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2345	      STMT_SLP_TYPE (stmt_info) = loop_vect;
2346	      for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2347		   !gsi_end_p (pi); gsi_next (&pi))
2348		STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2349		  = loop_vect;
2350	    }
2351	}
2352    }
2353  /* Free optimized alias test DDRS.  */
2354  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2355  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2356  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2357  /* Reset target cost data.  */
2358  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2359  LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2360    = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2361  /* Reset accumulated rgroup information.  */
2362  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2363  /* Reset assorted flags.  */
2364  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2365  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2366  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2367  LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2368  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2369
2370  goto start_over;
2371}
2372
2373/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2374   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
2375   OLD_LOOP_VINFO is better unless something specifically indicates
2376   otherwise.
2377
2378   Note that this deliberately isn't a partial order.  */
2379
2380static bool
2381vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2382			  loop_vec_info old_loop_vinfo)
2383{
2384  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2385  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2386
2387  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2388  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2389
2390  /* Always prefer a VF of loop->simdlen over any other VF.  */
2391  if (loop->simdlen)
2392    {
2393      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2394      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2395      if (new_simdlen_p != old_simdlen_p)
2396	return new_simdlen_p;
2397    }
2398
2399  /* Limit the VFs to what is likely to be the maximum number of iterations,
2400     to handle cases in which at least one loop_vinfo is fully-masked.  */
2401  HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2402  if (estimated_max_niter != -1)
2403    {
2404      if (known_le (estimated_max_niter, new_vf))
2405	new_vf = estimated_max_niter;
2406      if (known_le (estimated_max_niter, old_vf))
2407	old_vf = estimated_max_niter;
2408    }
2409
2410  /* Check whether the (fractional) cost per scalar iteration is lower
2411     or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf.  */
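  /* For example, with new_inside_cost = 8, new_vf = 4, old_inside_cost = 6
     and old_vf = 2 we compare 8 * 2 = 16 against 6 * 4 = 24, so the new
     loop_vinfo is cheaper per scalar iteration (2 units vs. 3 units).  */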
2412  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2413			     * poly_widest_int (old_vf));
2414  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2415			     * poly_widest_int (new_vf));
2416  if (maybe_lt (rel_old, rel_new))
2417    {
2418      /* When old_loop_vinfo uses a variable vectorization factor,
2419	 we know that it has a lower cost for at least one runtime VF.
2420	 However, we don't know how likely that VF is.
2421
2422	 One option would be to compare the costs for the estimated VFs.
2423	 The problem is that that can put too much pressure on the cost
2424	 model.  E.g. if the estimated VF is also the lowest possible VF,
2425	 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2426	 for the estimated VF, we'd then choose new_loop_vinfo even
2427	 though (a) new_loop_vinfo might not actually be better than
2428	 old_loop_vinfo for that VF and (b) it would be significantly
2429	 worse at larger VFs.
2430
2431	 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2432	 no more expensive than old_loop_vinfo even after doubling the
2433	 estimated old_loop_vinfo VF.  For all but trivial loops, this
2434	 ensures that we only pick new_loop_vinfo if it is significantly
2435	 better than old_loop_vinfo at the estimated VF.  */
2436      if (rel_new.is_constant ())
2437	return false;
2438
2439      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2440      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2441      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2442				      * widest_int (old_estimated_vf));
2443      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2444				      * widest_int (new_estimated_vf));
2445      return estimated_rel_new * 2 <= estimated_rel_old;
2446    }
2447  if (known_lt (rel_new, rel_old))
2448    return true;
2449
2450  /* If there's nothing to choose between the loop bodies, see whether
2451     there's a difference in the prologue and epilogue costs.  */
2452  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2453    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2454
2455  return false;
2456}
2457
2458/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
2459   true if we should.  */
2460
2461static bool
2462vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2463			loop_vec_info old_loop_vinfo)
2464{
2465  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2466    return false;
2467
2468  if (dump_enabled_p ())
2469    dump_printf_loc (MSG_NOTE, vect_location,
2470		     "***** Preferring vector mode %s to vector mode %s\n",
2471		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
2472		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
2473  return true;
2474}
2475
2476/* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
2477   try to reanalyze it as a main loop.  Return the loop_vinfo on success
2478   and null on failure.  */
2479
2480static loop_vec_info
2481vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2482{
2483  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2484    return loop_vinfo;
2485
2486  if (dump_enabled_p ())
2487    dump_printf_loc (MSG_NOTE, vect_location,
2488		     "***** Reanalyzing as a main loop with vector mode %s\n",
2489		     GET_MODE_NAME (loop_vinfo->vector_mode));
2490
2491  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2492  vec_info_shared *shared = loop_vinfo->shared;
2493  opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2494  gcc_assert (main_loop_vinfo);
2495
2496  main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2497
2498  bool fatal = false;
2499  bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2500  loop->aux = NULL;
2501  if (!res)
2502    {
2503      if (dump_enabled_p ())
2504	dump_printf_loc (MSG_NOTE, vect_location,
2505			 "***** Failed to analyze main loop with vector"
2506			 " mode %s\n",
2507			 GET_MODE_NAME (loop_vinfo->vector_mode));
2508      delete main_loop_vinfo;
2509      return NULL;
2510    }
2511  LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2512  return main_loop_vinfo;
2513}
2514
2515/* Function vect_analyze_loop.
2516
2517   Apply a set of analyses on LOOP, and create a loop_vec_info struct
2518   for it.  The different analyses will record information in the
2519   loop_vec_info struct.  */
2520opt_loop_vec_info
2521vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2522{
2523  auto_vector_modes vector_modes;
2524
2525  /* Autodetect first vector size we try.  */
2526  unsigned int autovec_flags
2527    = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2528						    loop->simdlen != 0);
2529  unsigned int mode_i = 0;
2530
2531  DUMP_VECT_SCOPE ("analyze_loop_nest");
2532
2533  if (loop_outer (loop)
2534      && loop_vec_info_for_loop (loop_outer (loop))
2535      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2536    return opt_loop_vec_info::failure_at (vect_location,
2537					  "outer-loop already vectorized.\n");
2538
2539  if (!find_loop_nest (loop, &shared->loop_nest))
2540    return opt_loop_vec_info::failure_at
2541      (vect_location,
2542       "not vectorized: loop nest containing two or more consecutive inner"
2543       " loops cannot be vectorized\n");
2544
2545  unsigned n_stmts = 0;
2546  machine_mode autodetected_vector_mode = VOIDmode;
2547  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2548  machine_mode next_vector_mode = VOIDmode;
2549  poly_uint64 lowest_th = 0;
2550  unsigned vectorized_loops = 0;
2551  bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2552			     && !unlimited_cost_model (loop));
2553
2554  bool vect_epilogues = false;
2555  opt_result res = opt_result::success ();
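  /* Cache the simdlen requirement locally; it is cleared once an analysis
     with the requested vectorization factor has succeeded.  */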
2556  unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2557  while (1)
2558    {
2559      /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2560      opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2561      if (!loop_vinfo)
2562	{
2563	  if (dump_enabled_p ())
2564	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565			     "bad loop form.\n");
2566	  gcc_checking_assert (first_loop_vinfo == NULL);
2567	  return loop_vinfo;
2568	}
2569      loop_vinfo->vector_mode = next_vector_mode;
2570
2571      bool fatal = false;
2572
2573      /* When pick_lowest_cost_p is true, we should in principle iterate
2574	 over all the loop_vec_infos that LOOP_VINFO could replace and
2575	 try to vectorize LOOP_VINFO under the same conditions.
2576	 E.g. when trying to replace an epilogue loop, we should vectorize
2577	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
2578	 to replace the main loop, we should vectorize LOOP_VINFO as a main
2579	 loop too.
2580
2581	 However, autovectorize_vector_modes is usually sorted as follows:
2582
2583	 - Modes that naturally produce lower VFs usually follow modes that
2584	   naturally produce higher VFs.
2585
2586	 - When modes naturally produce the same VF, maskable modes
2587	   usually follow unmaskable ones, so that the maskable mode
2588	   can be used to vectorize the epilogue of the unmaskable mode.
2589
2590	 This order is preferred because it leads to the maximum
2591	 epilogue vectorization opportunities.  Targets should only use
2592	 a different order if they want to make wide modes available while
2593	 disparaging them relative to earlier, smaller modes.  The assumption
2594	 in that case is that the wider modes are more expensive in some
2595	 way that isn't reflected directly in the costs.
2596
2597	 There should therefore be few interesting cases in which
2598	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2599	 treated as a standalone loop, and ends up being genuinely cheaper
2600	 than FIRST_LOOP_VINFO.  */
2601      if (vect_epilogues)
2602	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2603
2604      res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2605      if (mode_i == 0)
2606	autodetected_vector_mode = loop_vinfo->vector_mode;
2607      if (dump_enabled_p ())
2608	{
2609	  if (res)
2610	    dump_printf_loc (MSG_NOTE, vect_location,
2611			     "***** Analysis succeeded with vector mode %s\n",
2612			     GET_MODE_NAME (loop_vinfo->vector_mode));
2613	  else
2614	    dump_printf_loc (MSG_NOTE, vect_location,
2615			     "***** Analysis failed with vector mode %s\n",
2616			     GET_MODE_NAME (loop_vinfo->vector_mode));
2617	}
2618
2619      loop->aux = NULL;
2620
2621      if (!fatal)
2622	while (mode_i < vector_modes.length ()
2623	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2624	  {
2625	    if (dump_enabled_p ())
2626	      dump_printf_loc (MSG_NOTE, vect_location,
2627			       "***** The result for vector mode %s would"
2628			       " be the same\n",
2629			       GET_MODE_NAME (vector_modes[mode_i]));
2630	    mode_i += 1;
2631	  }
2632
2633      if (res)
2634	{
2635	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2636	  vectorized_loops++;
2637
2638	  /* Once we hit the desired simdlen for the first time,
2639	     discard any previous attempts.  */
2640	  if (simdlen
2641	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2642	    {
2643	      delete first_loop_vinfo;
2644	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2645	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2646	      simdlen = 0;
2647	    }
2648	  else if (pick_lowest_cost_p && first_loop_vinfo)
2649	    {
2650	      /* Keep trying to roll back vectorization attempts while the
2651		 loop_vec_infos they produced were worse than this one.  */
2652	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2653	      while (!vinfos.is_empty ()
2654		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2655		{
2656		  gcc_assert (vect_epilogues);
2657		  delete vinfos.pop ();
2658		}
2659	      if (vinfos.is_empty ()
2660		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2661		{
2662		  loop_vec_info main_loop_vinfo
2663		    = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
2664		  if (main_loop_vinfo == loop_vinfo)
2665		    {
2666		      delete first_loop_vinfo;
2667		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2668		    }
2669		  else if (main_loop_vinfo
2670			   && vect_joust_loop_vinfos (main_loop_vinfo,
2671						      first_loop_vinfo))
2672		    {
2673		      delete first_loop_vinfo;
2674		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
2675		      delete loop_vinfo;
2676		      loop_vinfo
2677			= opt_loop_vec_info::success (main_loop_vinfo);
2678		    }
2679		  else
2680		    delete main_loop_vinfo;
2681		}
2682	    }
2683
2684	  if (first_loop_vinfo == NULL)
2685	    {
2686	      first_loop_vinfo = loop_vinfo;
2687	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2688	    }
2689	  else if (vect_epilogues
2690		   /* For now only allow one epilogue loop.  */
2691		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
2692	    {
2693	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2694	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2695	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2696			  || maybe_ne (lowest_th, 0U));
2697	      /* Keep track of the known smallest versioning
2698		 threshold.  */
2699	      if (ordered_p (lowest_th, th))
2700		lowest_th = ordered_min (lowest_th, th);
2701	    }
2702	  else
2703	    delete loop_vinfo;
2704
	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
	     enabled, SIMDUID is not set, this is the innermost loop, the
	     main loop needs peeling for niters, and we have either already
	     found the loop's SIMDLEN or there was no SIMDLEN to begin with.
	     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
2710	  vect_epilogues = (!simdlen
2711			    && loop->inner == NULL
2712			    && param_vect_epilogues_nomask
2713			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2714			    && !loop->simduid
2715			    /* For now only allow one epilogue loop, but allow
2716			       pick_lowest_cost_p to replace it.  */
2717			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2718				|| pick_lowest_cost_p));
2719
2720	  /* Commit to first_loop_vinfo if we have no reason to try
2721	     alternatives.  */
2722	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2723	    break;
2724	}
2725      else
2726	{
2727	  delete loop_vinfo;
2728	  if (fatal)
2729	    {
2730	      gcc_checking_assert (first_loop_vinfo == NULL);
2731	      break;
2732	    }
2733	}
2734
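      /* Skip modes that the target maps back onto the autodetected mode
	 and vice versa; analyzing them would only repeat the autodetected
	 analysis.  */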
2735      if (mode_i < vector_modes.length ()
2736	  && VECTOR_MODE_P (autodetected_vector_mode)
2737	  && (related_vector_mode (vector_modes[mode_i],
2738				   GET_MODE_INNER (autodetected_vector_mode))
2739	      == autodetected_vector_mode)
2740	  && (related_vector_mode (autodetected_vector_mode,
2741				   GET_MODE_INNER (vector_modes[mode_i]))
2742	      == vector_modes[mode_i]))
2743	{
2744	  if (dump_enabled_p ())
2745	    dump_printf_loc (MSG_NOTE, vect_location,
2746			     "***** Skipping vector mode %s, which would"
2747			     " repeat the analysis for %s\n",
2748			     GET_MODE_NAME (vector_modes[mode_i]),
2749			     GET_MODE_NAME (autodetected_vector_mode));
2750	  mode_i += 1;
2751	}
2752
2753      if (mode_i == vector_modes.length ()
2754	  || autodetected_vector_mode == VOIDmode)
2755	break;
2756
      /* Try the next vector mode.  */
2758      next_vector_mode = vector_modes[mode_i++];
2759      if (dump_enabled_p ())
2760	dump_printf_loc (MSG_NOTE, vect_location,
2761			 "***** Re-trying analysis with vector mode %s\n",
2762			 GET_MODE_NAME (next_vector_mode));
2763    }
2764
2765  if (first_loop_vinfo)
2766    {
2767      loop->aux = (loop_vec_info) first_loop_vinfo;
2768      if (dump_enabled_p ())
2769	dump_printf_loc (MSG_NOTE, vect_location,
2770			 "***** Choosing vector mode %s\n",
2771			 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2772      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2773      return first_loop_vinfo;
2774    }
2775
2776  return opt_loop_vec_info::propagate_failure (res);
2777}
2778
2779/* Return true if there is an in-order reduction function for CODE, storing
2780   it in *REDUC_FN if so.  */
2781
2782static bool
2783fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2784{
2785  switch (code)
2786    {
2787    case PLUS_EXPR:
2788      *reduc_fn = IFN_FOLD_LEFT_PLUS;
2789      return true;
2790
2791    default:
2792      return false;
2793    }
2794}
2795
2796/* Function reduction_fn_for_scalar_code
2797
2798   Input:
   CODE - tree_code of the reduction operation.
2800
2801   Output:
2802   REDUC_FN - the corresponding internal function to be used to reduce the
2803      vector of partial results into a single scalar result, or IFN_LAST
2804      if the operation is a supported reduction operation, but does not have
2805      such an internal function.
2806
   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2808
2809static bool
2810reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2811{
2812  switch (code)
2813    {
2814      case MAX_EXPR:
2815        *reduc_fn = IFN_REDUC_MAX;
2816        return true;
2817
2818      case MIN_EXPR:
2819        *reduc_fn = IFN_REDUC_MIN;
2820        return true;
2821
2822      case PLUS_EXPR:
2823        *reduc_fn = IFN_REDUC_PLUS;
2824        return true;
2825
2826      case BIT_AND_EXPR:
2827	*reduc_fn = IFN_REDUC_AND;
2828	return true;
2829
2830      case BIT_IOR_EXPR:
2831	*reduc_fn = IFN_REDUC_IOR;
2832	return true;
2833
2834      case BIT_XOR_EXPR:
2835	*reduc_fn = IFN_REDUC_XOR;
2836	return true;
2837
2838      case MULT_EXPR:
2839      case MINUS_EXPR:
2840        *reduc_fn = IFN_LAST;
2841        return true;
2842
2843      default:
2844       return false;
2845    }
2846}
2847
/* If there is a neutral value X such that SLP reduction SLP_NODE would not
2849   be affected by the introduction of additional X elements, return that X,
2850   otherwise return null.  CODE is the code of the reduction and VECTOR_TYPE
2851   is the vector type that would hold element X.  REDUC_CHAIN is true if
2852   the SLP statements perform a single reduction, false if each statement
2853   performs an independent reduction.  */
2854
2855static tree
2856neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2857			      tree_code code, bool reduc_chain)
2858{
2859  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2860  stmt_vec_info stmt_vinfo = stmts[0];
2861  tree scalar_type = TREE_TYPE (vector_type);
2862  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2863  gcc_assert (loop);
2864
2865  switch (code)
2866    {
2867    case WIDEN_SUM_EXPR:
2868    case DOT_PROD_EXPR:
2869    case SAD_EXPR:
2870    case PLUS_EXPR:
2871    case MINUS_EXPR:
2872    case BIT_IOR_EXPR:
2873    case BIT_XOR_EXPR:
2874      return build_zero_cst (scalar_type);
2875
2876    case MULT_EXPR:
2877      return build_one_cst (scalar_type);
2878
2879    case BIT_AND_EXPR:
2880      return build_all_ones_cst (scalar_type);
2881
2882    case MAX_EXPR:
2883    case MIN_EXPR:
2884      /* For MIN/MAX the initial values are neutral.  A reduction chain
2885	 has only a single initial value, so that value is neutral for
2886	 all statements.  */
2887      if (reduc_chain)
2888	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2889				      loop_preheader_edge (loop));
2890      return NULL_TREE;
2891
2892    default:
2893      return NULL_TREE;
2894    }
2895}
2896
2897/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2898   STMT is printed with a message MSG. */
2899
2900static void
2901report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2902{
2903  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2904}
2905
/* Return true if we need an in-order (fold-left) reduction for operation
   CODE on type TYPE, i.e. if reassociating the operation could change
   the observable result.  */
2909
2910bool
2911needs_fold_left_reduction_p (tree type, tree_code code)
2912{
2913  /* CHECKME: check for !flag_finite_math_only too?  */
2914  if (SCALAR_FLOAT_TYPE_P (type))
2915    switch (code)
2916      {
2917      case MIN_EXPR:
2918      case MAX_EXPR:
2919	return false;
2920
2921      default:
2922	return !flag_associative_math;
2923      }
2924
2925  if (INTEGRAL_TYPE_P (type))
2926    {
2927      if (!operation_no_trapping_overflow (type, code))
2928	return true;
2929      return false;
2930    }
2931
2932  if (SAT_FIXED_POINT_TYPE_P (type))
2933    return true;
2934
2935  return false;
2936}
2937
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE and record the uses on the reduction path in PATH.  */
2941
2942static bool
2943check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2944		      tree loop_arg, enum tree_code *code,
2945		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2946{
2947  auto_bitmap visited;
2948  tree lookfor = PHI_RESULT (phi);
2949  ssa_op_iter curri;
2950  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2951  while (USE_FROM_PTR (curr) != loop_arg)
2952    curr = op_iter_next_use (&curri);
2953  curri.i = curri.numops;
2954  do
2955    {
2956      path.safe_push (std::make_pair (curri, curr));
2957      tree use = USE_FROM_PTR (curr);
2958      if (use == lookfor)
2959	break;
2960      gimple *def = SSA_NAME_DEF_STMT (use);
2961      if (gimple_nop_p (def)
2962	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2963	{
2964pop:
2965	  do
2966	    {
2967	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2968	      curri = x.first;
2969	      curr = x.second;
2970	      do
2971		curr = op_iter_next_use (&curri);
2972	      /* Skip already visited or non-SSA operands (from iterating
2973	         over PHI args).  */
2974	      while (curr != NULL_USE_OPERAND_P
2975		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2976			 || ! bitmap_set_bit (visited,
2977					      SSA_NAME_VERSION
2978					        (USE_FROM_PTR (curr)))));
2979	    }
2980	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2981	  if (curr == NULL_USE_OPERAND_P)
2982	    break;
2983	}
2984      else
2985	{
2986	  if (gimple_code (def) == GIMPLE_PHI)
2987	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2988	  else
2989	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2990	  while (curr != NULL_USE_OPERAND_P
2991		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2992		     || ! bitmap_set_bit (visited,
2993					  SSA_NAME_VERSION
2994					    (USE_FROM_PTR (curr)))))
2995	    curr = op_iter_next_use (&curri);
2996	  if (curr == NULL_USE_OPERAND_P)
2997	    goto pop;
2998	}
2999    }
3000  while (1);
3001  if (dump_file && (dump_flags & TDF_DETAILS))
3002    {
3003      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3004      unsigned i;
3005      std::pair<ssa_op_iter, use_operand_p> *x;
3006      FOR_EACH_VEC_ELT (path, i, x)
3007	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3008      dump_printf (MSG_NOTE, "\n");
3009    }
3010
3011  /* Check whether the reduction path detected is valid.  */
3012  bool fail = path.length () == 0;
3013  bool neg = false;
3014  int sign = -1;
3015  *code = ERROR_MARK;
3016  for (unsigned i = 1; i < path.length (); ++i)
3017    {
3018      gimple *use_stmt = USE_STMT (path[i].second);
3019      tree op = USE_FROM_PTR (path[i].second);
3020      if (! is_gimple_assign (use_stmt)
	  /* The following makes sure we can compute the operand index
	     easily, and it mostly disallows chaining via COND_EXPR condition
	     operands.  */
3024	  || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3025	      && (gimple_num_ops (use_stmt) <= 2
3026		  || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3027	      && (gimple_num_ops (use_stmt) <= 3
3028		  || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3029	{
3030	  fail = true;
3031	  break;
3032	}
3033      tree_code use_code = gimple_assign_rhs_code (use_stmt);
3034      if (use_code == MINUS_EXPR)
3035	{
3036	  use_code = PLUS_EXPR;
3037	  /* Track whether we negate the reduction value each iteration.  */
3038	  if (gimple_assign_rhs2 (use_stmt) == op)
3039	    neg = ! neg;
3040	}
3041      if (CONVERT_EXPR_CODE_P (use_code)
3042	  && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3043				    TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3044	;
3045      else if (*code == ERROR_MARK)
3046	{
3047	  *code = use_code;
3048	  sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3049	}
3050      else if (use_code != *code)
3051	{
3052	  fail = true;
3053	  break;
3054	}
3055      else if ((use_code == MIN_EXPR
3056		|| use_code == MAX_EXPR)
3057	       && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3058	{
3059	  fail = true;
3060	  break;
3061	}
      /* Check that there's only a single stmt the op is used on.  For the
	 non-value-changing tail and the last stmt allow out-of-loop uses.
3064	 ???  We could relax this and handle arbitrary live stmts by
3065	 forcing a scalar epilogue for example.  */
3066      imm_use_iterator imm_iter;
3067      gimple *op_use_stmt;
3068      unsigned cnt = 0;
3069      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3070	if (!is_gimple_debug (op_use_stmt)
3071	    && (*code != ERROR_MARK
3072		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3073	  {
3074	    /* We want to allow x + x but not x < 1 ? x : 2.  */
3075	    if (is_gimple_assign (op_use_stmt)
3076		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3077	      {
3078		use_operand_p use_p;
3079		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3080		  cnt++;
3081	      }
3082	    else
3083	      cnt++;
3084	  }
3085      if (cnt != 1)
3086	{
3087	  fail = true;
3088	  break;
3089	}
3090    }
3091  return ! fail && ! neg && *code != ERROR_MARK;
3092}
3093
3094bool
3095check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3096		      tree loop_arg, enum tree_code code)
3097{
3098  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3099  enum tree_code code_;
3100  return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3101	  && code_ == code);
3102}
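
/* As an illustration (a hypothetical example, not taken from a testcase),
   for a loop body of the form

     sum_1 = PHI <sum_0 (preheader), sum_3 (latch)>
     sum_2 = sum_1 + a[i];
     sum_3 = sum_2 + b[i];

   the reduction path connecting the PHI result sum_1 with the latch
   definition sum_3 consists of the two PLUS_EXPR statements.  Both use
   the same code and each intermediate value has a single use, so the
   path is accepted and *CODE is set to PLUS_EXPR.  */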
3103
3104
3105
3106/* Function vect_is_simple_reduction
3107
3108   (1) Detect a cross-iteration def-use cycle that represents a simple
3109   reduction computation.  We look for the following pattern:
3110
3111   loop_header:
3112     a1 = phi < a0, a2 >
3113     a3 = ...
3114     a2 = operation (a3, a1)
3115
3116   or
3117
3118   a3 = ...
3119   loop_header:
3120     a1 = phi < a0, a2 >
3121     a2 = operation (a3, a1)
3122
3123   such that:
3124   1. operation is commutative and associative and it is safe to
3125      change the order of the computation
3126   2. no uses for a2 in the loop (a2 is used out of the loop)
3127   3. no uses of a1 in the loop besides the reduction operation
3128   4. no uses of a1 outside the loop.
3129
3130   Conditions 1,4 are tested here.
3131   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3132
3133   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3134   nested cycles.
3135
3136   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3137   reductions:
3138
3139     a1 = phi < a0, a2 >
3140     inner loop (def of a3)
3141     a2 = phi < a3 >
3142
3143   (4) Detect condition expressions, i.e.:
3144     for (int i = 0; i < N; i++)
3145       if (a[i] < val)
3146	ret_val = a[i];
3147
3148*/
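
/* As a source-level illustration of (3) (an assumed example, not taken
   from the testsuite), a double reduction typically arises from

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 sum += a[i][j];

   where the outer-loop PHI for 'sum' receives its latch value from the
   loop-closed PHI of the inner-loop reduction.  */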
3149
3150static stmt_vec_info
3151vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3152			  bool *double_reduc, bool *reduc_chain_p)
3153{
3154  gphi *phi = as_a <gphi *> (phi_info->stmt);
3155  gimple *phi_use_stmt = NULL;
3156  imm_use_iterator imm_iter;
3157  use_operand_p use_p;
3158
3159  *double_reduc = false;
3160  *reduc_chain_p = false;
3161  STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3162
3163  tree phi_name = PHI_RESULT (phi);
3164  /* ???  If there are no uses of the PHI result the inner loop reduction
3165     won't be detected as a possible double reduction by vectorizable_reduction
3166     because that tries to walk the PHI arg from the preheader edge which
3167     can be constant.  See PR60382.  */
3168  if (has_zero_uses (phi_name))
3169    return NULL;
3170  class loop *loop = (gimple_bb (phi))->loop_father;
3171  unsigned nphi_def_loop_uses = 0;
3172  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3173    {
3174      gimple *use_stmt = USE_STMT (use_p);
3175      if (is_gimple_debug (use_stmt))
3176	continue;
3177
3178      if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3179        {
3180          if (dump_enabled_p ())
3181	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3182			     "intermediate value used outside loop.\n");
3183
3184          return NULL;
3185        }
3186
3187      nphi_def_loop_uses++;
3188      phi_use_stmt = use_stmt;
3189    }
3190
3191  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3192  if (TREE_CODE (latch_def) != SSA_NAME)
3193    {
3194      if (dump_enabled_p ())
3195	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196			 "reduction: not ssa_name: %T\n", latch_def);
3197      return NULL;
3198    }
3199
3200  stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3201  if (!def_stmt_info
3202      || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3203    return NULL;
3204
3205  bool nested_in_vect_loop
3206    = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3207  unsigned nlatch_def_loop_uses = 0;
3208  auto_vec<gphi *, 3> lcphis;
3209  bool inner_loop_of_double_reduc = false;
3210  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3211    {
3212      gimple *use_stmt = USE_STMT (use_p);
3213      if (is_gimple_debug (use_stmt))
3214	continue;
3215      if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3216	nlatch_def_loop_uses++;
3217      else
3218	{
3219	  /* We can have more than one loop-closed PHI.  */
3220	  lcphis.safe_push (as_a <gphi *> (use_stmt));
3221	  if (nested_in_vect_loop
3222	      && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3223		  == vect_double_reduction_def))
3224	    inner_loop_of_double_reduc = true;
3225	}
3226    }
3227
3228  /* If we are vectorizing an inner reduction we are executing it
3229     in the original order only if we are not dealing with a
3230     double reduction.  */
3231  if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3232    {
3233      if (dump_enabled_p ())
3234	report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3235			"detected nested cycle: ");
3236      return def_stmt_info;
3237    }
3238
3239  /* When the inner loop of a double reduction ends up with more than
3240     one loop-closed PHI we have failed to classify alternate such
3241     PHIs as double reductions, leading to wrong code.  See PR103237.  */
3242  if (inner_loop_of_double_reduc && lcphis.length () != 1)
3243    {
3244      if (dump_enabled_p ())
3245	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3246			 "unhandled double reduction\n");
3247      return NULL;
3248    }
3249
3250  /* If this isn't a nested cycle or if the nested cycle reduction value
3251     is used outside of the inner loop we cannot handle uses of the reduction
3252     value.  */
3253  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3254    {
3255      if (dump_enabled_p ())
3256	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3257			 "reduction used in loop.\n");
3258      return NULL;
3259    }
3260
3261  /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3262     defined in the inner loop.  */
3263  if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3264    {
3265      tree op1 = PHI_ARG_DEF (def_stmt, 0);
3266      if (gimple_phi_num_args (def_stmt) != 1
3267          || TREE_CODE (op1) != SSA_NAME)
3268        {
3269          if (dump_enabled_p ())
3270	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271			     "unsupported phi node definition.\n");
3272
3273          return NULL;
3274        }
3275
3276      gimple *def1 = SSA_NAME_DEF_STMT (op1);
3277      if (gimple_bb (def1)
3278	  && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3279          && loop->inner
3280          && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3281          && is_gimple_assign (def1)
3282	  && is_a <gphi *> (phi_use_stmt)
3283	  && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3284        {
3285          if (dump_enabled_p ())
3286            report_vect_op (MSG_NOTE, def_stmt,
3287			    "detected double reduction: ");
3288
3289          *double_reduc = true;
3290	  return def_stmt_info;
3291        }
3292
3293      return NULL;
3294    }
3295
3296  /* Look for the expression computing latch_def from the loop PHI result.  */
3297  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3298  enum tree_code code;
3299  if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3300			    path))
3301    {
3302      STMT_VINFO_REDUC_CODE (phi_info) = code;
3303      if (code == COND_EXPR && !nested_in_vect_loop)
3304	STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3305
3306      /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3307	 reduction chain for which the additional restriction is that
3308	 all operations in the chain are the same.  */
3309      auto_vec<stmt_vec_info, 8> reduc_chain;
3310      unsigned i;
3311      bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3312      for (i = path.length () - 1; i >= 1; --i)
3313	{
3314	  gimple *stmt = USE_STMT (path[i].second);
3315	  stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3316	  STMT_VINFO_REDUC_IDX (stmt_info)
3317	    = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3318	  enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3319	  bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3320				     && (i == 1 || i == path.length () - 1));
3321	  if ((stmt_code != code && !leading_conversion)
3322	      /* We can only handle the final value in epilogue
3323		 generation for reduction chains.  */
3324	      || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3325	    is_slp_reduc = false;
3326	  /* For reduction chains we support trailing/leading
3327	     conversions.  We do not store those in the actual chain.  */
3328	  if (leading_conversion)
3329	    continue;
3330	  reduc_chain.safe_push (stmt_info);
3331	}
3332      if (is_slp_reduc && reduc_chain.length () > 1)
3333	{
3334	  for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3335	    {
3336	      REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3337	      REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3338	    }
3339	  REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3340	  REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3341
3342	  /* Save the chain for further analysis in SLP detection.  */
3343	  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3344	  REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3345
3346	  *reduc_chain_p = true;
3347	  if (dump_enabled_p ())
3348	    dump_printf_loc (MSG_NOTE, vect_location,
3349			    "reduction: detected reduction chain\n");
3350	}
3351      else if (dump_enabled_p ())
3352	dump_printf_loc (MSG_NOTE, vect_location,
3353			 "reduction: detected reduction\n");
3354
3355      return def_stmt_info;
3356    }
3357
3358  if (dump_enabled_p ())
3359    dump_printf_loc (MSG_NOTE, vect_location,
3360		     "reduction: unknown pattern\n");
3361
3362  return NULL;
3363}
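
/* Reduction chain illustration (an assumed example): for

     for (i = 0; i < n; i++)
       sum = sum + a[2*i] + a[2*i+1];

   the reduction path contains two PLUS_EXPR statements, so the above
   records a reduction chain of size two (REDUC_GROUP_SIZE == 2) and
   pushes its head to LOOP_VINFO_REDUCTION_CHAINS for SLP analysis.  */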
3364
3365/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3366int
3367vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3368                             int *peel_iters_epilogue,
3369                             stmt_vector_for_cost *scalar_cost_vec,
3370			     stmt_vector_for_cost *prologue_cost_vec,
3371			     stmt_vector_for_cost *epilogue_cost_vec)
3372{
3373  int retval = 0;
3374  int assumed_vf = vect_vf_for_cost (loop_vinfo);
3375
3376  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3377    {
3378      *peel_iters_epilogue = assumed_vf / 2;
3379      if (dump_enabled_p ())
3380        dump_printf_loc (MSG_NOTE, vect_location,
3381			 "cost model: epilogue peel iters set to vf/2 "
3382			 "because loop iterations are unknown.\n");
3383
3384      /* If peeled iterations are known but the number of scalar loop
3385         iterations is unknown, count a taken branch per peeled loop.  */
3386      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3387				 NULL, 0, vect_prologue);
3388      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3389				  NULL, 0, vect_epilogue);
3390    }
3391  else
3392    {
3393      int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3394      peel_iters_prologue = niters < peel_iters_prologue ?
3395                            niters : peel_iters_prologue;
3396      *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3397      /* If we need to peel for gaps, but no epilogue peeling is otherwise
3398	 required, we have to peel VF iterations.  */
3399      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3400	*peel_iters_epilogue = assumed_vf;
3401    }
3402
3403  stmt_info_for_cost *si;
3404  int j;
3405  if (peel_iters_prologue)
3406    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3407      retval += record_stmt_cost (prologue_cost_vec,
3408				  si->count * peel_iters_prologue,
3409				  si->kind, si->stmt_info, si->misalign,
3410				  vect_prologue);
3411  if (*peel_iters_epilogue)
3412    FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3413      retval += record_stmt_cost (epilogue_cost_vec,
3414				  si->count * *peel_iters_epilogue,
3415				  si->kind, si->stmt_info, si->misalign,
3416				  vect_epilogue);
3417
3418  return retval;
3419}
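
/* Worked example for the known-niters case above, with hypothetical
   numbers: for LOOP_VINFO_INT_NITERS == 100, PEEL_ITERS_PROLOGUE == 3
   and an assumed vectorization factor of 8 we get

     *peel_iters_epilogue = (100 - 3) % 8 = 1

   so the scalar cost vector is accounted three times in the prologue
   and once in the epilogue.  If peeling for gaps were required and the
   remainder were zero, the epilogue would instead be costed for a full
   VF (8) iterations.  */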
3420
3421/* Function vect_estimate_min_profitable_iters
3422
3423   Return the number of iterations required for the vector version of the
3424   loop to be profitable relative to the cost of the scalar version of the
3425   loop.
3426
3427   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3428   of iterations for vectorization.  A value of -1 means loop vectorization
3429   is not profitable.  This returned value may be used for the dynamic
3430   profitability check.
3431
3432   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3433   for static check against estimated number of iterations.  */
3434
3435static void
3436vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3437				    int *ret_min_profitable_niters,
3438				    int *ret_min_profitable_estimate)
3439{
3440  int min_profitable_iters;
3441  int min_profitable_estimate;
3442  int peel_iters_prologue;
3443  int peel_iters_epilogue;
3444  unsigned vec_inside_cost = 0;
3445  int vec_outside_cost = 0;
3446  unsigned vec_prologue_cost = 0;
3447  unsigned vec_epilogue_cost = 0;
3448  int scalar_single_iter_cost = 0;
3449  int scalar_outside_cost = 0;
3450  int assumed_vf = vect_vf_for_cost (loop_vinfo);
3451  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3452  void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3453
3454  /* Cost model disabled.  */
3455  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3456    {
3457      if (dump_enabled_p ())
3458	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3459      *ret_min_profitable_niters = 0;
3460      *ret_min_profitable_estimate = 0;
3461      return;
3462    }
3463
3464  /* Requires loop versioning tests to handle misalignment.  */
3465  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3466    {
3467      /*  FIXME: Make cost depend on complexity of individual check.  */
3468      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3469      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3470			    vect_prologue);
3471      if (dump_enabled_p ())
3472	dump_printf (MSG_NOTE,
3473		     "cost model: Adding cost of checks for loop "
3474		     "versioning to treat misalignment.\n");
3475    }
3476
3477  /* Requires loop versioning with alias checks.  */
3478  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3479    {
3480      /*  FIXME: Make cost depend on complexity of individual check.  */
3481      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3482      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3483			    vect_prologue);
3484      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3485      if (len)
3486	/* Count LEN - 1 ANDs and LEN comparisons.  */
3487	(void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3488			      NULL, 0, vect_prologue);
3489      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3490      if (len)
3491	{
3492	  /* Count LEN - 1 ANDs and LEN comparisons.  */
3493	  unsigned int nstmts = len * 2 - 1;
3494	  /* +1 for each bias that needs adding.  */
3495	  for (unsigned int i = 0; i < len; ++i)
3496	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3497	      nstmts += 1;
3498	  (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3499				NULL, 0, vect_prologue);
3500	}
3501      if (dump_enabled_p ())
3502	dump_printf (MSG_NOTE,
3503		     "cost model: Adding cost of checks for loop "
3504		     "versioning aliasing.\n");
3505    }
3506
3507  /* Requires loop versioning with niter checks.  */
3508  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3509    {
3510      /*  FIXME: Make cost depend on complexity of individual check.  */
3511      (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3512			    vect_prologue);
3513      if (dump_enabled_p ())
3514	dump_printf (MSG_NOTE,
3515		     "cost model: Adding cost of checks for loop "
3516		     "versioning niters.\n");
3517    }
3518
3519  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3520    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3521			  vect_prologue);
3522
3523  /* Count statements in scalar loop.  Using this as scalar cost for a single
3524     iteration for now.
3525
3526     TODO: Add outer loop support.
3527
3528     TODO: Consider assigning different costs to different scalar
3529     statements.  */
3530
3531  scalar_single_iter_cost
3532    = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3533
3534  /* Add additional cost for the peeled instructions in the prologue and
3535     epilogue loops.  (For fully-masked loops there will be no peeling.)
3536
3537     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3538     at compile-time - we assume it's vf/2 (the worst would be vf-1).
3539
3540     TODO: Build an expression that represents peel_iters for prologue and
3541     epilogue to be used in a run-time test.  */
3542
3543  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3544    {
3545      peel_iters_prologue = 0;
3546      peel_iters_epilogue = 0;
3547
3548      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3549	{
3550	  /* We need to peel exactly one iteration.  */
3551	  peel_iters_epilogue += 1;
3552	  stmt_info_for_cost *si;
3553	  int j;
3554	  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3555			    j, si)
3556	    (void) add_stmt_cost (target_cost_data, si->count,
3557				  si->kind, si->stmt_info, si->misalign,
3558				  vect_epilogue);
3559	}
3560
3561      /* Calculate how many masks we need to generate.  */
3562      unsigned int num_masks = 0;
3563      rgroup_masks *rgm;
3564      unsigned int num_vectors_m1;
3565      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3566	if (rgm->mask_type)
3567	  num_masks += num_vectors_m1 + 1;
3568      gcc_assert (num_masks > 0);
3569
3570      /* In the worst case, we need to generate each mask in the prologue
3571	 and in the loop body.  One of the loop body mask instructions
3572	 replaces the comparison in the scalar loop, and since we don't
3573	 count the scalar comparison against the scalar body, we shouldn't
3574	 count that vector instruction against the vector body either.
3575
3576	 Sometimes we can use unpacks instead of generating prologue
3577	 masks and sometimes the prologue mask will fold to a constant,
3578	 so the actual prologue cost might be smaller.  However, it's
3579	 simpler and safer to use the worst-case cost; if this ends up
3580	 being the tie-breaker between vectorizing or not, then it's
3581	 probably better not to vectorize.  */
3582      (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
3583			    NULL, 0, vect_prologue);
3584      (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
3585			    NULL, 0, vect_body);
3586    }
3587  else if (npeel < 0)
3588    {
3589      peel_iters_prologue = assumed_vf / 2;
3590      if (dump_enabled_p ())
3591	dump_printf (MSG_NOTE, "cost model: "
3592		     "prologue peel iters set to vf/2.\n");
3593
3594      /* If peeling for alignment is unknown, the loop bound of the main loop
3595         becomes unknown.  */
3596      peel_iters_epilogue = assumed_vf / 2;
3597      if (dump_enabled_p ())
3598	dump_printf (MSG_NOTE, "cost model: "
3599		     "epilogue peel iters set to vf/2 because "
3600		     "peeling for alignment is unknown.\n");
3601
3602      /* If peeled iterations are unknown, count a taken branch and a not-taken
3603         branch per peeled loop.  Even if scalar loop iterations are known,
3604         vector iterations are not known since peeled prologue iterations are
3605         not known.  Hence the guards remain the same.  */
3606      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3607			    NULL, 0, vect_prologue);
3608      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3609			    NULL, 0, vect_prologue);
3610      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3611			    NULL, 0, vect_epilogue);
3612      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3613			    NULL, 0, vect_epilogue);
3614      stmt_info_for_cost *si;
3615      int j;
3616      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3617	{
3618	  (void) add_stmt_cost (target_cost_data,
3619				si->count * peel_iters_prologue,
3620				si->kind, si->stmt_info, si->misalign,
3621				vect_prologue);
3622	  (void) add_stmt_cost (target_cost_data,
3623				si->count * peel_iters_epilogue,
3624				si->kind, si->stmt_info, si->misalign,
3625				vect_epilogue);
3626	}
3627    }
3628  else
3629    {
3630      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3631      stmt_info_for_cost *si;
3632      int j;
3633      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3634
3635      prologue_cost_vec.create (2);
3636      epilogue_cost_vec.create (2);
3637      peel_iters_prologue = npeel;
3638
3639      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3640					  &peel_iters_epilogue,
3641					  &LOOP_VINFO_SCALAR_ITERATION_COST
3642					    (loop_vinfo),
3643					  &prologue_cost_vec,
3644					  &epilogue_cost_vec);
3645
3646      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3647	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3648			      si->misalign, vect_prologue);
3649
3650      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3651	(void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3652			      si->misalign, vect_epilogue);
3653
3654      prologue_cost_vec.release ();
3655      epilogue_cost_vec.release ();
3656    }
3657
3658  /* FORNOW: The scalar outside cost is incremented in one of the
3659     following ways:
3660
3661     1. The vectorizer checks for alignment and aliasing and generates
3662     a condition that allows dynamic vectorization.  A cost model
3663     check is ANDed with the versioning condition.  Hence the scalar code
3664     path now has the added cost of the versioning check.
3665
3666       if (cost > th & versioning_check)
3667         jmp to vector code
3668
3669     Hence run-time scalar is incremented by not-taken branch cost.
3670
3671     2. The vectorizer then checks if a prologue is required.  If the
3672     cost model check was not done before during versioning, it has to
3673     be done before the prologue check.
3674
3675       if (cost <= th)
3676         prologue = scalar_iters
3677       if (prologue == 0)
3678         jmp to vector code
3679       else
3680         execute prologue
3681       if (prologue == num_iters)
3682	 go to exit
3683
3684     Hence the run-time scalar cost is incremented by a taken branch,
3685     plus a not-taken branch, plus a taken branch cost.
3686
3687     3. The vectorizer then checks if an epilogue is required.  If the
3688     cost model check was not done before during prologue check, it
3689     has to be done with the epilogue check.
3690
3691       if (prologue == 0)
3692         jmp to vector code
3693       else
3694         execute prologue
3695       if (prologue == num_iters)
3696	 go to exit
3697       vector code:
3698         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3699           jmp to epilogue
3700
3701     Hence the run-time scalar cost should be incremented by 2 taken
3702     branches.
3703
3704     TODO: The back end may reorder the BBs differently and reverse
3705     conditions/branch directions.  Change the estimates below to
3706     something more reasonable.  */
3707
3708  /* If the number of iterations is known and we do not do versioning, we can
3709     decide whether to vectorize at compile time.  Hence the scalar version
3710     does not carry cost model guard costs.  */
3711  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3712      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3713    {
3714      /* Cost model check occurs at versioning.  */
3715      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3716	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3717      else
3718	{
3719	  /* Cost model check occurs at prologue generation.  */
3720	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3721	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3722	      + vect_get_stmt_cost (cond_branch_not_taken);
3723	  /* Cost model check occurs at epilogue generation.  */
3724	  else
3725	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3726	}
3727    }
3728
3729  /* Complete the target-specific cost calculations.  */
3730  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3731	       &vec_inside_cost, &vec_epilogue_cost);
3732
3733  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3734
3735  /* Stash the costs so that we can compare two loop_vec_infos.  */
3736  loop_vinfo->vec_inside_cost = vec_inside_cost;
3737  loop_vinfo->vec_outside_cost = vec_outside_cost;
3738
3739  if (dump_enabled_p ())
3740    {
3741      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3742      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3743                   vec_inside_cost);
3744      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3745                   vec_prologue_cost);
3746      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3747                   vec_epilogue_cost);
3748      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3749                   scalar_single_iter_cost);
3750      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3751                   scalar_outside_cost);
3752      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3753                   vec_outside_cost);
3754      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3755                   peel_iters_prologue);
3756      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3757                   peel_iters_epilogue);
3758    }
3759
3760  /* Calculate number of iterations required to make the vector version
3761     profitable, relative to the loop bodies only.  The following condition
3762     must hold true:
3763     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3764     where
3765     SIC = scalar iteration cost, VIC = vector iteration cost,
3766     VOC = vector outside cost, VF = vectorization factor,
3767     NPEEL = prologue iterations + epilogue iterations,
3768     SOC = scalar outside cost for run time cost model check.  */
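
  /* For illustration only, with hypothetical costs: for SIC = 4, VIC = 20,
     VF = 8, two peeled prologue and two peeled epilogue iterations,
     VOC = 50 and SOC = 0, the saving per vector iteration is
     4 * 8 - 20 = 12 and the not-fully-masked computation below gives

       min_profitable_iters = (50 * 8 - 20 * 2 - 20 * 2) / 12 = 26

     which the subsequent <= check bumps to 27, i.e. the vector loop is
     expected to pay off from roughly 27 scalar iterations onwards.  */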
3769
3770  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3771			  - vec_inside_cost);
3772  if (saving_per_viter <= 0)
3773    {
3774      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3775	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3776		    "vectorization did not happen for a simd loop");
3777
3778      if (dump_enabled_p ())
3779        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3780			 "cost model: the vector iteration cost = %d "
3781			 "divided by the scalar iteration cost = %d "
3782			 "is greater than or equal to the vectorization factor = %d"
3783                         ".\n",
3784			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3785      *ret_min_profitable_niters = -1;
3786      *ret_min_profitable_estimate = -1;
3787      return;
3788    }
3789
3790  /* ??? The "if" arm is written to handle all cases; see below for what
3791     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
3792  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3793    {
3794      /* Rewriting the condition above in terms of the number of
3795	 vector iterations (vniters) rather than the number of
3796	 scalar iterations (niters) gives:
3797
3798	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3799
3800	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3801
3802	 For integer N, X and Y when X > 0:
3803
3804	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
3805      int outside_overhead = (vec_outside_cost
3806			      - scalar_single_iter_cost * peel_iters_prologue
3807			      - scalar_single_iter_cost * peel_iters_epilogue
3808			      - scalar_outside_cost);
3809      /* We're only interested in cases that require at least one
3810	 vector iteration.  */
3811      int min_vec_niters = 1;
3812      if (outside_overhead > 0)
3813	min_vec_niters = outside_overhead / saving_per_viter + 1;
3814
3815      if (dump_enabled_p ())
3816	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
3817		     min_vec_niters);
3818
3819      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3820	{
3821	  /* Now that we know the minimum number of vector iterations,
3822	     find the minimum niters for which the scalar cost is larger:
3823
3824	     SIC * niters > VIC * vniters + VOC - SOC
3825
3826	     We know that the minimum niters is no more than
3827	     vniters * VF + NPEEL, but it might be (and often is) less
3828	     than that if a partial vector iteration is cheaper than the
3829	     equivalent scalar code.  */
3830	  int threshold = (vec_inside_cost * min_vec_niters
3831			   + vec_outside_cost
3832			   - scalar_outside_cost);
3833	  if (threshold <= 0)
3834	    min_profitable_iters = 1;
3835	  else
3836	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3837	}
3838      else
3839	/* Convert the number of vector iterations into a number of
3840	   scalar iterations.  */
3841	min_profitable_iters = (min_vec_niters * assumed_vf
3842				+ peel_iters_prologue
3843				+ peel_iters_epilogue);
3844    }
3845  else
3846    {
3847      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3848			      * assumed_vf
3849			      - vec_inside_cost * peel_iters_prologue
3850			      - vec_inside_cost * peel_iters_epilogue);
3851      if (min_profitable_iters <= 0)
3852        min_profitable_iters = 0;
3853      else
3854	{
3855	  min_profitable_iters /= saving_per_viter;
3856
3857	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3858	      <= (((int) vec_inside_cost * min_profitable_iters)
3859		  + (((int) vec_outside_cost - scalar_outside_cost)
3860		     * assumed_vf)))
3861	    min_profitable_iters++;
3862	}
3863    }
3864
3865  if (dump_enabled_p ())
3866    dump_printf (MSG_NOTE,
3867		 "  Calculated minimum iters for profitability: %d\n",
3868		 min_profitable_iters);
3869
3870  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3871      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3872    /* We want the vectorized loop to execute at least once.  */
3873    min_profitable_iters = assumed_vf + peel_iters_prologue;
3874
3875  if (dump_enabled_p ())
3876    dump_printf_loc (MSG_NOTE, vect_location,
3877                     "  Runtime profitability threshold = %d\n",
3878                     min_profitable_iters);
3879
3880  *ret_min_profitable_niters = min_profitable_iters;
3881
3882  /* Calculate number of iterations required to make the vector version
3883     profitable, relative to the loop bodies only.
3884
3885     The non-vectorized variant costs SIC * niters and it must win over the
3886     vector variant on the expected loop trip count.  The following condition
3887     must hold true: SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3888
3889  if (vec_outside_cost <= 0)
3890    min_profitable_estimate = 0;
3891  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3892    {
3893      /* This is a repeat of the code above, but with + SOC rather
3894	 than - SOC.  */
3895      int outside_overhead = (vec_outside_cost
3896			      - scalar_single_iter_cost * peel_iters_prologue
3897			      - scalar_single_iter_cost * peel_iters_epilogue
3898			      + scalar_outside_cost);
3899      int min_vec_niters = 1;
3900      if (outside_overhead > 0)
3901	min_vec_niters = outside_overhead / saving_per_viter + 1;
3902
3903      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3904	{
3905	  int threshold = (vec_inside_cost * min_vec_niters
3906			   + vec_outside_cost
3907			   + scalar_outside_cost);
3908	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3909	}
3910      else
3911	min_profitable_estimate = (min_vec_niters * assumed_vf
3912				   + peel_iters_prologue
3913				   + peel_iters_epilogue);
3914    }
3915  else
3916    {
3917      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3918				 * assumed_vf
3919				 - vec_inside_cost * peel_iters_prologue
3920				 - vec_inside_cost * peel_iters_epilogue)
3921				 / ((scalar_single_iter_cost * assumed_vf)
3922				   - vec_inside_cost);
3923    }
3924  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3925  if (dump_enabled_p ())
3926    dump_printf_loc (MSG_NOTE, vect_location,
3927		     "  Static estimate profitability threshold = %d\n",
3928		     min_profitable_estimate);
3929
3930  *ret_min_profitable_estimate = min_profitable_estimate;
3931}
3932
3933/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3934   vector elements (not bits) for a vector with NELT elements.  */
3935static void
3936calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3937			      vec_perm_builder *sel)
3938{
3939  /* The encoding is a single stepped pattern.  Any wrap-around is handled
3940     by vec_perm_indices.  */
3941  sel->new_vector (nelt, 1, 3);
3942  for (unsigned int i = 0; i < 3; i++)
3943    sel->quick_push (i + offset);
3944}
3945
3946/* Checks whether the target supports whole-vector shifts for vectors of mode
3947   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3948   it supports vec_perm_const with masks for all necessary shift amounts.  */
3949static bool
3950have_whole_vector_shift (machine_mode mode)
3951{
3952  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3953    return true;
3954
3955  /* Variable-length vectors should be handled via the optab.  */
3956  unsigned int nelt;
3957  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3958    return false;
3959
3960  vec_perm_builder sel;
3961  vec_perm_indices indices;
3962  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3963    {
3964      calc_vec_perm_mask_for_shift (i, nelt, &sel);
3965      indices.new_vector (sel, 2, nelt);
3966      if (!can_vec_perm_const_p (mode, indices, false))
3967	return false;
3968    }
3969  return true;
3970}
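
/* For example, in the fixed-length case with NELT == 8 the loop above
   queries vec_perm_const support for shifts by 4, 2 and 1 elements.
   calc_vec_perm_mask_for_shift encodes the shift-by-2 selector as the
   stepped pattern {2, 3, 4, ...}, which vec_perm_indices extends (with
   any wrap-around selecting from the second input) to
   {2, 3, 4, 5, 6, 7, 8, 9}.  */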
3971
3972/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3973   functions. Design better to avoid maintenance issues.  */
3974
3975/* Function vect_model_reduction_cost.
3976
3977   Models cost for a reduction operation, including the vector ops
3978   generated within the strip-mine loop in some cases, the initial
3979   definition before the loop, and the epilogue code that must be generated.  */
3980
3981static void
3982vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3983			   vect_reduction_type reduction_type,
3984			   int ncopies, stmt_vector_for_cost *cost_vec)
3985{
3986  int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3987  enum tree_code code;
3988  optab optab;
3989  tree vectype;
3990  machine_mode mode;
3991  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3992  class loop *loop = NULL;
3993
3994  if (loop_vinfo)
3995    loop = LOOP_VINFO_LOOP (loop_vinfo);
3996
3997  /* Condition reductions generate two reductions in the loop.  */
3998  if (reduction_type == COND_REDUCTION)
3999    ncopies *= 2;
4000
4001  vectype = STMT_VINFO_VECTYPE (stmt_info);
4002  mode = TYPE_MODE (vectype);
4003  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4004
4005  code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4006
4007  if (reduction_type == EXTRACT_LAST_REDUCTION)
4008    /* No extra instructions are needed in the prologue.  The loop body
4009       operations are costed in vectorizable_condition.  */
4010    inside_cost = 0;
4011  else if (reduction_type == FOLD_LEFT_REDUCTION)
4012    {
4013      /* No extra instructions needed in the prologue.  */
4014      prologue_cost = 0;
4015
4016      if (reduc_fn != IFN_LAST)
4017	/* Count one reduction-like operation per vector.  */
4018	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4019					stmt_info, 0, vect_body);
4020      else
4021	{
4022	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4023	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4024	  inside_cost = record_stmt_cost (cost_vec, nelements,
4025					  vec_to_scalar, stmt_info, 0,
4026					  vect_body);
4027	  inside_cost += record_stmt_cost (cost_vec, nelements,
4028					   scalar_stmt, stmt_info, 0,
4029					   vect_body);
4030	}
4031    }
4032  else
4033    {
4034      /* Add in cost for initial definition.
4035	 For cond reduction we have four vectors: initial index, step,
4036	 initial result of the data reduction, initial value of the index
4037	 reduction.  */
4038      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4039      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4040					 scalar_to_vec, stmt_info, 0,
4041					 vect_prologue);
4042    }
4043
4044  /* Determine cost of epilogue code.
4045
4046     We have a reduction operator that will reduce the vector in one statement.
4047     Also requires scalar extract.  */
4048
4049  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4050    {
4051      if (reduc_fn != IFN_LAST)
4052	{
4053	  if (reduction_type == COND_REDUCTION)
4054	    {
4055	      /* An EQ stmt and a COND_EXPR stmt.  */
4056	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4057						 vector_stmt, stmt_info, 0,
4058						 vect_epilogue);
4059	      /* Reduction of the max index and a reduction of the found
4060		 values.  */
4061	      epilogue_cost += record_stmt_cost (cost_vec, 2,
4062						 vec_to_scalar, stmt_info, 0,
4063						 vect_epilogue);
4064	      /* A broadcast of the max value.  */
4065	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4066						 scalar_to_vec, stmt_info, 0,
4067						 vect_epilogue);
4068	    }
4069	  else
4070	    {
4071	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4072						 stmt_info, 0, vect_epilogue);
4073	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4074						 vec_to_scalar, stmt_info, 0,
4075						 vect_epilogue);
4076	    }
4077	}
4078      else if (reduction_type == COND_REDUCTION)
4079	{
4080	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4081	  /* Extraction of scalar elements.  */
4082	  epilogue_cost += record_stmt_cost (cost_vec,
4083					     2 * estimated_nunits,
4084					     vec_to_scalar, stmt_info, 0,
4085					     vect_epilogue);
4086	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4087	  epilogue_cost += record_stmt_cost (cost_vec,
4088					     2 * estimated_nunits - 3,
4089					     scalar_stmt, stmt_info, 0,
4090					     vect_epilogue);
4091	}
4092      else if (reduction_type == EXTRACT_LAST_REDUCTION
4093	       || reduction_type == FOLD_LEFT_REDUCTION)
4094	/* No extra instructions needed in the epilogue.  */
4095	;
4096      else
4097	{
4098	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4099	  tree bitsize =
4100	    TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4101	  int element_bitsize = tree_to_uhwi (bitsize);
4102	  int nelements = vec_size_in_bits / element_bitsize;
4103
4104	  if (code == COND_EXPR)
4105	    code = MAX_EXPR;
4106
4107	  optab = optab_for_tree_code (code, vectype, optab_default);
4108
4109	  /* We have a whole vector shift available.  */
4110	  if (optab != unknown_optab
4111	      && VECTOR_MODE_P (mode)
4112	      && optab_handler (optab, mode) != CODE_FOR_nothing
4113	      && have_whole_vector_shift (mode))
4114	    {
4115	      /* Final reduction via vector shifts and the reduction operator.
4116		 Also requires scalar extract.  */
4117	      epilogue_cost += record_stmt_cost (cost_vec,
4118						 exact_log2 (nelements) * 2,
4119						 vector_stmt, stmt_info, 0,
4120						 vect_epilogue);
4121	      epilogue_cost += record_stmt_cost (cost_vec, 1,
4122						 vec_to_scalar, stmt_info, 0,
4123						 vect_epilogue);
4124	    }
4125	  else
4126	    /* Use extracts and reduction op for final reduction.  For N
4127	       elements, we have N extracts and N-1 reduction ops.  */
4128	    epilogue_cost += record_stmt_cost (cost_vec,
4129					       nelements + nelements - 1,
4130					       vector_stmt, stmt_info, 0,
4131					       vect_epilogue);
4132	}
4133    }
4134
4135  if (dump_enabled_p ())
4136    dump_printf (MSG_NOTE,
4137                 "vect_model_reduction_cost: inside_cost = %d, "
4138                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4139                 prologue_cost, epilogue_cost);
4140}
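
/* For illustration, with hypothetical counts: for a plain in-loop
   PLUS_EXPR reduction with NCOPIES == 1, a supported REDUC_FN and no
   nesting, the above records one scalar_to_vec in the prologue (the
   initial definition) and one vector_stmt plus one vec_to_scalar in the
   epilogue (the reduction operation and the extraction of the scalar
   result); no additional loop-body cost is added here.  */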
4141
4142
4143/* Function vect_model_induction_cost.
4144
4145   Models cost for induction operations.  */
4146
4147static void
4148vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4149			   stmt_vector_for_cost *cost_vec)
4150{
4151  unsigned inside_cost, prologue_cost;
4152
4153  if (PURE_SLP_STMT (stmt_info))
4154    return;
4155
4156  /* loop cost for vec_loop.  */
4157  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4158				  stmt_info, 0, vect_body);
4159
4160  /* prologue cost for vec_init and vec_step.  */
4161  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4162				    stmt_info, 0, vect_prologue);
4163
4164  if (dump_enabled_p ())
4165    dump_printf_loc (MSG_NOTE, vect_location,
4166                     "vect_model_induction_cost: inside_cost = %d, "
4167                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
4168}
4169
4170
4171
4172/* Function get_initial_def_for_reduction
4173
4174   Input:
4175   STMT_VINFO - a stmt that performs a reduction operation in the loop.
4176   INIT_VAL - the initial value of the reduction variable
4177
4178   Output:
4179   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4180        of the reduction (used for adjusting the epilog - see below).
4181   Return a vector variable, initialized according to the operation that
4182	STMT_VINFO performs. This vector will be used as the initial value
4183	of the vector of partial results.
4184
4185   Option1 (adjust in epilog): Initialize the vector as follows:
4186     add/bit or/xor:    [0,0,...,0,0]
4187     mult/bit and:      [1,1,...,1,1]
4188     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4189   and when necessary (e.g. add/mult case) let the caller know
4190   that it needs to adjust the result by init_val.
4191
4192   Option2: Initialize the vector as follows:
4193     add/bit or/xor:    [init_val,0,0,...,0]
4194     mult/bit and:      [init_val,1,1,...,1]
4195     min/max/cond_expr: [init_val,init_val,...,init_val]
4196   and no adjustments are needed.
4197
4198   For example, for the following code:
4199
4200   s = init_val;
4201   for (i=0;i<n;i++)
4202     s = s + a[i];
4203
4204   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4205   For a vector of 4 units, we want to return either [0,0,0,init_val],
4206   or [0,0,0,0] and let the caller know that it needs to adjust
4207   the result at the end by 'init_val'.
4208
4209   FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4210   is not NULL, because this way the initialization vector is simpler (same
4211   element in all entries), and Option2 otherwise.
4212
4213   A cost model should help decide between these two schemes.  */
4214
4215static tree
4216get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
4217			       enum tree_code code, tree init_val,
4218                               tree *adjustment_def)
4219{
4220  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4221  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4222  tree scalar_type = TREE_TYPE (init_val);
4223  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4224  tree def_for_init;
4225  tree init_def;
4226  REAL_VALUE_TYPE real_init_val = dconst0;
4227  int int_init_val = 0;
4228  gimple_seq stmts = NULL;
4229
4230  gcc_assert (vectype);
4231
4232  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4233	      || SCALAR_FLOAT_TYPE_P (scalar_type));
4234
4235  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4236	      || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4237
4238  /* ADJUSTMENT_DEF is NULL when called from
4239     vect_create_epilog_for_reduction to vectorize a double reduction.  */
4240  if (adjustment_def)
4241    *adjustment_def = NULL;
4242
4243  switch (code)
4244    {
4245    case WIDEN_SUM_EXPR:
4246    case DOT_PROD_EXPR:
4247    case SAD_EXPR:
4248    case PLUS_EXPR:
4249    case MINUS_EXPR:
4250    case BIT_IOR_EXPR:
4251    case BIT_XOR_EXPR:
4252    case MULT_EXPR:
4253    case BIT_AND_EXPR:
4254      {
4255        if (code == MULT_EXPR)
4256          {
4257            real_init_val = dconst1;
4258            int_init_val = 1;
4259          }
4260
4261        if (code == BIT_AND_EXPR)
4262          int_init_val = -1;
4263
4264        if (SCALAR_FLOAT_TYPE_P (scalar_type))
4265          def_for_init = build_real (scalar_type, real_init_val);
4266        else
4267          def_for_init = build_int_cst (scalar_type, int_init_val);
4268
4269	if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4270	  {
4271	    /* Option1: the first element is '0' or '1' as well.  */
4272	    if (!operand_equal_p (def_for_init, init_val, 0))
4273	      *adjustment_def = init_val;
4274	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4275						     def_for_init);
4276	  }
4277	else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4278	  {
4279	    /* Option2 (variable length): the first element is INIT_VAL.  */
4280	    init_def = gimple_build_vector_from_val (&stmts, vectype,
4281						     def_for_init);
4282	    init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4283				     vectype, init_def, init_val);
4284	  }
4285	else
4286	  {
4287	    /* Option2: the first element is INIT_VAL.  */
4288	    tree_vector_builder elts (vectype, 1, 2);
4289	    elts.quick_push (init_val);
4290	    elts.quick_push (def_for_init);
4291	    init_def = gimple_build_vector (&stmts, &elts);
4292	  }
4293      }
4294      break;
4295
4296    case MIN_EXPR:
4297    case MAX_EXPR:
4298    case COND_EXPR:
4299      {
4300	init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4301	init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4302      }
4303      break;
4304
4305    default:
4306      gcc_unreachable ();
4307    }
4308
4309  if (stmts)
4310    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4311  return init_def;
4312}
4313
4314/* Get at the initial defs for the reduction PHIs in SLP_NODE.
4315   NUMBER_OF_VECTORS is the number of vector defs to create.
4316   If NEUTRAL_OP is nonnull, introducing extra elements of that
4317   value will not change the result.  */
4318
4319static void
4320get_initial_defs_for_reduction (slp_tree slp_node,
4321				vec<tree> *vec_oprnds,
4322				unsigned int number_of_vectors,
4323				bool reduc_chain, tree neutral_op)
4324{
4325  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4326  stmt_vec_info stmt_vinfo = stmts[0];
4327  vec_info *vinfo = stmt_vinfo->vinfo;
4328  unsigned HOST_WIDE_INT nunits;
4329  unsigned j, number_of_places_left_in_vector;
4330  tree vector_type;
4331  unsigned int group_size = stmts.length ();
4332  unsigned int i;
4333  class loop *loop;
4334
4335  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4336
4337  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4338
4339  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4340  gcc_assert (loop);
4341  edge pe = loop_preheader_edge (loop);
4342
4343  gcc_assert (!reduc_chain || neutral_op);
4344
4345  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4346     created vectors. It is greater than 1 if unrolling is performed.
4347
4348     For example, we have two scalar operands, s1 and s2 (e.g., group of
4349     strided accesses of size two), while NUNITS is four (i.e., four scalars
4350     of this type can be packed in a vector).  The output vector will contain
4351     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4352     will be 2).
4353
4354     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4355     vectors containing the operands.
4356
4357     For example, NUNITS is four as before, and the group size is 8
4358     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4359     {s5, s6, s7, s8}.  */
4360
4361  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4362    nunits = group_size;
4363
4364  number_of_places_left_in_vector = nunits;
4365  bool constant_p = true;
4366  tree_vector_builder elts (vector_type, nunits, 1);
4367  elts.quick_grow (nunits);
4368  gimple_seq ctor_seq = NULL;
4369  for (j = 0; j < nunits * number_of_vectors; ++j)
4370    {
4371      tree op;
4372      i = j % group_size;
4373      stmt_vinfo = stmts[i];
4374
4375      /* Get the def before the loop.  In a reduction chain we have only
4376	 one initial value.  Otherwise we have as many as PHIs in the group.  */
4377      if (reduc_chain)
4378	op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4379      else if (((vec_oprnds->length () + 1) * nunits
4380		- number_of_places_left_in_vector >= group_size)
4381	       && neutral_op)
4382	op = neutral_op;
4383      else
4384	op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4385
4386      /* Create 'vect_ = {op0,op1,...,opn}'.  */
4387      number_of_places_left_in_vector--;
4388      elts[nunits - number_of_places_left_in_vector - 1] = op;
4389      if (!CONSTANT_CLASS_P (op))
4390	constant_p = false;
4391
4392      if (number_of_places_left_in_vector == 0)
4393	{
4394	  tree init;
4395	  if (constant_p && !neutral_op
4396	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4397	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4398	    /* Build the vector directly from ELTS.  */
4399	    init = gimple_build_vector (&ctor_seq, &elts);
4400	  else if (neutral_op)
4401	    {
4402	      /* Build a vector of the neutral value and shift the
4403		 other elements into place.  */
4404	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4405						   neutral_op);
4406	      int k = nunits;
4407	      while (k > 0 && elts[k - 1] == neutral_op)
4408		k -= 1;
4409	      while (k > 0)
4410		{
4411		  k -= 1;
4412		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4413				       vector_type, init, elts[k]);
4414		}
4415	    }
4416	  else
4417	    {
4418	      /* First time round, duplicate ELTS to fill the
4419		 required number of vectors.  */
4420	      duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4421					number_of_vectors, *vec_oprnds);
4422	      break;
4423	    }
4424	  vec_oprnds->quick_push (init);
4425
4426	  number_of_places_left_in_vector = nunits;
4427	  elts.new_vector (vector_type, nunits, 1);
4428	  elts.quick_grow (nunits);
4429	  constant_p = true;
4430	}
4431    }
4432  if (ctor_seq != NULL)
4433    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4434}
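
/* For illustration (assumed sizes): for a non-chain SLP reduction group
   of two PLUS_EXPR reductions with initial values i1 and i2, NUNITS == 4
   and neutral element 0, the above builds the single initial vector
   { i1, i2, 0, 0 }: the first GROUP_SIZE lanes take the PHI preheader
   arguments and the remaining lanes take the neutral value.  */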
4435
4436/* For a statement STMT_INFO taking part in a reduction operation return
4437   the stmt_vec_info the meta information is stored on.  */
4438
4439stmt_vec_info
4440info_for_reduction (stmt_vec_info stmt_info)
4441{
4442  stmt_info = vect_orig_stmt (stmt_info);
4443  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4444  if (!is_a <gphi *> (stmt_info->stmt)
4445      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4446    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4447  gphi *phi = as_a <gphi *> (stmt_info->stmt);
4448  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4449    {
4450      if (gimple_phi_num_args (phi) == 1)
4451	stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4452    }
4453  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4454    {
4455      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4456      stmt_vec_info info
4457	  = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4458      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4459	stmt_info = info;
4460    }
4461  return stmt_info;
4462}
4463
4464/* Function vect_create_epilog_for_reduction
4465
4466   Create code at the loop-epilog to finalize the result of a reduction
4467   computation.
4468
4469   STMT_INFO is the scalar reduction stmt that is being vectorized.
4470   SLP_NODE is an SLP node containing a group of reduction statements. The
4471     first one in this group is STMT_INFO.
4472   SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4473   REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4474     (counting from 0)
4475
4476   This function:
4477   1. Completes the reduction def-use cycles.
4478   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4479      by calling the function specified by REDUC_FN if available, or by
4480      other means (whole-vector shifts or a scalar loop).
4481      The function also creates a new phi node at the loop exit to preserve
4482      loop-closed form, as illustrated below.
4483
4484     The flow at the entry to this function:
4485
4486        loop:
4487          vec_def = phi <vec_init, null>        # REDUCTION_PHI
4488          VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4489          s_loop = scalar_stmt                  # (scalar) STMT_INFO
4490        loop_exit:
4491          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4492          use <s_out0>
4493          use <s_out0>
4494
4495     The above is transformed by this function into:
4496
4497        loop:
4498          vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4499          VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
4500          s_loop = scalar_stmt                  # (scalar) STMT_INFO
4501        loop_exit:
4502          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4503          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4504          v_out2 = reduce <v_out1>
4505          s_out3 = extract_field <v_out2, 0>
4506          s_out4 = adjust_result <s_out3>
4507          use <s_out4>
4508          use <s_out4>
4509*/
4510
4511static void
4512vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4513				  slp_tree slp_node,
4514				  slp_instance slp_node_instance)
4515{
4516  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4517  gcc_assert (reduc_info->is_reduc_info);
4518  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4519  /* For double reductions we need to get at the inner loop reduction
4520     stmt which has the meta info attached.  Our stmt_info is that of the
4521     loop-closed PHI of the inner loop which we remember as
4522     def for the reduction PHI generation.  */
4523  bool double_reduc = false;
4524  stmt_vec_info rdef_info = stmt_info;
4525  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4526    {
4527      gcc_assert (!slp_node);
4528      double_reduc = true;
4529      stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4530					    (stmt_info->stmt, 0));
4531      stmt_info = vect_stmt_to_vectorize (stmt_info);
4532    }
4533  gphi *reduc_def_stmt
4534    = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4535  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4536  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4537  stmt_vec_info prev_phi_info;
4538  tree vectype;
4539  machine_mode mode;
4540  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4541  basic_block exit_bb;
4542  tree scalar_dest;
4543  tree scalar_type;
4544  gimple *new_phi = NULL, *phi;
4545  stmt_vec_info phi_info;
4546  gimple_stmt_iterator exit_gsi;
4547  tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4548  gimple *epilog_stmt = NULL;
4549  gimple *exit_phi;
4550  tree bitsize;
4551  tree def;
4552  tree orig_name, scalar_result;
4553  imm_use_iterator imm_iter, phi_imm_iter;
4554  use_operand_p use_p, phi_use_p;
4555  gimple *use_stmt;
4556  bool nested_in_vect_loop = false;
4557  auto_vec<gimple *> new_phis;
4558  int j, i;
4559  auto_vec<tree> scalar_results;
4560  unsigned int group_size = 1, k;
4561  auto_vec<gimple *> phis;
4562  bool slp_reduc = false;
4563  bool direct_slp_reduc;
4564  tree new_phi_result;
4565  tree induction_index = NULL_TREE;
4566
4567  if (slp_node)
4568    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4569
4570  if (nested_in_vect_loop_p (loop, stmt_info))
4571    {
4572      outer_loop = loop;
4573      loop = loop->inner;
4574      nested_in_vect_loop = true;
4575      gcc_assert (!slp_node);
4576    }
4577  gcc_assert (!nested_in_vect_loop || double_reduc);
4578
4579  vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4580  gcc_assert (vectype);
4581  mode = TYPE_MODE (vectype);
4582
4583  tree initial_def = NULL;
4584  tree induc_val = NULL_TREE;
4585  tree adjustment_def = NULL;
4586  if (slp_node)
4587    ;
4588  else
4589    {
4590      /* Get at the scalar def before the loop, that defines the initial value
4591	 of the reduction variable.  */
4592      initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4593					   loop_preheader_edge (loop));
4594      /* Optimize: for induction condition reduction, if we can't use zero
4595         for induc_val, use initial_def.  */
4596      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4597	induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4598      else if (double_reduc)
4599	;
4600      else if (nested_in_vect_loop)
4601	;
4602      else
4603	adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4604    }
4605
4606  unsigned vec_num;
4607  int ncopies;
4608  if (slp_node)
4609    {
4610      vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4611      ncopies = 1;
4612    }
4613  else
4614    {
4615      vec_num = 1;
4616      ncopies = 0;
4617      phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4618      do
4619	{
4620	  ncopies++;
4621	  phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4622	}
4623      while (phi_info);
4624    }
4625
4626  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4627     which is updated with the current index of the loop for every match of
4628     the original loop's cond_expr (VEC_STMT).  This results in a vector
4629     containing, for each vector lane, the index of the last iteration in
4630     which the condition passed.  Indexes start at 1 so that 0 can be used
4631     for lanes that never matched.  If there are no matches at all then the
4632     vector will be all zeroes.
4633
4634     PR92772: This algorithm is broken for architectures that support
4635     masked vectors, but do not provide fold_extract_last.  */
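
  /* Illustrative example only: for a loop like

	 for (i = 0; i < N; i++)
	   if (a[i] < limit)
	     last = a[i];

     and a 4-lane vector, the index vector might end up as {0, 6, 11, 0}:
     lanes 0 and 3 never matched, while lanes 1 and 2 last matched at the
     (1-based) element positions 6 and 11.  The epilogue below then selects
     the data value whose recorded index is the maximum.  */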
4636  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4637    {
4638      auto_vec<std::pair<tree, bool>, 2> ccompares;
4639      stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4640      cond_info = vect_stmt_to_vectorize (cond_info);
4641      while (cond_info != reduc_info)
4642	{
4643	  if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4644	    {
4645	      gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4646	      gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4647	      ccompares.safe_push
4648		(std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4649				 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4650	    }
4651	  cond_info
4652	    = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4653						 1 + STMT_VINFO_REDUC_IDX
4654							(cond_info)));
4655	  cond_info = vect_stmt_to_vectorize (cond_info);
4656	}
4657      gcc_assert (ccompares.length () != 0);
4658
4659      tree indx_before_incr, indx_after_incr;
4660      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4661      int scalar_precision
4662	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4663      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4664      tree cr_index_vector_type = get_related_vectype_for_scalar_type
4665	(TYPE_MODE (vectype), cr_index_scalar_type,
4666	 TYPE_VECTOR_SUBPARTS (vectype));
4667
4668      /* First we create a simple vector induction variable which starts
4669	 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4670	 vector size (STEP).  */
4671
4672      /* Create a {1,2,3,...} vector.  */
4673      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4674
4675      /* Create a vector of the step value.  */
4676      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4677      tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4678
4679      /* Create an induction variable.  */
4680      gimple_stmt_iterator incr_gsi;
4681      bool insert_after;
4682      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4683      create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4684		 insert_after, &indx_before_incr, &indx_after_incr);
4685
4686      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4687	 filled with zeros (VEC_ZERO).  */
4688
4689      /* Create a vector of 0s.  */
4690      tree zero = build_zero_cst (cr_index_scalar_type);
4691      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4692
4693      /* Create a vector phi node.  */
4694      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4695      new_phi = create_phi_node (new_phi_tree, loop->header);
4696      loop_vinfo->add_stmt (new_phi);
4697      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4698		   loop_preheader_edge (loop), UNKNOWN_LOCATION);
4699
4700      /* Now take the condition from the loop's original cond_exprs
4701	 and produce a new cond_expr (INDEX_COND_EXPR) which for
4702	 every match uses values from the induction variable
4703	 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
4704	 (NEW_PHI_TREE).
4705	 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4706	 the new cond_expr (INDEX_COND_EXPR).  */
4707      gimple_seq stmts = NULL;
4708      for (int i = ccompares.length () - 1; i != -1; --i)
4709	{
4710	  tree ccompare = ccompares[i].first;
4711	  if (ccompares[i].second)
4712	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4713					 cr_index_vector_type,
4714					 ccompare,
4715					 indx_before_incr, new_phi_tree);
4716	  else
4717	    new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4718					 cr_index_vector_type,
4719					 ccompare,
4720					 new_phi_tree, indx_before_incr);
4721	}
4722      gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4723      stmt_vec_info index_vec_info
4724	= loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4725      STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4726
4727      /* Update the phi with the vec cond.  */
4728      induction_index = new_phi_tree;
4729      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4730		   loop_latch_edge (loop), UNKNOWN_LOCATION);
4731    }
4732
4733  /* 2. Create epilog code.
4734        The reduction epilog code operates across the elements of the vector
4735        of partial results computed by the vectorized loop.
4736        The reduction epilog code consists of:
4737
4738        step 1: compute the scalar result in a vector (v_out2)
4739        step 2: extract the scalar result (s_out3) from the vector (v_out2)
4740        step 3: adjust the scalar result (s_out3) if needed.
4741
4742        Step 1 can be accomplished using one of the following three schemes:
4743          (scheme 1) using reduc_fn, if available.
4744          (scheme 2) using whole-vector shifts, if available.
4745          (scheme 3) using a scalar loop. In this case steps 1+2 above are
4746                     combined.
4747
4748          The overall epilog code looks like this:
4749
4750          s_out0 = phi <s_loop>         # original EXIT_PHI
4751          v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4752          v_out2 = reduce <v_out1>              # step 1
4753          s_out3 = extract_field <v_out2, 0>    # step 2
4754          s_out4 = adjust_result <s_out3>       # step 3
4755
4756          (step 3 is optional, and steps 1 and 2 may be combined).
4757          Lastly, the uses of s_out0 are replaced by s_out4.  */
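
  /* Illustrative sketch only, for a PLUS reduction over a 4-element
     vector v = {a0, a1, a2, a3} (x = don't care):

       scheme 1:  s = IFN_REDUC_PLUS (v);
       scheme 2:  t = v + vec_shift <v, 2>;	   // {a0+a2, a1+a3, x, x}
		  t = t + vec_shift <t, 1>;	   // {a0+a1+a2+a3, x, x, x}
		  s = extract_field <t, 0>;
       scheme 3:  s = v[0] + v[1] + v[2] + v[3];   // scalar code  */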
4758
4759
4760  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4761         v_out1 = phi <VECT_DEF>
4762         Store them in NEW_PHIS.  */
4763  if (double_reduc)
4764    loop = outer_loop;
4765  exit_bb = single_exit (loop)->dest;
4766  prev_phi_info = NULL;
4767  new_phis.create (slp_node ? vec_num : ncopies);
4768  for (unsigned i = 0; i < vec_num; i++)
4769    {
4770      if (slp_node)
4771	def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4772      else
4773	def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4774      for (j = 0; j < ncopies; j++)
4775        {
4776	  tree new_def = copy_ssa_name (def);
4777          phi = create_phi_node (new_def, exit_bb);
4778	  stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4779          if (j == 0)
4780            new_phis.quick_push (phi);
4781          else
4782	    {
4783	      def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4784	      STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4785	    }
4786
4787          SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4788	  prev_phi_info = phi_info;
4789        }
4790    }
4791
4792  exit_gsi = gsi_after_labels (exit_bb);
4793
4794  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4795         (i.e. when reduc_fn is not available) and in the final adjustment
4796	 code (if needed).  Also get the original scalar reduction variable as
4797         defined in the loop.  In case STMT_INFO is a "pattern-stmt" (i.e. it
4798         represents a reduction pattern), the tree-code and scalar-def are
4799         taken from the original stmt that the pattern-stmt (STMT_INFO)
4800         replaces.  Otherwise (it is a regular reduction) the tree-code and
4801         scalar-def are taken from STMT_INFO.  */
4802
4803  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4804  if (orig_stmt_info != stmt_info)
4805    {
4806      /* Reduction pattern  */
4807      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4808      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4809    }
4810
4811  scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4812  scalar_type = TREE_TYPE (scalar_dest);
4813  scalar_results.create (group_size);
4814  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4815  bitsize = TYPE_SIZE (scalar_type);
4816
4817  /* SLP reduction without reduction chain, e.g.,
4818     # a1 = phi <a2, a0>
4819     # b1 = phi <b2, b0>
4820     a2 = operation (a1)
4821     b2 = operation (b1)  */
4822  slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4823
4824  /* True if we should implement SLP_REDUC using native reduction operations
4825     instead of scalar operations.  */
4826  direct_slp_reduc = (reduc_fn != IFN_LAST
4827		      && slp_reduc
4828		      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4829
4830  /* In case of reduction chain, e.g.,
4831     # a1 = phi <a3, a0>
4832     a2 = operation (a1)
4833     a3 = operation (a2),
4834
4835     we may end up with more than one vector result.  Here we reduce them to
4836     one vector.  */
4837  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4838    {
4839      gimple_seq stmts = NULL;
4840      tree first_vect = PHI_RESULT (new_phis[0]);
4841      first_vect = gimple_convert (&stmts, vectype, first_vect);
4842      for (k = 1; k < new_phis.length (); k++)
4843        {
4844	  gimple *next_phi = new_phis[k];
4845          tree second_vect = PHI_RESULT (next_phi);
4846	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4847          first_vect = gimple_build (&stmts, code, vectype,
4848				     first_vect, second_vect);
4849        }
4850      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4851
4852      new_phi_result = first_vect;
4853      new_phis.truncate (0);
4854      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4855    }
4856  /* Likewise if we couldn't use a single def-use cycle.  */
4857  else if (ncopies > 1)
4858    {
4859      gcc_assert (new_phis.length () == 1);
4860      gimple_seq stmts = NULL;
4861      tree first_vect = PHI_RESULT (new_phis[0]);
4862      first_vect = gimple_convert (&stmts, vectype, first_vect);
4863      stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4864      for (int k = 1; k < ncopies; ++k)
4865	{
4866	  next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4867	  tree second_vect = PHI_RESULT (next_phi_info->stmt);
4868	  second_vect = gimple_convert (&stmts, vectype, second_vect);
4869	  first_vect = gimple_build (&stmts, code, vectype,
4870				     first_vect, second_vect);
4871	}
4872      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4873      new_phi_result = first_vect;
4874      new_phis.truncate (0);
4875      new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4876    }
4877  else
4878    new_phi_result = PHI_RESULT (new_phis[0]);
4879
4880  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4881      && reduc_fn != IFN_LAST)
4882    {
4883      /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4884	 various data values where the condition matched and another vector
4885	 (INDUCTION_INDEX) containing all the indexes of those matches.  We
4886	 need to extract the last matching index (which will be the index with
4887	 the highest value) and use this to index into the data vector.
4888	 For the case where there were no matches, the data vector will contain
4889	 all default values and the index vector will be all zeros.  */
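
      /* Illustrative example only: with NEW_PHI_RESULT = {d0, d1, d2, d3}
	 and INDUCTION_INDEX = {0, 6, 11, 0}, IFN_REDUC_MAX over the indexes
	 yields 11, the comparison keeps only lane 2 (all other lanes become
	 zero) and the unsigned MAX reduction below yields d2.  If no lane
	 ever matched, all indexes are 0, every lane compares equal to the
	 maximum and every lane holds the default value, so the result is
	 still correct.  */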
4890
4891      /* Get various versions of the type of the vector of indexes.  */
4892      tree index_vec_type = TREE_TYPE (induction_index);
4893      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4894      tree index_scalar_type = TREE_TYPE (index_vec_type);
4895      tree index_vec_cmp_type = truth_type_for (index_vec_type);
4896
4897      /* Get an unsigned integer version of the type of the data vector.  */
4898      int scalar_precision
4899	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4900      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4901      tree vectype_unsigned = build_vector_type
4902	(scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4903
4904      /* First we need to create a vector (ZERO_VEC) of zeros and another
4905	 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4906	 can create using a MAX reduction and then expanding.
4907	 In the case where the loop never made any matches, the max index will
4908	 be zero.  */
4909
4910      /* Vector of {0, 0, 0,...}.  */
4911      tree zero_vec = build_zero_cst (vectype);
4912
4913      gimple_seq stmts = NULL;
4914      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4915      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916
4917      /* Find maximum value from the vector of found indexes.  */
4918      tree max_index = make_ssa_name (index_scalar_type);
4919      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4920							  1, induction_index);
4921      gimple_call_set_lhs (max_index_stmt, max_index);
4922      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4923
4924      /* Vector of {max_index, max_index, max_index,...}.  */
4925      tree max_index_vec = make_ssa_name (index_vec_type);
4926      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4927						      max_index);
4928      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4929							max_index_vec_rhs);
4930      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4931
4932      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4933	 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4934	 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4935	 otherwise.  Only one value should match, resulting in a vector
4936	 (VEC_COND) with one data value and the rest zeros.
4937	 In the case where the loop never made any matches, every index will
4938	 match, resulting in a vector with all data values (which will all be
4939	 the default value).  */
4940
4941      /* Compare the max index vector to the vector of found indexes to find
4942	 the position of the max value.  */
4943      tree vec_compare = make_ssa_name (index_vec_cmp_type);
4944      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4945						      induction_index,
4946						      max_index_vec);
4947      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4948
4949      /* Use the compare to choose either values from the data vector or
4950	 zero.  */
4951      tree vec_cond = make_ssa_name (vectype);
4952      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4953						   vec_compare, new_phi_result,
4954						   zero_vec);
4955      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4956
4957      /* Finally we need to extract the data value from the vector (VEC_COND)
	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4959	 reduction, but because this doesn't exist, we can use a MAX reduction
4960	 instead.  The data value might be signed or a float so we need to cast
4961	 it first.
4962	 In the case where the loop never made any matches, the data values are
4963	 all identical, and so will reduce down correctly.  */
4964
4965      /* Make the matched data values unsigned.  */
4966      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4967      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4968				       vec_cond);
4969      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4970							VIEW_CONVERT_EXPR,
4971							vec_cond_cast_rhs);
4972      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4973
4974      /* Reduce down to a scalar value.  */
4975      tree data_reduc = make_ssa_name (scalar_type_unsigned);
4976      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4977							   1, vec_cond_cast);
4978      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4979      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4980
4981      /* Convert the reduced value back to the result type and set as the
4982	 result.  */
4983      stmts = NULL;
4984      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4985			       data_reduc);
4986      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4987      scalar_results.safe_push (new_temp);
4988    }
4989  else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4990	   && reduc_fn == IFN_LAST)
4991    {
4992      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4993	 the equivalent of
4994	 idx_val = induction_index[0];
4995	 val = data_reduc[0];
4996	 for (i = 1; i < nelts; ++i)
4997	   if (induction_index[i] > idx_val)
4998	     val = data_reduc[i], idx_val = induction_index[i];
4999	 return val;  */
5000
5001      tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5002      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5003      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5004      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5005      /* Enforced by vectorizable_reduction, which ensures we have target
5006	 support before allowing a conditional reduction on variable-length
5007	 vectors.  */
5008      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5009      tree idx_val = NULL_TREE, val = NULL_TREE;
5010      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5011	{
5012	  tree old_idx_val = idx_val;
5013	  tree old_val = val;
5014	  idx_val = make_ssa_name (idx_eltype);
5015	  epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5016					     build3 (BIT_FIELD_REF, idx_eltype,
5017						     induction_index,
5018						     bitsize_int (el_size),
5019						     bitsize_int (off)));
5020	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5021	  val = make_ssa_name (data_eltype);
5022	  epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5023					     build3 (BIT_FIELD_REF,
5024						     data_eltype,
5025						     new_phi_result,
5026						     bitsize_int (el_size),
5027						     bitsize_int (off)));
5028	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5029	  if (off != 0)
5030	    {
5031	      tree new_idx_val = idx_val;
5032	      if (off != v_size - el_size)
5033		{
5034		  new_idx_val = make_ssa_name (idx_eltype);
5035		  epilog_stmt = gimple_build_assign (new_idx_val,
5036						     MAX_EXPR, idx_val,
5037						     old_idx_val);
5038		  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039		}
5040	      tree new_val = make_ssa_name (data_eltype);
5041	      epilog_stmt = gimple_build_assign (new_val,
5042						 COND_EXPR,
5043						 build2 (GT_EXPR,
5044							 boolean_type_node,
5045							 idx_val,
5046							 old_idx_val),
5047						 val, old_val);
5048	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5049	      idx_val = new_idx_val;
5050	      val = new_val;
5051	    }
5052	}
5053      /* Convert the reduced value back to the result type and set as the
5054	 result.  */
5055      gimple_seq stmts = NULL;
5056      val = gimple_convert (&stmts, scalar_type, val);
5057      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5058      scalar_results.safe_push (val);
5059    }
5060
5061  /* 2.3 Create the reduction code, using one of the three schemes described
5062         above.  In SLP we simply need to extract all the elements from the
5063         vector (without reducing them), so we just use scalar extracts.  */
5064  else if (reduc_fn != IFN_LAST && !slp_reduc)
5065    {
5066      tree tmp;
5067      tree vec_elem_type;
5068
5069      /* Case 1:  Create:
5070         v_out2 = reduc_expr <v_out1>  */
5071
5072      if (dump_enabled_p ())
5073        dump_printf_loc (MSG_NOTE, vect_location,
5074			 "Reduce using direct vector reduction.\n");
5075
5076      gimple_seq stmts = NULL;
5077      new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5078      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5079      new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5080			       vec_elem_type, new_phi_result);
5081      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5082      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5083
5084      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5085	  && induc_val)
5086	{
5087	  /* Earlier we set the initial value to be a vector of induc_val
5088	     values.  Check the result and if it is induc_val then replace
5089	     with the original initial value, unless induc_val is
5090	     the same as initial_def already.  */
5091	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5092				  induc_val);
5093
5094	  tmp = make_ssa_name (new_scalar_dest);
5095	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5096					     initial_def, new_temp);
5097	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098	  new_temp = tmp;
5099	}
5100
5101      scalar_results.safe_push (new_temp);
5102    }
5103  else if (direct_slp_reduc)
5104    {
5105      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5106	 with the elements for other SLP statements replaced with the
5107	 neutral value.  We can then do a normal reduction on each vector.  */
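
      /* Illustrative example only: with REDUC_GROUP_SIZE == 2 and a vector
	 of partial results {a0, b0, a1, b1}, the loop below builds
	   {a0, neutral, a1, neutral} and {neutral, b0, neutral, b1}
	 and applies REDUC_FN to each, giving the two scalar results.  */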
5108
5109      /* Enforced by vectorizable_reduction.  */
5110      gcc_assert (new_phis.length () == 1);
5111      gcc_assert (pow2p_hwi (group_size));
5112
5113      slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5114      vec<stmt_vec_info> orig_phis
5115	= SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5116      gimple_seq seq = NULL;
5117
5118      /* Build a vector {0, 1, 2, ...}, with the same number of elements
5119	 and the same element size as VECTYPE.  */
5120      tree index = build_index_vector (vectype, 0, 1);
5121      tree index_type = TREE_TYPE (index);
5122      tree index_elt_type = TREE_TYPE (index_type);
5123      tree mask_type = truth_type_for (index_type);
5124
5125      /* Create a vector that, for each element, identifies which of
5126	 the REDUC_GROUP_SIZE results should use it.  */
5127      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5128      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5129			    build_vector_from_val (index_type, index_mask));
5130
5131      /* Get a neutral vector value.  This is simply a splat of the neutral
5132	 scalar value if we have one, otherwise the initial scalar value
5133	 is itself a neutral value.  */
5134      tree vector_identity = NULL_TREE;
5135      tree neutral_op = NULL_TREE;
5136      if (slp_node)
5137	{
5138	  stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5139	  neutral_op
5140	    = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5141					    vectype, code, first != NULL);
5142	}
5143      if (neutral_op)
5144	vector_identity = gimple_build_vector_from_val (&seq, vectype,
5145							neutral_op);
5146      for (unsigned int i = 0; i < group_size; ++i)
5147	{
5148	  /* If there's no universal neutral value, we can use the
5149	     initial scalar value from the original PHI.  This is used
5150	     for MIN and MAX reductions, for example.  */
5151	  if (!neutral_op)
5152	    {
5153	      tree scalar_value
5154		= PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5155					 loop_preheader_edge (loop));
5156	      scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5157					     scalar_value);
5158	      vector_identity = gimple_build_vector_from_val (&seq, vectype,
5159							      scalar_value);
5160	    }
5161
5162	  /* Calculate the equivalent of:
5163
5164	     sel[j] = (index[j] == i);
5165
5166	     which selects the elements of NEW_PHI_RESULT that should
5167	     be included in the result.  */
5168	  tree compare_val = build_int_cst (index_elt_type, i);
5169	  compare_val = build_vector_from_val (index_type, compare_val);
5170	  tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5171				   index, compare_val);
5172
5173	  /* Calculate the equivalent of:
5174
5175	     vec = seq ? new_phi_result : vector_identity;
5176
5177	     VEC is now suitable for a full vector reduction.  */
5178	  tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5179				   sel, new_phi_result, vector_identity);
5180
5181	  /* Do the reduction and convert it to the appropriate type.  */
5182	  tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5183				      TREE_TYPE (vectype), vec);
5184	  scalar = gimple_convert (&seq, scalar_type, scalar);
5185	  scalar_results.safe_push (scalar);
5186	}
5187      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5188    }
5189  else
5190    {
5191      bool reduce_with_shift;
5192      tree vec_temp;
5193
5194      gcc_assert (slp_reduc || new_phis.length () == 1);
5195
5196      /* See if the target wants to do the final (shift) reduction
5197	 in a vector mode of smaller size and first reduce upper/lower
5198	 halves against each other.  */
5199      enum machine_mode mode1 = mode;
5200      tree stype = TREE_TYPE (vectype);
5201      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5202      unsigned nunits1 = nunits;
5203      if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5204	  && new_phis.length () == 1)
5205	{
5206	  nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5207	  /* For SLP reductions we have to make sure lanes match up, but
5208	     since we're doing an individual-element final reduction,
5209	     reducing the vector width here is even more important.
5210	     ???  We can also separate lanes with permutes; for the common
5211	     case of a power-of-two group-size, odd/even extracts would work.  */
5212	  if (slp_reduc && nunits != nunits1)
5213	    {
5214	      nunits1 = least_common_multiple (nunits1, group_size);
5215	      gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5216	    }
5217	}
5218      if (!slp_reduc
5219	  && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5220	nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5221
5222      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5223							   stype, nunits1);
5224      reduce_with_shift = have_whole_vector_shift (mode1);
5225      if (!VECTOR_MODE_P (mode1))
5226	reduce_with_shift = false;
5227      else
5228	{
5229	  optab optab = optab_for_tree_code (code, vectype1, optab_default);
5230	  if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5231	    reduce_with_shift = false;
5232	}
5233
5234      /* First reduce the vector to the desired vector size we should
5235	 do shift reduction on by combining upper and lower halves.  */
5236      new_temp = new_phi_result;
5237      while (nunits > nunits1)
5238	{
5239	  nunits /= 2;
5240	  vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5241							  stype, nunits);
5242	  unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5243
5244	  /* The target has to make sure we support lowpart/highpart
5245	     extraction, either via direct vector extract or through
5246	     punning to an integer-mode vector.  */
5247	  tree dst1, dst2;
5248	  if (convert_optab_handler (vec_extract_optab,
5249				     TYPE_MODE (TREE_TYPE (new_temp)),
5250				     TYPE_MODE (vectype1))
5251	      != CODE_FOR_nothing)
5252	    {
5253	      /* Extract sub-vectors directly once vec_extract becomes
5254		 a conversion optab.  */
5255	      dst1 = make_ssa_name (vectype1);
5256	      epilog_stmt
5257		  = gimple_build_assign (dst1, BIT_FIELD_REF,
5258					 build3 (BIT_FIELD_REF, vectype1,
5259						 new_temp, TYPE_SIZE (vectype1),
5260						 bitsize_int (0)));
5261	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5262	      dst2 = make_ssa_name (vectype1);
5263	      epilog_stmt
5264		  = gimple_build_assign (dst2, BIT_FIELD_REF,
5265					 build3 (BIT_FIELD_REF, vectype1,
5266						 new_temp, TYPE_SIZE (vectype1),
5267						 bitsize_int (bitsize)));
5268	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5269	    }
5270	  else
5271	    {
5272	      /* Extract via punning to an appropriately sized integer-mode
5273		 vector.  */
5274	      tree eltype = build_nonstandard_integer_type (bitsize, 1);
5275	      tree etype = build_vector_type (eltype, 2);
5276	      gcc_assert (convert_optab_handler (vec_extract_optab,
5277						 TYPE_MODE (etype),
5278						 TYPE_MODE (eltype))
5279			  != CODE_FOR_nothing);
5280	      tree tem = make_ssa_name (etype);
5281	      epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5282						 build1 (VIEW_CONVERT_EXPR,
5283							 etype, new_temp));
5284	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285	      new_temp = tem;
5286	      tem = make_ssa_name (eltype);
5287	      epilog_stmt
5288		  = gimple_build_assign (tem, BIT_FIELD_REF,
5289					 build3 (BIT_FIELD_REF, eltype,
5290						 new_temp, TYPE_SIZE (eltype),
5291						 bitsize_int (0)));
5292	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293	      dst1 = make_ssa_name (vectype1);
5294	      epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5295						 build1 (VIEW_CONVERT_EXPR,
5296							 vectype1, tem));
5297	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298	      tem = make_ssa_name (eltype);
5299	      epilog_stmt
5300		  = gimple_build_assign (tem, BIT_FIELD_REF,
5301					 build3 (BIT_FIELD_REF, eltype,
5302						 new_temp, TYPE_SIZE (eltype),
5303						 bitsize_int (bitsize)));
5304	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305	      dst2 = make_ssa_name (vectype1);
5306	      epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5307						 build1 (VIEW_CONVERT_EXPR,
5308							 vectype1, tem));
5309	      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310	    }
5311
5312	  new_temp = make_ssa_name (vectype1);
5313	  epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5314	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315	  new_phis[0] = epilog_stmt;
5316	}
5317
5318      if (reduce_with_shift && !slp_reduc)
5319	{
5320	  int element_bitsize = tree_to_uhwi (bitsize);
5321	  /* Enforced by vectorizable_reduction, which disallows SLP reductions
5322	     for variable-length vectors and also requires direct target support
5323	     for loop reductions.  */
5324	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5325	  int nelements = vec_size_in_bits / element_bitsize;
5326	  vec_perm_builder sel;
5327	  vec_perm_indices indices;
5328
5329          int elt_offset;
5330
5331          tree zero_vec = build_zero_cst (vectype1);
5332          /* Case 2: Create:
5333             for (offset = nelements/2; offset >= 1; offset/=2)
5334                {
5335                  Create:  va' = vec_shift <va, offset>
5336                  Create:  va = vop <va, va'>
5337                }  */
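
          /* Illustrative example only, for nelements == 4 and a PLUS
             reduction of va = {a0, a1, a2, a3} (x = don't care):
               va' = vec_shift <va, 2>  = {a2, a3, 0, 0}
               va  = va + va'           = {a0+a2, a1+a3, x, x}
               va' = vec_shift <va, 1>  = {a1+a3, x, 0, 0}
               va  = va + va'           = {a0+a1+a2+a3, x, x, x}
             and the scalar result is extracted from element 0 below.  */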
5338
5339          tree rhs;
5340
5341          if (dump_enabled_p ())
5342            dump_printf_loc (MSG_NOTE, vect_location,
5343			     "Reduce using vector shifts\n");
5344
5345	  gimple_seq stmts = NULL;
5346	  new_temp = gimple_convert (&stmts, vectype1, new_temp);
5347          for (elt_offset = nelements / 2;
5348               elt_offset >= 1;
5349               elt_offset /= 2)
5350            {
5351	      calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5352	      indices.new_vector (sel, 2, nelements);
5353	      tree mask = vect_gen_perm_mask_any (vectype1, indices);
5354	      new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5355				       new_temp, zero_vec, mask);
5356	      new_temp = gimple_build (&stmts, code,
5357				       vectype1, new_name, new_temp);
5358            }
5359	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5360
5361	  /* 2.4  Extract the final scalar result.  Create:
5362	     s_out3 = extract_field <v_out2, bitpos>  */
5363
5364	  if (dump_enabled_p ())
5365	    dump_printf_loc (MSG_NOTE, vect_location,
5366			     "extract scalar result\n");
5367
5368	  rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5369			bitsize, bitsize_zero_node);
5370	  epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5371	  new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5372	  gimple_assign_set_lhs (epilog_stmt, new_temp);
5373	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5374	  scalar_results.safe_push (new_temp);
5375        }
5376      else
5377        {
5378          /* Case 3: Create:
5379             s = extract_field <v_out2, 0>
5380             for (offset = element_size;
5381                  offset < vector_size;
5382                  offset += element_size;)
5383               {
5384                 Create:  s' = extract_field <v_out2, offset>
5385                 Create:  s = op <s, s'>  // For non SLP cases
5386               }  */
5387
5388          if (dump_enabled_p ())
5389            dump_printf_loc (MSG_NOTE, vect_location,
5390			     "Reduce using scalar code.\n");
5391
5392	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5393	  int element_bitsize = tree_to_uhwi (bitsize);
5394	  tree compute_type = TREE_TYPE (vectype);
5395	  gimple_seq stmts = NULL;
5396          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5397            {
5398              int bit_offset;
5399              if (gimple_code (new_phi) == GIMPLE_PHI)
5400                vec_temp = PHI_RESULT (new_phi);
5401              else
5402                vec_temp = gimple_assign_lhs (new_phi);
5403	      new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5404				       vec_temp, bitsize, bitsize_zero_node);
5405
5406              /* In SLP we don't need to apply the reduction operation, so we
5407                 just collect s' values in SCALAR_RESULTS.  */
5408              if (slp_reduc)
5409                scalar_results.safe_push (new_temp);
5410
5411              for (bit_offset = element_bitsize;
5412                   bit_offset < vec_size_in_bits;
5413                   bit_offset += element_bitsize)
5414                {
5415                  tree bitpos = bitsize_int (bit_offset);
5416		  new_name = gimple_build (&stmts, BIT_FIELD_REF,
5417					   compute_type, vec_temp,
5418					   bitsize, bitpos);
5419                  if (slp_reduc)
5420                    {
5421                      /* In SLP we don't need to apply the reduction operation,
5422                         so we just collect s' values in SCALAR_RESULTS.  */
5423                      new_temp = new_name;
5424                      scalar_results.safe_push (new_name);
5425                    }
5426                  else
5427		    new_temp = gimple_build (&stmts, code, compute_type,
5428					     new_name, new_temp);
5429                }
5430            }
5431
5432          /* The only case where we need to reduce scalar results in SLP is
5433             unrolling.  If the size of SCALAR_RESULTS is greater than
5434             REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5435             REDUC_GROUP_SIZE.  */
5436          if (slp_reduc)
5437            {
5438              tree res, first_res, new_res;
5439
5440              /* Reduce multiple scalar results in case of SLP unrolling.  */
5441              for (j = group_size; scalar_results.iterate (j, &res);
5442                   j++)
5443                {
5444                  first_res = scalar_results[j % group_size];
5445		  new_res = gimple_build (&stmts, code, compute_type,
5446					  first_res, res);
5447                  scalar_results[j % group_size] = new_res;
5448                }
5449	      for (k = 0; k < group_size; k++)
5450		scalar_results[k] = gimple_convert (&stmts, scalar_type,
5451						    scalar_results[k]);
5452            }
5453          else
5454	    {
5455	      /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5456	      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5457	      scalar_results.safe_push (new_temp);
5458	    }
5459
5460	  gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5461        }
5462
5463      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5464	  && induc_val)
5465	{
5466	  /* Earlier we set the initial value to be a vector of induc_val
5467	     values.  Check the result and if it is induc_val then replace
5468	     with the original initial value, unless induc_val is
5469	     the same as initial_def already.  */
5470	  tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5471				  induc_val);
5472
5473	  tree tmp = make_ssa_name (new_scalar_dest);
5474	  epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5475					     initial_def, new_temp);
5476	  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5477	  scalar_results[0] = tmp;
5478	}
5479    }
5480
5481  /* 2.5 Adjust the final result by the initial value of the reduction
5482	 variable.  (When such adjustment is not needed, ADJUSTMENT_DEF is
5483	 NULL.)  For example, if code is PLUS we create:
5484	 new_temp = loop_exit_def + adjustment_def  */
5485
5486  if (adjustment_def)
5487    {
5488      gcc_assert (!slp_reduc);
5489      gimple_seq stmts = NULL;
5490      if (nested_in_vect_loop)
5491	{
5492          new_phi = new_phis[0];
5493	  gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5494	  adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5495	  new_temp = gimple_build (&stmts, code, vectype,
5496				   PHI_RESULT (new_phi), adjustment_def);
5497	}
5498      else
5499	{
5500          new_temp = scalar_results[0];
5501	  gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5502	  adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5503	  new_temp = gimple_build (&stmts, code, scalar_type,
5504				   new_temp, adjustment_def);
5505	}
5506
5507      epilog_stmt = gimple_seq_last_stmt (stmts);
5508      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5509      if (nested_in_vect_loop)
5510        {
5511	  stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5512	  STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5513	    = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5514
5515          if (!double_reduc)
5516            scalar_results.quick_push (new_temp);
5517          else
5518            scalar_results[0] = new_temp;
5519        }
5520      else
5521        scalar_results[0] = new_temp;
5522
5523      new_phis[0] = epilog_stmt;
5524    }
5525
5526  if (double_reduc)
5527    loop = loop->inner;
5528
5529  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5530          phis with new adjusted scalar results, i.e., replace use <s_out0>
5531          with use <s_out4>.
5532
5533     Transform:
5534        loop_exit:
5535          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5536          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5537          v_out2 = reduce <v_out1>
5538          s_out3 = extract_field <v_out2, 0>
5539          s_out4 = adjust_result <s_out3>
5540          use <s_out0>
5541          use <s_out0>
5542
5543     into:
5544
5545        loop_exit:
5546          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5547          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5548          v_out2 = reduce <v_out1>
5549          s_out3 = extract_field <v_out2, 0>
5550          s_out4 = adjust_result <s_out3>
5551          use <s_out4>
5552          use <s_out4> */
5553
5554
5555  /* In an SLP reduction chain we reduce vector results into one vector if
5556     necessary; hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5557     LHS of the last stmt in the reduction chain, since we are looking for
5558     the loop exit phi node.  */
5559  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5560    {
5561      stmt_vec_info dest_stmt_info
5562	= vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5563      scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5564      group_size = 1;
5565    }
5566
5567  /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5568     case REDUC_GROUP_SIZE is greater than the vectorization factor).
5569     Therefore, we need to match SCALAR_RESULTS with the corresponding
5570     statements.  The first (REDUC_GROUP_SIZE / number of new vector stmts)
5571     scalar results correspond to the first vector stmt, etc.
5572     (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5573  if (group_size > new_phis.length ())
5574    gcc_assert (!(group_size % new_phis.length ()));
5575
5576  for (k = 0; k < group_size; k++)
5577    {
5578      if (slp_reduc)
5579        {
5580	  stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5581
5582	  orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5583	  /* SLP statements can't participate in patterns.  */
5584	  gcc_assert (!orig_stmt_info);
5585	  scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5586        }
5587
5588      if (nested_in_vect_loop)
5589        {
5590          if (double_reduc)
5591            loop = outer_loop;
5592          else
5593	    gcc_unreachable ();
5594        }
5595
5596      phis.create (3);
5597      /* Find the loop-closed-use at the loop exit of the original scalar
5598         result.  (The reduction result is expected to have two immediate uses,
5599         one at the latch block, and one at the loop exit).  For double
5600         reductions we are looking for exit phis of the outer loop.  */
5601      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5602        {
5603          if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5604	    {
5605	      if (!is_gimple_debug (USE_STMT (use_p)))
5606		phis.safe_push (USE_STMT (use_p));
5607	    }
5608          else
5609            {
5610              if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5611                {
5612                  tree phi_res = PHI_RESULT (USE_STMT (use_p));
5613
5614                  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5615                    {
5616                      if (!flow_bb_inside_loop_p (loop,
5617                                             gimple_bb (USE_STMT (phi_use_p)))
5618			  && !is_gimple_debug (USE_STMT (phi_use_p)))
5619                        phis.safe_push (USE_STMT (phi_use_p));
5620                    }
5621                }
5622            }
5623        }
5624
5625      FOR_EACH_VEC_ELT (phis, i, exit_phi)
5626        {
5627          /* Replace the uses:  */
5628          orig_name = PHI_RESULT (exit_phi);
5629          scalar_result = scalar_results[k];
5630          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5631	    {
5632	      FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5633		SET_USE (use_p, scalar_result);
5634	      update_stmt (use_stmt);
5635	    }
5636        }
5637
5638      phis.release ();
5639    }
5640}
5641
5642/* Return a vector of type VECTYPE that is equal to the vector select
5643   operation "MASK ? VEC : IDENTITY".  Insert the select statements
5644   before GSI.  */
5645
5646static tree
5647merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5648		     tree vec, tree identity)
5649{
5650  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5651  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5652					  mask, vec, identity);
5653  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5654  return cond;
5655}
5656
5657/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5658   order, starting with LHS.  Insert the extraction statements before GSI and
5659   associate the new scalar SSA names with variable SCALAR_DEST.
5660   Return the SSA name for the result.  */
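
/* For example (illustrative only), for a 4-element VECTOR_RHS and
   CODE == PLUS_EXPR, vect_expand_fold_left emits the strictly in-order
   sequence

     s0 = lhs + vector_rhs[0];
     s1 = s0 + vector_rhs[1];
     s2 = s1 + vector_rhs[2];
     s3 = s2 + vector_rhs[3];

   and returns s3.  */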
5661
5662static tree
5663vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5664		       tree_code code, tree lhs, tree vector_rhs)
5665{
5666  tree vectype = TREE_TYPE (vector_rhs);
5667  tree scalar_type = TREE_TYPE (vectype);
5668  tree bitsize = TYPE_SIZE (scalar_type);
5669  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5670  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5671
5672  for (unsigned HOST_WIDE_INT bit_offset = 0;
5673       bit_offset < vec_size_in_bits;
5674       bit_offset += element_bitsize)
5675    {
5676      tree bitpos = bitsize_int (bit_offset);
5677      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5678			 bitsize, bitpos);
5679
5680      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5681      rhs = make_ssa_name (scalar_dest, stmt);
5682      gimple_assign_set_lhs (stmt, rhs);
5683      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5684
5685      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5686      tree new_name = make_ssa_name (scalar_dest, stmt);
5687      gimple_assign_set_lhs (stmt, new_name);
5688      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5689      lhs = new_name;
5690    }
5691  return lhs;
5692}
5693
5694/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
5695   type of the vector input.  */
5696
5697static internal_fn
5698get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5699{
5700  internal_fn mask_reduc_fn;
5701
5702  switch (reduc_fn)
5703    {
5704    case IFN_FOLD_LEFT_PLUS:
5705      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5706      break;
5707
5708    default:
5709      return IFN_LAST;
5710    }
5711
5712  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5713				      OPTIMIZE_FOR_SPEED))
5714    return mask_reduc_fn;
5715  return IFN_LAST;
5716}
5717
5718/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
5719   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5720   statement.  CODE is the operation performed by STMT_INFO and OPS are
5721   its scalar operands.  REDUC_INDEX is the index of the operand in
5722   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5723   implements in-order reduction, or IFN_LAST if we should open-code it.
5724   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5725   that should be used to control the operation in a fully-masked loop.  */
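
/* For example (illustrative only), an in-order floating-point accumulation

	for (i = 0; i < N; i++)
	  res += a[i];

   compiled without reassociation must keep the original evaluation order
   (((res + a[0]) + a[1]) + ...), so each vector of loaded elements is
   folded into the scalar accumulator in lane order (via IFN_FOLD_LEFT_PLUS
   or the open-coded expansion above) rather than reduced as a tree.  */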
5726
5727static bool
5728vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5729			       gimple_stmt_iterator *gsi,
5730			       stmt_vec_info *vec_stmt, slp_tree slp_node,
5731			       gimple *reduc_def_stmt,
5732			       tree_code code, internal_fn reduc_fn,
5733			       tree ops[3], tree vectype_in,
5734			       int reduc_index, vec_loop_masks *masks)
5735{
5736  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5738  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5739  stmt_vec_info new_stmt_info = NULL;
5740  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5741
5742  int ncopies;
5743  if (slp_node)
5744    ncopies = 1;
5745  else
5746    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5747
5748  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5749  gcc_assert (ncopies == 1);
5750  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5751
5752  if (slp_node)
5753    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5754			  TYPE_VECTOR_SUBPARTS (vectype_in)));
5755
5756  tree op0 = ops[1 - reduc_index];
5757
5758  int group_size = 1;
5759  stmt_vec_info scalar_dest_def_info;
5760  auto_vec<tree> vec_oprnds0;
5761  if (slp_node)
5762    {
5763      auto_vec<vec<tree> > vec_defs (2);
5764      vect_get_slp_defs (slp_node, &vec_defs);
5765      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5766      vec_defs[0].release ();
5767      vec_defs[1].release ();
5768      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5769      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5770    }
5771  else
5772    {
5773      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5774      vec_oprnds0.create (1);
5775      vec_oprnds0.quick_push (loop_vec_def0);
5776      scalar_dest_def_info = stmt_info;
5777    }
5778
5779  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5780  tree scalar_type = TREE_TYPE (scalar_dest);
5781  tree reduc_var = gimple_phi_result (reduc_def_stmt);
5782
5783  int vec_num = vec_oprnds0.length ();
5784  gcc_assert (vec_num == 1 || slp_node);
5785  tree vec_elem_type = TREE_TYPE (vectype_out);
5786  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5787
5788  tree vector_identity = NULL_TREE;
5789  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5790    vector_identity = build_zero_cst (vectype_out);
5791
5792  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5793  int i;
5794  tree def0;
5795  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5796    {
5797      gimple *new_stmt;
5798      tree mask = NULL_TREE;
5799      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5800	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5801
5802      /* Handle MINUS by adding the negative.  */
5803      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5804	{
5805	  tree negated = make_ssa_name (vectype_out);
5806	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5807	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5808	  def0 = negated;
5809	}
5810
5811      if (mask && mask_reduc_fn == IFN_LAST)
5812	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5813				    vector_identity);
5814
5815      /* On the first iteration the input is simply the scalar phi
5816	 result, and for subsequent iterations it is the output of
5817	 the preceding operation.  */
5818      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5819	{
5820	  if (mask && mask_reduc_fn != IFN_LAST)
5821	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5822						   def0, mask);
5823	  else
5824	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5825						   def0);
5826	  /* For chained SLP reductions the output of the previous reduction
5827	     operation serves as the input of the next. For the final statement
5828	     the output cannot be a temporary - we reuse the original
5829	     scalar destination of the last statement.  */
5830	  if (i != vec_num - 1)
5831	    {
5832	      gimple_set_lhs (new_stmt, scalar_dest_var);
5833	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5834	      gimple_set_lhs (new_stmt, reduc_var);
5835	    }
5836	}
5837      else
5838	{
5839	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5840					     reduc_var, def0);
5841	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5842	  /* Remove the statement, so that we can use the same code paths
5843	     as for statements that we've just created.  */
5844	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5845	  gsi_remove (&tmp_gsi, true);
5846	}
5847
5848      if (i == vec_num - 1)
5849	{
5850	  gimple_set_lhs (new_stmt, scalar_dest);
5851	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5852						    new_stmt);
5853	}
5854      else
5855	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5856						     new_stmt, gsi);
5857
5858      if (slp_node)
5859	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5860    }
5861
5862  if (!slp_node)
5863    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5864
5865  return true;
5866}
5867
5868/* Function is_nonwrapping_integer_induction.
5869
5870   Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5871   that only increments and does not cause overflow.  */
5872
5873static bool
5874is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5875{
5876  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5877  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5878  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5879  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5880  widest_int ni, max_loop_value, lhs_max;
5881  wi::overflow_type overflow = wi::OVF_NONE;
5882
5883  /* Make sure the induction is integer based.  */
5884  if (TREE_CODE (base) != INTEGER_CST
5885      || TREE_CODE (step) != INTEGER_CST)
5886    return false;
5887
5888  /* Check that the maximum value reached by the induction will not wrap.  */
5889
5890  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5891    return true;
5892
5893  if (! max_stmt_executions (loop, &ni))
5894    return false;
5895
5896  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5897			    &overflow);
5898  if (overflow)
5899    return false;
5900
5901  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5902			    TYPE_SIGN (lhs_type), &overflow);
5903  if (overflow)
5904    return false;
5905
5906  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5907	  <= TYPE_PRECISION (lhs_type));
5908}
5909
5910/* Check if masking can be supported by inserting a conditional expression.
5911   CODE is the code for the operation.  COND_FN is the conditional internal
5912   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
5913static bool
5914use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5915			 tree vectype_in)
5916{
5917  if (cond_fn != IFN_LAST
5918      && direct_internal_fn_supported_p (cond_fn, vectype_in,
5919					 OPTIMIZE_FOR_SPEED))
5920    return false;
5921
5922  switch (code)
5923    {
5924    case DOT_PROD_EXPR:
5925    case SAD_EXPR:
5926      return true;
5927
5928    default:
5929      return false;
5930    }
5931}
5932
5933/* Insert a conditional expression to enable masked vectorization.  CODE is the
5934   code for the operation.  VOP is the array of operands.  MASK is the loop
5935   mask.  GSI is a statement iterator used to place the new conditional
5936   expression.  */
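
/* For example (illustrative only): for DOT_PROD_EXPR <v1, v2, acc>,
   replacing v2 with (mask ? v2 : 0) makes each masked-off lane contribute
   a zero product, and for SAD_EXPR <v1, v2, acc>, replacing v2 with
   (mask ? v2 : v1) makes the absolute difference zero there, so both
   behave as if the inactive lanes were masked.  */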
5937static void
5938build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5939		      gimple_stmt_iterator *gsi)
5940{
5941  switch (code)
5942    {
5943    case DOT_PROD_EXPR:
5944      {
5945	tree vectype = TREE_TYPE (vop[1]);
5946	tree zero = build_zero_cst (vectype);
5947	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5948	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5949					       mask, vop[1], zero);
5950	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5951	vop[1] = masked_op1;
5952	break;
5953      }
5954
5955    case SAD_EXPR:
5956      {
5957	tree vectype = TREE_TYPE (vop[1]);
5958	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5959	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5960					       mask, vop[1], vop[0]);
5961	gsi_insert_before (gsi, select, GSI_SAME_STMT);
5962	vop[1] = masked_op1;
5963	break;
5964      }
5965
5966    default:
5967      gcc_unreachable ();
5968    }
5969}
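
/* For example (a sketch, not lifted from generated GIMPLE), masking a
   dot-product forces the multiplied operand to zero in inactive lanes,
   while for SAD the second input is forced equal to the first so that the
   absolute difference of inactive lanes is zero:

     DOT_PROD_EXPR:  masked_op1 = mask ? op1 : { 0, ... };
		     acc = DOT_PROD_EXPR <op0, masked_op1, acc>;
     SAD_EXPR:	     masked_op1 = mask ? op1 : op0;
		     acc = SAD_EXPR <op0, masked_op1, acc>;

   Either way the masked-off lanes contribute zero to the accumulator,
   just as a conditional internal function would.  */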
5970
5971/* Function vectorizable_reduction.
5972
5973   Check if STMT_INFO performs a reduction operation that can be vectorized.
5974   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5975   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5976   Return true if STMT_INFO is vectorizable in this way.
5977
5978   This function also handles reduction idioms (patterns) that have been
5979   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
5980   may be of this form:
5981     X = pattern_expr (arg0, arg1, ..., X)
5982   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5983   sequence that had been detected and replaced by the pattern-stmt
5984   (STMT_INFO).
5985
5986   This function also handles reduction of condition expressions, for example:
5987     for (int i = 0; i < N; i++)
5988       if (a[i] < value)
5989	 last = a[i];
5990   This is handled by vectorising the loop and creating an additional vector
5991   containing the loop indexes for which "a[i] < value" was true.  In the
5992   function epilogue this is reduced to a single max value and then used to
5993   index into the vector of results.
5994
5995   In some cases of reduction patterns, the type of the reduction variable X is
5996   different than the type of the other arguments of STMT_INFO.
5997   In such cases, the vectype that is used when transforming STMT_INFO into
5998   a vector stmt is different than the vectype that is used to determine the
5999   vectorization factor, because it consists of a different number of elements
6000   than the actual number of elements that are being operated upon in parallel.
6001
6002   For example, consider an accumulation of shorts into an int accumulator.
6003   On some targets it's possible to vectorize this pattern operating on 8
6004   shorts at a time (hence, the vectype for purposes of determining the
6005   vectorization factor should be V8HI); on the other hand, the vectype that
6006   is used to create the vector form is actually V4SI (the type of the result).
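
   For instance (hypothetical source, matching the prose above):

	short b[N];  int sum = 0;
	for (i = 0; i < N; i++)
	  sum += b[i];

   may be recognized as a widening-summation pattern; each vector
   iteration then reads 8 shorts (V8HI) but accumulates into a vector
   of 4 ints (V4SI).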
6007
6008   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6009   indicates what is the actual level of parallelism (V8HI in the example), so
6010   that the right vectorization factor would be derived.  This vectype
6011   corresponds to the type of arguments to the reduction stmt, and should *NOT*
6012   be used to create the vectorized stmt.  The right vectype for the vectorized
6013   stmt is obtained from the type of the result X:
6014      get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6015
6016   This means that, contrary to "regular" reductions (or "regular" stmts in
6017   general), the following equation:
6018      STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6019   does *NOT* necessarily hold for reduction patterns.  */
6020
6021bool
6022vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
6023			slp_instance slp_node_instance,
6024			stmt_vector_for_cost *cost_vec)
6025{
6026  tree scalar_dest;
6027  tree vectype_in = NULL_TREE;
6028  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6029  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6030  enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6031  stmt_vec_info cond_stmt_vinfo = NULL;
6032  tree scalar_type;
6033  int i;
6034  int ncopies;
6035  bool single_defuse_cycle = false;
6036  bool nested_cycle = false;
6037  bool double_reduc = false;
6038  int vec_num;
6039  tree tem;
6040  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6041  tree cond_reduc_val = NULL_TREE;
6042
6043  /* Make sure it was already recognized as a reduction computation.  */
6044  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6045      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6046      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6047    return false;
6048
6049  /* The stmt we store reduction analysis meta on.  */
6050  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6051  reduc_info->is_reduc_info = true;
6052
6053  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6054    {
6055      if (is_a <gphi *> (stmt_info->stmt))
6056	/* Analysis for double-reduction is done on the outer
6057	   loop PHI; nested cycles have no further restrictions.  */
6058	STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6059      else
6060	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6061      return true;
6062    }
6063
6064  stmt_vec_info orig_stmt_of_analysis = stmt_info;
6065  stmt_vec_info phi_info = stmt_info;
6066  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6067      || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6068    {
6069      if (!is_a <gphi *> (stmt_info->stmt))
6070	{
6071	  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6072	  return true;
6073	}
6074      if (slp_node)
6075	{
6076	  slp_node_instance->reduc_phis = slp_node;
6077	  /* ???  We're leaving slp_node to point to the PHIs; we only
6078	     need it to get at the number of vector stmts, which wasn't
6079	     yet initialized for the instance root.  */
6080	}
6081      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6082	stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6083      else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6084	{
6085	  use_operand_p use_p;
6086	  gimple *use_stmt;
6087	  bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6088				     &use_p, &use_stmt);
6089	  gcc_assert (res);
6090	  phi_info = loop_vinfo->lookup_stmt (use_stmt);
6091	  stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6092	}
6093    }
6094
6095  /* PHIs should not participate in patterns.  */
6096  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6097  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6098
6099  /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6100     and compute the reduction chain length.  */
6101  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6102					  loop_latch_edge (loop));
6103  unsigned reduc_chain_length = 0;
6104  bool only_slp_reduc_chain = true;
6105  stmt_info = NULL;
6106  while (reduc_def != PHI_RESULT (reduc_def_phi))
6107    {
6108      stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6109      stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6110      if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6111	{
6112	  if (dump_enabled_p ())
6113	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6114			     "reduction chain broken by patterns.\n");
6115	  return false;
6116	}
6117      if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6118	only_slp_reduc_chain = false;
6119      /* ???  For epilogue generation live members of the chain need
6120         to point back to the PHI via their original stmt for
6121	 info_for_reduction to work.  */
6122      if (STMT_VINFO_LIVE_P (vdef))
6123	STMT_VINFO_REDUC_DEF (def) = phi_info;
6124      gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6125      if (!assign)
6126	{
6127	  if (dump_enabled_p ())
6128	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6129			     "reduction chain includes calls.\n");
6130	  return false;
6131	}
6132      if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6133	{
6134	  if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6135				      TREE_TYPE (gimple_assign_rhs1 (assign))))
6136	    {
6137	      if (dump_enabled_p ())
6138		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6139				 "conversion in the reduction chain.\n");
6140	      return false;
6141	    }
6142	}
6143      else if (!stmt_info)
6144	/* First non-conversion stmt.  */
6145	stmt_info = vdef;
6146      reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6147      reduc_chain_length++;
6148    }
6149  /* PHIs should not participate in patterns.  */
6150  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6151
6152  if (nested_in_vect_loop_p (loop, stmt_info))
6153    {
6154      loop = loop->inner;
6155      nested_cycle = true;
6156    }
6157
6158  /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6159     element.  */
6160  if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6161    {
6162      gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6163      stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6164    }
6165  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6166    gcc_assert (slp_node
6167		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6168
6169  /* 1. Is vectorizable reduction?  */
6170  /* Not supportable if the reduction variable is used in the loop, unless
6171     it's a reduction chain.  */
6172  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6173      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6174    return false;
6175
6176  /* Reductions that are not used even in an enclosing outer-loop,
6177     are expected to be "live" (used out of the loop).  */
6178  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6179      && !STMT_VINFO_LIVE_P (stmt_info))
6180    return false;
6181
6182  /* 2. Has this been recognized as a reduction pattern?
6183
6184     Check if STMT represents a pattern that has been recognized
6185     in earlier analysis stages.  For stmts that represent a pattern,
6186     the STMT_VINFO_RELATED_STMT field records the last stmt in
6187     the original sequence that constitutes the pattern.  */
6188
6189  stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6190  if (orig_stmt_info)
6191    {
6192      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6193      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6194    }
6195
6196  /* 3. Check the operands of the operation.  The first operands are defined
6197        inside the loop body. The last operand is the reduction variable,
6198        which is defined by the loop-header-phi.  */
6199
6200  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6201  STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6202  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6203  enum tree_code code = gimple_assign_rhs_code (stmt);
6204  bool lane_reduc_code_p
6205    = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6206  int op_type = TREE_CODE_LENGTH (code);
6207
6208  scalar_dest = gimple_assign_lhs (stmt);
6209  scalar_type = TREE_TYPE (scalar_dest);
6210  if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6211      && !SCALAR_FLOAT_TYPE_P (scalar_type))
6212    return false;
6213
6214  /* Do not try to vectorize bit-precision reductions.  */
6215  if (!type_has_mode_precision_p (scalar_type))
6216    return false;
6217
6218  /* For lane-reducing ops we're reducing the number of reduction PHIs
6219     which means the only use of that may be in the lane-reducing operation.  */
6220  if (lane_reduc_code_p
6221      && reduc_chain_length != 1
6222      && !only_slp_reduc_chain)
6223    {
6224      if (dump_enabled_p ())
6225	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226			 "lane-reducing reduction with extra stmts.\n");
6227      return false;
6228    }
6229
6230  /* All uses but the last are expected to be defined in the loop.
6231     The last use is the reduction variable.  In case of nested cycle this
6232     assumption is not true: we use reduc_index to record the index of the
6233     reduction variable.  */
6234  reduc_def = PHI_RESULT (reduc_def_phi);
6235  for (i = 0; i < op_type; i++)
6236    {
6237      tree op = gimple_op (stmt, i + 1);
6238      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6239      if (i == 0 && code == COND_EXPR)
6240        continue;
6241
6242      stmt_vec_info def_stmt_info;
6243      enum vect_def_type dt;
6244      if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
6245			       &def_stmt_info))
6246	{
6247	  if (dump_enabled_p ())
6248	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6249			     "use not simple.\n");
6250	  return false;
6251	}
6252      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6253	continue;
6254
6255      /* There should be only one cycle def in the stmt, the one
6256         leading to reduc_def.  */
6257      if (VECTORIZABLE_CYCLE_DEF (dt))
6258	return false;
6259
6260      /* To properly compute ncopies we are interested in the widest
6261	 non-reduction input type in case we're looking at a widening
6262	 accumulation that we later handle in vect_transform_reduction.  */
6263      if (lane_reduc_code_p
6264	  && tem
6265	  && (!vectype_in
6266	      || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6267		  < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6268	vectype_in = tem;
6269
6270      if (code == COND_EXPR)
6271	{
6272	  /* Record how the non-reduction-def value of COND_EXPR is defined.  */
6273	  if (dt == vect_constant_def)
6274	    {
6275	      cond_reduc_dt = dt;
6276	      cond_reduc_val = op;
6277	    }
6278	  if (dt == vect_induction_def
6279	      && def_stmt_info
6280	      && is_nonwrapping_integer_induction (def_stmt_info, loop))
6281	    {
6282	      cond_reduc_dt = dt;
6283	      cond_stmt_vinfo = def_stmt_info;
6284	    }
6285	}
6286    }
6287  if (!vectype_in)
6288    vectype_in = STMT_VINFO_VECTYPE (phi_info);
6289  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6290
6291  enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6292  STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6293  /* If we have a condition reduction, see if we can simplify it further.  */
6294  if (v_reduc_type == COND_REDUCTION)
6295    {
6296      if (slp_node)
6297	return false;
6298
6299      /* When the condition uses the reduction value in the condition, fail.  */
6300      if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6301	{
6302	  if (dump_enabled_p ())
6303	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304			     "condition depends on previous iteration\n");
6305	  return false;
6306	}
6307
6308      if (reduc_chain_length == 1
6309	  && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6310					     vectype_in, OPTIMIZE_FOR_SPEED))
6311	{
6312	  if (dump_enabled_p ())
6313	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6314			     "optimizing condition reduction with"
6315			     " FOLD_EXTRACT_LAST.\n");
6316	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6317	}
6318      else if (cond_reduc_dt == vect_induction_def)
6319	{
6320	  tree base
6321	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6322	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6323
6324	  gcc_assert (TREE_CODE (base) == INTEGER_CST
6325		      && TREE_CODE (step) == INTEGER_CST);
6326	  cond_reduc_val = NULL_TREE;
6327	  enum tree_code cond_reduc_op_code = ERROR_MARK;
6328	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6329	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6330	    ;
6331	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6332	     above base; punt if base is the minimum value of the type for
6333	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6334	  else if (tree_int_cst_sgn (step) == -1)
6335	    {
6336	      cond_reduc_op_code = MIN_EXPR;
6337	      if (tree_int_cst_sgn (base) == -1)
6338		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6339	      else if (tree_int_cst_lt (base,
6340					TYPE_MAX_VALUE (TREE_TYPE (base))))
6341		cond_reduc_val
6342		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
6343	    }
6344	  else
6345	    {
6346	      cond_reduc_op_code = MAX_EXPR;
6347	      if (tree_int_cst_sgn (base) == 1)
6348		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6349	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6350					base))
6351		cond_reduc_val
6352		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
6353	    }
6354	  if (cond_reduc_val)
6355	    {
6356	      if (dump_enabled_p ())
6357		dump_printf_loc (MSG_NOTE, vect_location,
6358				 "condition expression based on "
6359				 "integer induction.\n");
6360	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6361	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6362		= cond_reduc_val;
6363	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6364	    }
6365	}
6366      else if (cond_reduc_dt == vect_constant_def)
6367	{
6368	  enum vect_def_type cond_initial_dt;
6369	  tree cond_initial_val
6370	    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6371
6372	  gcc_assert (cond_reduc_val != NULL_TREE);
6373	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6374	  if (cond_initial_dt == vect_constant_def
6375	      && types_compatible_p (TREE_TYPE (cond_initial_val),
6376				     TREE_TYPE (cond_reduc_val)))
6377	    {
6378	      tree e = fold_binary (LE_EXPR, boolean_type_node,
6379				    cond_initial_val, cond_reduc_val);
6380	      if (e && (integer_onep (e) || integer_zerop (e)))
6381		{
6382		  if (dump_enabled_p ())
6383		    dump_printf_loc (MSG_NOTE, vect_location,
6384				     "condition expression based on "
6385				     "compile time constant.\n");
6386		  /* Record reduction code at analysis stage.  */
6387		  STMT_VINFO_REDUC_CODE (reduc_info)
6388		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6389		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6390		}
6391	    }
6392	}
6393    }
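
  /* As a hypothetical illustration of the integer-induction case above:

	int last = -1;
	for (int i = 0; i < N; i++)
	  if (a[i] < value)
	    last = i;

     the value stored under the condition is the induction 'i' itself, so
     the condition reduction can be carried out as a plain MAX over the
     induction values, with a constant below the base (here -1) standing
     for "no iteration matched" and fixed up in the epilogue.  */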
6394
6395  if (STMT_VINFO_LIVE_P (phi_info))
6396    return false;
6397
6398  if (slp_node)
6399    ncopies = 1;
6400  else
6401    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6402
6403  gcc_assert (ncopies >= 1);
6404
6405  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6406
6407  if (nested_cycle)
6408    {
6409      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6410		  == vect_double_reduction_def);
6411      double_reduc = true;
6412    }
6413
6414  /* 4.2. Check support for the epilog operation.
6415
6416          If STMT represents a reduction pattern, then the type of the
6417          reduction variable may be different than the type of the rest
6418          of the arguments.  For example, consider the case of accumulation
6419          of shorts into an int accumulator; the original code:
6420                        S1: int_a = (int) short_a;
6421          orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6422
6423          was replaced with:
6424                        STMT: int_acc = widen_sum <short_a, int_acc>
6425
6426          This means that:
6427          1. The tree-code that is used to create the vector operation in the
6428             epilog code (that reduces the partial results) is not the
6429             tree-code of STMT, but is rather the tree-code of the original
6430             stmt from the pattern that STMT is replacing.  I.e, in the example
6431             above we want to use 'widen_sum' in the loop, but 'plus' in the
6432             epilog.
6433          2. The type (mode) we use to check available target support
6434             for the vector operation to be created in the *epilog*, is
6435             determined by the type of the reduction variable (in the example
6436             above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6437             However the type (mode) we use to check available target support
6438             for the vector operation to be created *inside the loop*, is
6439             determined by the type of the other arguments to STMT (in the
6440             example we'd check this: optab_handler (widen_sum_optab,
6441	     vect_short_mode)).
6442
6443          This is contrary to "regular" reductions, in which the types of all
6444          the arguments are the same as the type of the reduction variable.
6445          For "regular" reductions we can therefore use the same vector type
6446          (and also the same tree-code) when generating the epilog code and
6447          when generating the code inside the loop.  */
6448
6449  enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6450  STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6451
6452  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6453  if (reduction_type == TREE_CODE_REDUCTION)
6454    {
6455      /* Check whether it's ok to change the order of the computation.
6456	 Generally, when vectorizing a reduction we change the order of the
6457	 computation.  This may change the behavior of the program in some
6458	 cases, so we need to check that this is ok.  One exception is when
6459	 vectorizing an outer-loop: the inner-loop is executed sequentially,
6460	 and therefore vectorizing reductions in the inner-loop during
6461	 outer-loop vectorization is safe.  */
6462      if (needs_fold_left_reduction_p (scalar_type, orig_code))
6463	{
6464	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
6465	     is not directly used in stmt.  */
6466	  if (!only_slp_reduc_chain
6467	      && reduc_chain_length != 1)
6468	    {
6469	      if (dump_enabled_p ())
6470		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471				 "in-order reduction chain without SLP.\n");
6472	      return false;
6473	    }
6474	  STMT_VINFO_REDUC_TYPE (reduc_info)
6475	    = reduction_type = FOLD_LEFT_REDUCTION;
6476	}
6477      else if (!commutative_tree_code (orig_code)
6478	       || !associative_tree_code (orig_code))
6479	{
6480	  if (dump_enabled_p ())
6481	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6482			    "reduction: not commutative/associative.\n");
6483	  return false;
6484	}
6485    }
6486
6487  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6488      && ncopies > 1)
6489    {
6490      if (dump_enabled_p ())
6491	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6492			 "multiple types in double reduction or condition "
6493			 "reduction or fold-left reduction.\n");
6494      return false;
6495    }
6496
6497  internal_fn reduc_fn = IFN_LAST;
6498  if (reduction_type == TREE_CODE_REDUCTION
6499      || reduction_type == FOLD_LEFT_REDUCTION
6500      || reduction_type == INTEGER_INDUC_COND_REDUCTION
6501      || reduction_type == CONST_COND_REDUCTION)
6502    {
6503      if (reduction_type == FOLD_LEFT_REDUCTION
6504	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
6505	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6506	{
6507	  if (reduc_fn != IFN_LAST
6508	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6509						  OPTIMIZE_FOR_SPEED))
6510	    {
6511	      if (dump_enabled_p ())
6512		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6513				 "reduc op not supported by target.\n");
6514
6515	      reduc_fn = IFN_LAST;
6516	    }
6517	}
6518      else
6519	{
6520	  if (!nested_cycle || double_reduc)
6521	    {
6522	      if (dump_enabled_p ())
6523		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6524				 "no reduc code for scalar code.\n");
6525
6526	      return false;
6527	    }
6528	}
6529    }
6530  else if (reduction_type == COND_REDUCTION)
6531    {
6532      int scalar_precision
6533	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6534      cr_index_scalar_type = make_unsigned_type (scalar_precision);
6535      cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6536						nunits_out);
6537
6538      if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6539					  OPTIMIZE_FOR_SPEED))
6540	reduc_fn = IFN_REDUC_MAX;
6541    }
6542  STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6543
6544  if (reduction_type != EXTRACT_LAST_REDUCTION
6545      && (!nested_cycle || double_reduc)
6546      && reduc_fn == IFN_LAST
6547      && !nunits_out.is_constant ())
6548    {
6549      if (dump_enabled_p ())
6550	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551			 "missing target support for reduction on"
6552			 " variable-length vectors.\n");
6553      return false;
6554    }
6555
6556  /* For SLP reductions, see if there is a neutral value we can use.  */
6557  tree neutral_op = NULL_TREE;
6558  if (slp_node)
6559    neutral_op = neutral_op_for_slp_reduction
6560      (slp_node_instance->reduc_phis, vectype_out, orig_code,
6561       REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6562
6563  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6564    {
6565      /* We can't support in-order reductions of code such as this:
6566
6567	   for (int i = 0; i < n1; ++i)
6568	     for (int j = 0; j < n2; ++j)
6569	       l += a[j];
6570
6571	 since GCC effectively transforms the loop when vectorizing:
6572
6573	   for (int i = 0; i < n1 / VF; ++i)
6574	     for (int j = 0; j < n2; ++j)
6575	       for (int k = 0; k < VF; ++k)
6576		 l += a[j];
6577
6578	 which is a reassociation of the original operation.  */
6579      if (dump_enabled_p ())
6580	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581			 "in-order double reduction not supported.\n");
6582
6583      return false;
6584    }
6585
6586  if (reduction_type == FOLD_LEFT_REDUCTION
6587      && slp_node
6588      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6589    {
6590      /* We cannot use in-order reductions in this case because there is
6591	 an implicit reassociation of the operations involved.  */
6592      if (dump_enabled_p ())
6593	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6594			 "in-order unchained SLP reductions not supported.\n");
6595      return false;
6596    }
6597
6598  /* For double reductions, and for SLP reductions with a neutral value,
6599     we construct a variable-length initial vector by loading a vector
6600     full of the neutral value and then shift-and-inserting the start
6601     values into the low-numbered elements.  */
6602  if ((double_reduc || neutral_op)
6603      && !nunits_out.is_constant ()
6604      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6605					  vectype_out, OPTIMIZE_FOR_SPEED))
6606    {
6607      if (dump_enabled_p ())
6608	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609			 "reduction on variable-length vectors requires"
6610			 " target support for a vector-shift-and-insert"
6611			 " operation.\n");
6612      return false;
6613    }
6614
6615  /* Check extra constraints for variable-length unchained SLP reductions.  */
6616  if (STMT_SLP_TYPE (stmt_info)
6617      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6618      && !nunits_out.is_constant ())
6619    {
6620      /* We checked above that we could build the initial vector when
6621	 there's a neutral element value.  Check here for the case in
6622	 which each SLP statement has its own initial value and in which
6623	 that value needs to be repeated for every instance of the
6624	 statement within the initial vector.  */
6625      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6626      if (!neutral_op
6627	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6628					      TREE_TYPE (vectype_out)))
6629	{
6630	  if (dump_enabled_p ())
6631	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632			     "unsupported form of SLP reduction for"
6633			     " variable-length vectors: cannot build"
6634			     " initial vector.\n");
6635	  return false;
6636	}
6637      /* The epilogue code relies on the number of elements being a multiple
6638	 of the group size.  The duplicate-and-interleave approach to setting
6639	 up the initial vector does too.  */
6640      if (!multiple_p (nunits_out, group_size))
6641	{
6642	  if (dump_enabled_p ())
6643	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6644			     "unsupported form of SLP reduction for"
6645			     " variable-length vectors: the vector size"
6646			     " is not a multiple of the number of results.\n");
6647	  return false;
6648	}
6649    }
6650
6651  if (reduction_type == COND_REDUCTION)
6652    {
6653      widest_int ni;
6654
6655      if (! max_loop_iterations (loop, &ni))
6656	{
6657	  if (dump_enabled_p ())
6658	    dump_printf_loc (MSG_NOTE, vect_location,
6659			     "loop count not known, cannot create cond "
6660			     "reduction.\n");
6661	  return false;
6662	}
6663      /* Convert backedges to iterations.  */
6664      ni += 1;
6665
6666      /* The additional index will be the same type as the condition.  Check
6667	 that the loop can fit into this less one (because we'll use up the
6668	 zero slot for when there are no matches).  */
6669      tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6670      if (wi::geu_p (ni, wi::to_widest (max_index)))
6671	{
6672	  if (dump_enabled_p ())
6673	    dump_printf_loc (MSG_NOTE, vect_location,
6674			     "loop size is greater than data size.\n");
6675	  return false;
6676	}
6677    }
6678
6679  /* In case the vectorization factor (VF) is bigger than the number
6680     of elements that we can fit in a vectype (nunits), we have to generate
6681     more than one vector stmt - i.e - we need to "unroll" the
6682     vector stmt by a factor VF/nunits.  For more details see documentation
6683     in vectorizable_operation.  */
6684
6685  /* If the reduction is used in an outer loop we need to generate
6686     VF intermediate results, like so (e.g. for ncopies=2):
6687	r0 = phi (init, r0)
6688	r1 = phi (init, r1)
6689	r0 = x0 + r0;
6690        r1 = x1 + r1;
6691    (i.e. we generate VF results in 2 registers).
6692    In this case we have a separate def-use cycle for each copy, and therefore
6693    for each copy we get the vector def for the reduction variable from the
6694    respective phi node created for this copy.
6695
6696    Otherwise (the reduction is unused in the loop nest), we can combine
6697    together intermediate results, like so (e.g. for ncopies=2):
6698	r = phi (init, r)
6699	r = x0 + r;
6700	r = x1 + r;
6701   (i.e. we generate VF/2 results in a single register).
6702   In this case for each copy we get the vector def for the reduction variable
6703   from the vectorized reduction operation generated in the previous iteration.
6704
6705   This only works when we see both the reduction PHI and its only consumer
6706   in vectorizable_reduction and there are no intermediate stmts
6707   participating.  */
6708  if (ncopies > 1
6709      && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6710      && reduc_chain_length == 1)
6711    single_defuse_cycle = true;
6712
6713  if (single_defuse_cycle || lane_reduc_code_p)
6714    {
6715      gcc_assert (code != COND_EXPR);
6716
6717      /* 4. Supportable by target?  */
6718      bool ok = true;
6719
6720      /* 4.1. check support for the operation in the loop  */
6721      optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6722      if (!optab)
6723	{
6724	  if (dump_enabled_p ())
6725	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6726			     "no optab.\n");
6727	  ok = false;
6728        }
6729
6730      machine_mode vec_mode = TYPE_MODE (vectype_in);
6731      if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6732        {
6733          if (dump_enabled_p ())
6734            dump_printf (MSG_NOTE, "op not supported by target.\n");
6735	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6736	      || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6737	    ok = false;
6738	  else
6739	    if (dump_enabled_p ())
6740	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6741        }
6742
6743      /* Worthwhile without SIMD support?  */
6744      if (ok
6745	  && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6746	  && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6747        {
6748          if (dump_enabled_p ())
6749	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6750			     "not worthwhile without SIMD support.\n");
6751	  ok = false;
6752        }
6753
6754      /* lane-reducing operations have to go through vect_transform_reduction.
6755         For the other cases try without the single cycle optimization.  */
6756      if (!ok)
6757	{
6758	  if (lane_reduc_code_p)
6759	    return false;
6760	  else
6761	    single_defuse_cycle = false;
6762	}
6763    }
6764  STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6765
6766  /* If the reduction stmt is one of the patterns that have lane
6767     reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6768  if ((ncopies > 1 && ! single_defuse_cycle)
6769      && lane_reduc_code_p)
6770    {
6771      if (dump_enabled_p ())
6772	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6773			 "multi def-use cycle not possible for lane-reducing "
6774			 "reduction operation\n");
6775      return false;
6776    }
6777
6778  if (slp_node)
6779    vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6780  else
6781    vec_num = 1;
6782
6783  vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6784			     cost_vec);
6785  /* Cost the reduction op inside the loop if transformed via
6786     vect_transform_reduction.  Otherwise this is costed by the
6787     separate vectorizable_* routines.  */
6788  if (single_defuse_cycle
6789      || code == DOT_PROD_EXPR
6790      || code == WIDEN_SUM_EXPR
6791      || code == SAD_EXPR)
6792    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
6793
6794  if (dump_enabled_p ()
6795      && reduction_type == FOLD_LEFT_REDUCTION)
6796    dump_printf_loc (MSG_NOTE, vect_location,
6797		     "using an in-order (fold-left) reduction.\n");
6798  STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6799  /* All but single defuse-cycle optimized, lane-reducing and fold-left
6800     reductions go through their own vectorizable_* routines.  */
6801  if (!single_defuse_cycle
6802      && code != DOT_PROD_EXPR
6803      && code != WIDEN_SUM_EXPR
6804      && code != SAD_EXPR
6805      && reduction_type != FOLD_LEFT_REDUCTION)
6806    {
6807      stmt_vec_info tem
6808	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6809      if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6810	{
6811	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6812	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6813	}
6814      STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6815      STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6816    }
6817  else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6818    {
6819      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6820      internal_fn cond_fn = get_conditional_internal_fn (code);
6821
6822      if (reduction_type != FOLD_LEFT_REDUCTION
6823	  && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6824	  && (cond_fn == IFN_LAST
6825	      || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6826						  OPTIMIZE_FOR_SPEED)))
6827	{
6828	  if (dump_enabled_p ())
6829	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6830			     "can't use a fully-masked loop because no"
6831			     " conditional operation is available.\n");
6832	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6833	}
6834      else if (reduction_type == FOLD_LEFT_REDUCTION
6835	       && reduc_fn == IFN_LAST
6836	       && !expand_vec_cond_expr_p (vectype_in,
6837					   truth_type_for (vectype_in),
6838					   SSA_NAME))
6839	{
6840	  if (dump_enabled_p ())
6841	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842			     "can't use a fully-masked loop because no"
6843			     " conditional operation is available.\n");
6844	  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6845	}
6846      else
6847	vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6848			       vectype_in, NULL);
6849    }
6850  return true;
6851}
6852
6853/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6854   value.  */
6855
6856bool
6857vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6858			  stmt_vec_info *vec_stmt, slp_tree slp_node)
6859{
6860  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6861  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6862  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6863  int i;
6864  int ncopies;
6865  int j;
6866  int vec_num;
6867
6868  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6869  gcc_assert (reduc_info->is_reduc_info);
6870
6871  if (nested_in_vect_loop_p (loop, stmt_info))
6872    {
6873      loop = loop->inner;
6874      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6875    }
6876
6877  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6878  enum tree_code code = gimple_assign_rhs_code (stmt);
6879  int op_type = TREE_CODE_LENGTH (code);
6880
6881  /* Flatten RHS.  */
6882  tree ops[3];
6883  switch (get_gimple_rhs_class (code))
6884    {
6885    case GIMPLE_TERNARY_RHS:
6886      ops[2] = gimple_assign_rhs3 (stmt);
6887      /* Fall thru.  */
6888    case GIMPLE_BINARY_RHS:
6889      ops[0] = gimple_assign_rhs1 (stmt);
6890      ops[1] = gimple_assign_rhs2 (stmt);
6891      break;
6892    default:
6893      gcc_unreachable ();
6894    }
6895
6896  /* All uses but the last are expected to be defined in the loop.
6897     The last use is the reduction variable.  In case of nested cycle this
6898     assumption is not true: we use reduc_index to record the index of the
6899     reduction variable.  */
6900  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6901  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6902  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6903  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6904
6905  if (slp_node)
6906    {
6907      ncopies = 1;
6908      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6909    }
6910  else
6911    {
6912      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6913      vec_num = 1;
6914    }
6915
6916  internal_fn cond_fn = get_conditional_internal_fn (code);
6917  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6918  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6919
6920  /* Transform.  */
6921  stmt_vec_info new_stmt_info = NULL;
6922  stmt_vec_info prev_stmt_info;
6923  tree new_temp = NULL_TREE;
6924  auto_vec<tree> vec_oprnds0;
6925  auto_vec<tree> vec_oprnds1;
6926  auto_vec<tree> vec_oprnds2;
6927  tree def0;
6928
6929  if (dump_enabled_p ())
6930    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6931
6932  /* FORNOW: Multiple types are not supported for condition.  */
6933  if (code == COND_EXPR)
6934    gcc_assert (ncopies == 1);
6935
6936  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6937
6938  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6939  if (reduction_type == FOLD_LEFT_REDUCTION)
6940    {
6941      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6942      return vectorize_fold_left_reduction
6943	  (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6944	   reduc_fn, ops, vectype_in, reduc_index, masks);
6945    }
6946
6947  bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6948  gcc_assert (single_defuse_cycle
6949	      || code == DOT_PROD_EXPR
6950	      || code == WIDEN_SUM_EXPR
6951	      || code == SAD_EXPR);
6952
6953  /* Create the destination vector  */
6954  tree scalar_dest = gimple_assign_lhs (stmt);
6955  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6956
6957  prev_stmt_info = NULL;
6958  if (!slp_node)
6959    {
6960      vec_oprnds0.create (1);
6961      vec_oprnds1.create (1);
6962      if (op_type == ternary_op)
6963        vec_oprnds2.create (1);
6964    }
6965
6966  for (j = 0; j < ncopies; j++)
6967    {
6968      /* Handle uses.  */
6969      if (j == 0)
6970        {
6971	  if (slp_node)
6972	    {
6973	      /* Get vec defs for all the operands except the reduction index,
6974		 ensuring the ordering of the ops in the vector is kept.  */
6975	      auto_vec<vec<tree>, 3> vec_defs;
6976	      vect_get_slp_defs (slp_node, &vec_defs);
6977	      vec_oprnds0.safe_splice (vec_defs[0]);
6978	      vec_defs[0].release ();
6979	      vec_oprnds1.safe_splice (vec_defs[1]);
6980	      vec_defs[1].release ();
6981	      if (op_type == ternary_op)
6982		{
6983		  vec_oprnds2.safe_splice (vec_defs[2]);
6984		  vec_defs[2].release ();
6985		}
6986	    }
6987          else
6988	    {
6989              vec_oprnds0.quick_push
6990		(vect_get_vec_def_for_operand (ops[0], stmt_info));
6991              vec_oprnds1.quick_push
6992		(vect_get_vec_def_for_operand (ops[1], stmt_info));
6993              if (op_type == ternary_op)
6994		vec_oprnds2.quick_push
6995		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
6996	    }
6997        }
6998      else
6999        {
7000          if (!slp_node)
7001            {
7002	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7003
7004	      if (single_defuse_cycle && reduc_index == 0)
7005		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7006	      else
7007		vec_oprnds0[0]
7008		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7009						    vec_oprnds0[0]);
7010	      if (single_defuse_cycle && reduc_index == 1)
7011		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7012	      else
7013		vec_oprnds1[0]
7014		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7015						    vec_oprnds1[0]);
7016	      if (op_type == ternary_op)
7017		{
7018		  if (single_defuse_cycle && reduc_index == 2)
7019		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7020		  else
7021		    vec_oprnds2[0]
7022		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7023							vec_oprnds2[0]);
7024		}
7025            }
7026        }
7027
7028      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7029        {
7030	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7031	  if (masked_loop_p && !mask_by_cond_expr)
7032	    {
7033	      /* Make sure that the reduction accumulator is vop[0].  */
7034	      if (reduc_index == 1)
7035		{
7036		  gcc_assert (commutative_tree_code (code));
7037		  std::swap (vop[0], vop[1]);
7038		}
7039	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7040					      vectype_in, i * ncopies + j);
7041	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7042							vop[0], vop[1],
7043							vop[0]);
7044	      new_temp = make_ssa_name (vec_dest, call);
7045	      gimple_call_set_lhs (call, new_temp);
7046	      gimple_call_set_nothrow (call, true);
7047	      new_stmt_info
7048		= vect_finish_stmt_generation (stmt_info, call, gsi);
7049	    }
7050	  else
7051	    {
7052	      if (op_type == ternary_op)
7053		vop[2] = vec_oprnds2[i];
7054
7055	      if (masked_loop_p && mask_by_cond_expr)
7056		{
7057		  tree mask = vect_get_loop_mask (gsi, masks,
7058						  vec_num * ncopies,
7059						  vectype_in, i * ncopies + j);
7060		  build_vect_cond_expr (code, vop, mask, gsi);
7061		}
7062
7063	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
7064						       vop[0], vop[1], vop[2]);
7065	      new_temp = make_ssa_name (vec_dest, new_stmt);
7066	      gimple_assign_set_lhs (new_stmt, new_temp);
7067	      new_stmt_info
7068		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7069	    }
7070
7071          if (slp_node)
7072	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7073        }
7074
7075      if (slp_node || single_defuse_cycle)
7076        continue;
7077
7078      if (j == 0)
7079	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7080      else
7081	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7082
7083      prev_stmt_info = new_stmt_info;
7084    }
7085
7086  if (single_defuse_cycle && !slp_node)
7087    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7088
7089  return true;
7090}
7091
7092/* Transform phase of a cycle PHI.  */
7093
7094bool
7095vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7096			  slp_tree slp_node, slp_instance slp_node_instance)
7097{
7098  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7099  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7100  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7101  int i;
7102  int ncopies;
7103  stmt_vec_info prev_phi_info;
7104  int j;
7105  bool nested_cycle = false;
7106  int vec_num;
7107
7108  if (nested_in_vect_loop_p (loop, stmt_info))
7109    {
7110      loop = loop->inner;
7111      nested_cycle = true;
7112    }
7113
7114  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7115  reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7116  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7117  gcc_assert (reduc_info->is_reduc_info);
7118
7119  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7120      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7121    /* Leave the scalar phi in place.  */
7122    return true;
7123
7124  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7125  /* For a nested cycle we do not fill the above.  */
7126  if (!vectype_in)
7127    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7128  gcc_assert (vectype_in);
7129
7130  if (slp_node)
7131    {
7132      /* The size vect_schedule_slp_instance computes is off for us.  */
7133      vec_num = vect_get_num_vectors
7134	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7135	   * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7136      ncopies = 1;
7137    }
7138  else
7139    {
7140      vec_num = 1;
7141      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7142    }
7143
7144  /* Check whether we should use a single PHI node and accumulate
7145     vectors to one before the backedge.  */
7146  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7147    ncopies = 1;
7148
7149  /* Create the destination vector  */
7150  gphi *phi = as_a <gphi *> (stmt_info->stmt);
7151  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7152					       vectype_out);
7153
7154  /* Get the loop-entry arguments.  */
7155  tree vec_initial_def;
7156  auto_vec<tree> vec_initial_defs;
7157  if (slp_node)
7158    {
7159      vec_initial_defs.reserve (vec_num);
7160      gcc_assert (slp_node == slp_node_instance->reduc_phis);
7161      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7162      tree neutral_op
7163	= neutral_op_for_slp_reduction (slp_node, vectype_out,
7164					STMT_VINFO_REDUC_CODE (reduc_info),
7165					first != NULL);
7166      get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
7167				      &vec_initial_defs, vec_num,
7168				      first != NULL, neutral_op);
7169    }
7170  else
7171    {
7172      /* Get at the scalar def before the loop, that defines the initial
7173	 value of the reduction variable.  */
7174      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7175						loop_preheader_edge (loop));
7176      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7177	 and we can't use zero for induc_val, use initial_def.  Similarly
7178	 for REDUC_MIN and initial_def larger than the base.  */
7179      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7180	{
7181	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7182	  if (TREE_CODE (initial_def) == INTEGER_CST
7183	      && !integer_zerop (induc_val)
7184	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7185		   && tree_int_cst_lt (initial_def, induc_val))
7186		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7187		      && tree_int_cst_lt (induc_val, initial_def))))
7188	    {
7189	      induc_val = initial_def;
7190	      /* Communicate we used the initial_def to epilogue
7191		 generation.  */
7192	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7193	    }
7194	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7195	}
7196      else if (nested_cycle)
7197	{
7198	  /* Do not use an adjustment def as that case is not supported
7199	     correctly if ncopies is not one.  */
7200	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
7201							  reduc_stmt_info);
7202	}
7203      else
7204	{
7205	  tree adjustment_def = NULL_TREE;
7206	  tree *adjustment_defp = &adjustment_def;
7207	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7208	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7209	    adjustment_defp = NULL;
7210	  vec_initial_def
7211	    = get_initial_def_for_reduction (reduc_stmt_info, code,
7212					     initial_def, adjustment_defp);
7213	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7214	}
7215      vec_initial_defs.create (1);
7216      vec_initial_defs.quick_push (vec_initial_def);
7217    }
7218
7219  /* Generate the reduction PHIs upfront.  */
7220  prev_phi_info = NULL;
7221  for (i = 0; i < vec_num; i++)
7222    {
7223      tree vec_init_def = vec_initial_defs[i];
7224      for (j = 0; j < ncopies; j++)
7225	{
7226	  /* Create the reduction-phi that defines the reduction
7227	     operand.  */
7228	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
7229	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7230
7231	  /* Set the loop-entry arg of the reduction-phi.  */
7232	  if (j != 0 && nested_cycle)
7233	    vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7234							   vec_init_def);
7235	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7236		       UNKNOWN_LOCATION);
7237
7238	  /* The loop-latch arg is set in epilogue processing.  */
7239
7240	  if (slp_node)
7241	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7242	  else
7243	    {
7244	      if (j == 0)
7245		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7246	      else
7247		STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7248	      prev_phi_info = new_phi_info;
7249	    }
7250	}
7251    }
7252
7253  return true;
7254}
7255
7256/* Vectorizes LC PHIs (loop-closed, single-argument PHI nodes on loop exits).  */
7257
7258bool
7259vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7260		     slp_tree slp_node)
7261{
7262  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7263  if (!loop_vinfo
7264      || !is_a <gphi *> (stmt_info->stmt)
7265      || gimple_phi_num_args (stmt_info->stmt) != 1)
7266    return false;
7267
7268  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7269      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7270    return false;
7271
7272  if (!vec_stmt) /* transformation not required.  */
7273    {
7274      STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7275      return true;
7276    }
7277
7278  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7279  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7280  basic_block bb = gimple_bb (stmt_info->stmt);
7281  edge e = single_pred_edge (bb);
7282  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7283  vec<tree> vec_oprnds = vNULL;
7284  vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7285		     stmt_info, &vec_oprnds, NULL, slp_node);
7286  if (slp_node)
7287    {
7288      unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7289      gcc_assert (vec_oprnds.length () == vec_num);
7290      for (unsigned i = 0; i < vec_num; i++)
7291	{
7292	  /* Create the vectorized LC PHI node.  */
7293	  gphi *new_phi = create_phi_node (vec_dest, bb);
7294	  add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7295	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7296	  SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7297	}
7298    }
7299  else
7300    {
7301      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7302      stmt_vec_info prev_phi_info = NULL;
7303      for (unsigned i = 0; i < ncopies; i++)
7304	{
7305	  if (i != 0)
7306	    vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7307	  /* Create the vectorized LC PHI node.  */
7308	  gphi *new_phi = create_phi_node (vec_dest, bb);
7309	  add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7310	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7311	  if (i == 0)
7312	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7313	  else
7314	    STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7315	  prev_phi_info = new_phi_info;
7316	}
7317    }
7318  vec_oprnds.release ();
7319
7320  return true;
7321}
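
/* A sketch (hypothetical source) of the loop-closed PHIs handled above:

     loop:
       x_1 = ...;
       if (cond) goto loop;
     x_2 = PHI <x_1>;   <-- single-argument LC PHI in the exit block

   Vectorizing it simply forwards the vector definition(s) of x_1 through
   matching single-argument vector PHIs in the exit block.  */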
7322
7323
7324/* Function vect_min_worthwhile_factor.
7325
7326   For a loop where we could vectorize the operation indicated by CODE,
7327   return the minimum vectorization factor that makes it worthwhile
7328   to use generic vectors.  */
7329static unsigned int
7330vect_min_worthwhile_factor (enum tree_code code)
7331{
7332  switch (code)
7333    {
7334    case PLUS_EXPR:
7335    case MINUS_EXPR:
7336    case NEGATE_EXPR:
7337      return 4;
7338
7339    case BIT_AND_EXPR:
7340    case BIT_IOR_EXPR:
7341    case BIT_XOR_EXPR:
7342    case BIT_NOT_EXPR:
7343      return 2;
7344
7345    default:
7346      return INT_MAX;
7347    }
7348}
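
/* For example (the thresholds above are heuristics): with a constant
   vectorization factor of 4, emulating a PLUS_EXPR without real SIMD
   support is considered worthwhile (4 >= 4), whereas with a factor of 2
   vect_worthwhile_without_simd_p below returns false for it.  */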
7349
7350/* Return true if VINFO indicates we are doing loop vectorization and if
7351   it is worth decomposing CODE operations into scalar operations for
7352   that loop's vectorization factor.  */
7353
7354bool
7355vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7356{
7357  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7358  unsigned HOST_WIDE_INT value;
7359  return (loop_vinfo
7360	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7361	  && value >= vect_min_worthwhile_factor (code));
7362}
7363
7364/* Function vectorizable_induction
7365
7366   Check if STMT_INFO performs an induction computation that can be vectorized.
7367   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7368   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7369   Return true if STMT_INFO is vectorizable in this way.  */
7370
7371bool
7372vectorizable_induction (stmt_vec_info stmt_info,
7373			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7374			stmt_vec_info *vec_stmt, slp_tree slp_node,
7375			stmt_vector_for_cost *cost_vec)
7376{
7377  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7378  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7379  unsigned ncopies;
7380  bool nested_in_vect_loop = false;
7381  class loop *iv_loop;
7382  tree vec_def;
7383  edge pe = loop_preheader_edge (loop);
7384  basic_block new_bb;
7385  tree new_vec, vec_init, vec_step, t;
7386  tree new_name;
7387  gimple *new_stmt;
7388  gphi *induction_phi;
7389  tree induc_def, vec_dest;
7390  tree init_expr, step_expr;
7391  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7392  unsigned i;
7393  tree expr;
7394  gimple_seq stmts;
7395  imm_use_iterator imm_iter;
7396  use_operand_p use_p;
7397  gimple *exit_phi;
7398  edge latch_e;
7399  tree loop_arg;
7400  gimple_stmt_iterator si;
7401
7402  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7403  if (!phi)
7404    return false;
7405
7406  if (!STMT_VINFO_RELEVANT_P (stmt_info))
7407    return false;
7408
7409  /* Make sure it was recognized as induction computation.  */
7410  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7411    return false;
7412
7413  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7414  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7415
7416  if (slp_node)
7417    ncopies = 1;
7418  else
7419    ncopies = vect_get_num_copies (loop_vinfo, vectype);
7420  gcc_assert (ncopies >= 1);
7421
7422  /* FORNOW. These restrictions should be relaxed.  */
7423  if (nested_in_vect_loop_p (loop, stmt_info))
7424    {
7425      imm_use_iterator imm_iter;
7426      use_operand_p use_p;
7427      gimple *exit_phi;
7428      edge latch_e;
7429      tree loop_arg;
7430
7431      if (ncopies > 1)
7432	{
7433	  if (dump_enabled_p ())
7434	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7435			     "multiple types in nested loop.\n");
7436	  return false;
7437	}
7438
7439      /* FORNOW: outer loop induction with SLP not supported.  */
7440      if (STMT_SLP_TYPE (stmt_info))
7441	return false;
7442
7443      exit_phi = NULL;
7444      latch_e = loop_latch_edge (loop->inner);
7445      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7446      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7447	{
7448	  gimple *use_stmt = USE_STMT (use_p);
7449	  if (is_gimple_debug (use_stmt))
7450	    continue;
7451
7452	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7453	    {
7454	      exit_phi = use_stmt;
7455	      break;
7456	    }
7457	}
7458      if (exit_phi)
7459	{
7460	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7461	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7462		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7463	    {
7464	      if (dump_enabled_p ())
7465		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7466				 "inner-loop induction only used outside "
7467				 "of the outer vectorized loop.\n");
7468	      return false;
7469	    }
7470	}
7471
7472      nested_in_vect_loop = true;
7473      iv_loop = loop->inner;
7474    }
7475  else
7476    iv_loop = loop;
7477  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7478
7479  if (slp_node && !nunits.is_constant ())
7480    {
7481      /* The current SLP code creates the initial value element-by-element.  */
7482      if (dump_enabled_p ())
7483	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7484			 "SLP induction not supported for variable-length"
7485			 " vectors.\n");
7486      return false;
7487    }
7488
7489  if (!vec_stmt) /* transformation not required.  */
7490    {
7491      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7492      DUMP_VECT_SCOPE ("vectorizable_induction");
7493      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7494      return true;
7495    }
7496
7497  /* Transform.  */
7498
7499  /* Compute a vector variable, initialized with the first VF values of
7500     the induction variable.  E.g., for an iv with IV_PHI='X' and
7501     evolution S, for a vector of 4 units, we want to compute:
7502     [X, X + S, X + 2*S, X + 3*S].  */
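
  /* As a concrete (illustrative) instance with assumed values: for an
     induction with initial value X = 5, step S = 3 and VF = 4, the code
     below builds

	 vec_init = { 5, 8, 11, 14 }
	 vec_step = { 12, 12, 12, 12 }   (i.e. VF * S in every lane)

     and the loop then maintains the def-use cycle described further down:

	 vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
	 vec_loop = vec_iv + vec_step;  */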
7503
7504  if (dump_enabled_p ())
7505    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7506
7507  latch_e = loop_latch_edge (iv_loop);
7508  loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7509
7510  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7511  gcc_assert (step_expr != NULL_TREE);
7512  tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7513
7514  pe = loop_preheader_edge (iv_loop);
7515  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7516				     loop_preheader_edge (iv_loop));
7517
7518  stmts = NULL;
7519  if (!nested_in_vect_loop)
7520    {
7521      /* Convert the initial value to the IV update type.  */
7522      tree new_type = TREE_TYPE (step_expr);
7523      init_expr = gimple_convert (&stmts, new_type, init_expr);
7524
7525      /* If we are using the loop mask to "peel" for alignment then we need
7526	 to adjust the start value here.  */
7527      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7528      if (skip_niters != NULL_TREE)
7529	{
7530	  if (FLOAT_TYPE_P (vectype))
7531	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7532					skip_niters);
7533	  else
7534	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7535	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7536					 skip_niters, step_expr);
7537	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7538				    init_expr, skip_step);
7539	}
7540    }
7541
7542  if (stmts)
7543    {
7544      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7545      gcc_assert (!new_bb);
7546    }
7547
7548  /* Find the first insertion point in the BB.  */
7549  basic_block bb = gimple_bb (phi);
7550  si = gsi_after_labels (bb);
7551
7552  /* For SLP induction we have to generate several IVs: for example, with
7553     group size 3 we need [i, i, i, i + S], [i + S, i + S, i + 2*S, i + 2*S]
7554     and [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7555     [VF*S, VF*S, VF*S, VF*S] for all of them.  */
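
  /* Continuing the illustrative group-size-3, four-lane example above
     (a sketch with assumed numbers, not from the original sources), the
     code below creates

	 nivs = least_common_multiple (3, 4) / 4 = 3

     initial IV vectors, each updated in the loop by the uniform
     [VF*S, ...] step vector.  If more vector stmts are required
     (nvects > nivs), they re-use those IVs advanced by
     least_common_multiple (3, 4) / 3 = 4 scalar steps, i.e. by
     [4*S, 4*S, 4*S, 4*S] (see "Re-use IVs when we can" below).  */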
7556  if (slp_node)
7557    {
7558      /* Enforced above.  */
7559      unsigned int const_nunits = nunits.to_constant ();
7560
7561      /* Generate [VF*S, VF*S, ... ].  */
7562      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7563	{
7564	  expr = build_int_cst (integer_type_node, vf);
7565	  expr = fold_convert (TREE_TYPE (step_expr), expr);
7566	}
7567      else
7568	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7569      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7570			      expr, step_expr);
7571      if (! CONSTANT_CLASS_P (new_name))
7572	new_name = vect_init_vector (stmt_info, new_name,
7573				     TREE_TYPE (step_expr), NULL);
7574      new_vec = build_vector_from_val (step_vectype, new_name);
7575      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7576
7577      /* Now generate the IVs.  */
7578      unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7579      unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7580      unsigned elts = const_nunits * nvects;
7581      unsigned nivs = least_common_multiple (group_size,
7582					     const_nunits) / const_nunits;
7583      gcc_assert (elts % group_size == 0);
7584      tree elt = init_expr;
7585      unsigned ivn;
7586      for (ivn = 0; ivn < nivs; ++ivn)
7587	{
7588	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7589	  stmts = NULL;
7590	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7591	    {
7592	      if (ivn*const_nunits + eltn >= group_size
7593		  && (ivn * const_nunits + eltn) % group_size == 0)
7594		elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7595				    elt, step_expr);
7596	      elts.quick_push (elt);
7597	    }
7598	  vec_init = gimple_build_vector (&stmts, &elts);
7599	  vec_init = gimple_convert (&stmts, vectype, vec_init);
7600	  if (stmts)
7601	    {
7602	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7603	      gcc_assert (!new_bb);
7604	    }
7605
7606	  /* Create the induction-phi that defines the induction-operand.  */
7607	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7608	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7609	  stmt_vec_info induction_phi_info
7610	    = loop_vinfo->add_stmt (induction_phi);
7611	  induc_def = PHI_RESULT (induction_phi);
7612
7613	  /* Create the iv update inside the loop  */
7614	  gimple_seq stmts = NULL;
7615	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7616	  vec_def = gimple_build (&stmts,
7617				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7618	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7619	  loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7620	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7621
7622	  /* Set the arguments of the phi node:  */
7623	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7624	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7625		       UNKNOWN_LOCATION);
7626
7627	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7628	}
7629
7630      /* Re-use IVs when we can.  */
7631      if (ivn < nvects)
7632	{
7633	  unsigned vfp
7634	    = least_common_multiple (group_size, const_nunits) / group_size;
7635	  /* Generate [VF'*S, VF'*S, ... ].  */
7636	  if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7637	    {
7638	      expr = build_int_cst (integer_type_node, vfp);
7639	      expr = fold_convert (TREE_TYPE (step_expr), expr);
7640	    }
7641	  else
7642	    expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7643	  new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7644				  expr, step_expr);
7645	  if (! CONSTANT_CLASS_P (new_name))
7646	    new_name = vect_init_vector (stmt_info, new_name,
7647					 TREE_TYPE (step_expr), NULL);
7648	  new_vec = build_vector_from_val (step_vectype, new_name);
7649	  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7650	  for (; ivn < nvects; ++ivn)
7651	    {
7652	      gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7653	      tree def;
7654	      if (gimple_code (iv) == GIMPLE_PHI)
7655		def = gimple_phi_result (iv);
7656	      else
7657		def = gimple_assign_lhs (iv);
7658	      gimple_seq stmts = NULL;
7659	      def = gimple_convert (&stmts, step_vectype, def);
7660	      def = gimple_build (&stmts,
7661				  PLUS_EXPR, step_vectype, def, vec_step);
7662	      def = gimple_convert (&stmts, vectype, def);
7663	      if (gimple_code (iv) == GIMPLE_PHI)
7664		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7665	      else
7666		{
7667		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7668		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7669		}
7670	      SLP_TREE_VEC_STMTS (slp_node).quick_push
7671		(loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7672	    }
7673	}
7674
7675      return true;
7676    }
7677
7678  /* Create the vector that holds the initial_value of the induction.  */
7679  if (nested_in_vect_loop)
7680    {
7681      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7682	 been created during vectorization of previous stmts.  We obtain it
7683	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7684      vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7685      /* If the initial value is not of proper type, convert it.  */
7686      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7687	{
7688	  new_stmt
7689	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
7690							  vect_simple_var,
7691							  "vec_iv_"),
7692				   VIEW_CONVERT_EXPR,
7693				   build1 (VIEW_CONVERT_EXPR, vectype,
7694					   vec_init));
7695	  vec_init = gimple_assign_lhs (new_stmt);
7696	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7697						 new_stmt);
7698	  gcc_assert (!new_bb);
7699	  loop_vinfo->add_stmt (new_stmt);
7700	}
7701    }
7702  else
7703    {
7704      /* iv_loop is the loop to be vectorized. Create:
7705	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7706      stmts = NULL;
7707      new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7708
7709      unsigned HOST_WIDE_INT const_nunits;
7710      if (nunits.is_constant (&const_nunits))
7711	{
7712	  tree_vector_builder elts (step_vectype, const_nunits, 1);
7713	  elts.quick_push (new_name);
7714	  for (i = 1; i < const_nunits; i++)
7715	    {
7716	      /* Create: new_name_i = new_name + step_expr  */
7717	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7718				       new_name, step_expr);
7719	      elts.quick_push (new_name);
7720	    }
7721	  /* Create a vector from [new_name_0, new_name_1, ...,
7722	     new_name_nunits-1]  */
7723	  vec_init = gimple_build_vector (&stmts, &elts);
7724	}
7725      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7726	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
7727	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7728				 new_name, step_expr);
7729      else
7730	{
7731	  /* Build:
7732	        [base, base, base, ...]
7733		+ (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7734	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7735	  gcc_assert (flag_associative_math);
7736	  tree index = build_index_vector (step_vectype, 0, 1);
7737	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7738							new_name);
7739	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7740							step_expr);
7741	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7742	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7743				   vec_init, step_vec);
7744	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7745				   vec_init, base_vec);
7746	}
7747      vec_init = gimple_convert (&stmts, vectype, vec_init);
7748
7749      if (stmts)
7750	{
7751	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7752	  gcc_assert (!new_bb);
7753	}
7754    }
7755
7756
7757  /* Create the vector that holds the step of the induction.  */
7758  if (nested_in_vect_loop)
7759    /* iv_loop is nested in the loop to be vectorized. Generate:
7760       vec_step = [S, S, S, S]  */
7761    new_name = step_expr;
7762  else
7763    {
7764      /* iv_loop is the loop to be vectorized. Generate:
7765	  vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7766      gimple_seq seq = NULL;
7767      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7768	{
7769	  expr = build_int_cst (integer_type_node, vf);
7770	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7771	}
7772      else
7773	expr = build_int_cst (TREE_TYPE (step_expr), vf);
7774      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7775			       expr, step_expr);
7776      if (seq)
7777	{
7778	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7779	  gcc_assert (!new_bb);
7780	}
7781    }
7782
7783  t = unshare_expr (new_name);
7784  gcc_assert (CONSTANT_CLASS_P (new_name)
7785	      || TREE_CODE (new_name) == SSA_NAME);
7786  new_vec = build_vector_from_val (step_vectype, t);
7787  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7788
7789
7790  /* Create the following def-use cycle:
7791     loop prolog:
7792         vec_init = ...
7793	 vec_step = ...
7794     loop:
7795         vec_iv = PHI <vec_init, vec_loop>
7796         ...
7797         STMT
7798         ...
7799         vec_loop = vec_iv + vec_step;  */
7800
7801  /* Create the induction-phi that defines the induction-operand.  */
7802  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7803  induction_phi = create_phi_node (vec_dest, iv_loop->header);
7804  stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7805  induc_def = PHI_RESULT (induction_phi);
7806
7807  /* Create the iv update inside the loop  */
7808  stmts = NULL;
7809  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7810  vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7811  vec_def = gimple_convert (&stmts, vectype, vec_def);
7812  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7813  new_stmt = SSA_NAME_DEF_STMT (vec_def);
7814  stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7815
7816  /* Set the arguments of the phi node:  */
7817  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7818  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7819	       UNKNOWN_LOCATION);
7820
7821  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7822
7823  /* If the vectorization factor (VF) is bigger than the number
7824     of elements that we can fit in a vectype (nunits), we have to generate
7825     more than one vector stmt, i.e. we need to "unroll" the
7826     vector stmt by a factor of VF/nunits.  For more details see the
7827     documentation in vectorizable_operation.  */
7828
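  /* Illustrative sketch (assumed values, not from the original sources):
     with nunits = 4 and ncopies = 2 the code below produces one extra
     copy per vector-loop iteration:

	 copy 0:  vec_iv
	 copy 1:  vec_iv + { 4*S, 4*S, 4*S, 4*S }   (step scaled by nunits)

     each copy being chained to the previous one through
     STMT_VINFO_RELATED_STMT.  */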
7829  if (ncopies > 1)
7830    {
7831      gimple_seq seq = NULL;
7832      stmt_vec_info prev_stmt_vinfo;
7833      /* FORNOW. This restriction should be relaxed.  */
7834      gcc_assert (!nested_in_vect_loop);
7835
7836      /* Create the vector that holds the step of the induction.  */
7837      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7838	{
7839	  expr = build_int_cst (integer_type_node, nunits);
7840	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7841	}
7842      else
7843	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7844      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7845			       expr, step_expr);
7846      if (seq)
7847	{
7848	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7849	  gcc_assert (!new_bb);
7850	}
7851
7852      t = unshare_expr (new_name);
7853      gcc_assert (CONSTANT_CLASS_P (new_name)
7854		  || TREE_CODE (new_name) == SSA_NAME);
7855      new_vec = build_vector_from_val (step_vectype, t);
7856      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7857
7858      vec_def = induc_def;
7859      prev_stmt_vinfo = induction_phi_info;
7860      for (i = 1; i < ncopies; i++)
7861	{
7862	  /* vec_i = vec_prev + vec_step  */
7863	  gimple_seq stmts = NULL;
7864	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7865	  vec_def = gimple_build (&stmts,
7866				  PLUS_EXPR, step_vectype, vec_def, vec_step);
7867	  vec_def = gimple_convert (&stmts, vectype, vec_def);
7868
7869	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7870	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
7871	  new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7872	  STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7873	  prev_stmt_vinfo = new_stmt_info;
7874	}
7875    }
7876
7877  if (nested_in_vect_loop)
7878    {
7879      /* Find the loop-closed exit-phi of the induction, and record
7880         the final vector of induction results:  */
7881      exit_phi = NULL;
7882      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7883        {
7884	  gimple *use_stmt = USE_STMT (use_p);
7885	  if (is_gimple_debug (use_stmt))
7886	    continue;
7887
7888	  if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7889	    {
7890	      exit_phi = use_stmt;
7891	      break;
7892	    }
7893        }
7894      if (exit_phi)
7895	{
7896	  stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7897	  /* FORNOW. Currently not supporting the case that an inner-loop induction
7898	     is not used in the outer-loop (i.e. only outside the outer-loop).  */
7899	  gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7900		      && !STMT_VINFO_LIVE_P (stmt_vinfo));
7901
7902	  STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7903	  if (dump_enabled_p ())
7904	    dump_printf_loc (MSG_NOTE, vect_location,
7905			     "vector of inductions after inner-loop:%G",
7906			     new_stmt);
7907	}
7908    }
7909
7910
7911  if (dump_enabled_p ())
7912    dump_printf_loc (MSG_NOTE, vect_location,
7913		     "transform induction: created def-use cycle: %G%G",
7914		     induction_phi, SSA_NAME_DEF_STMT (vec_def));
7915
7916  return true;
7917}
7918
7919/* Function vectorizable_live_operation.
7920
7921   STMT_INFO computes a value that is used outside the loop.  Check if
7922   it can be supported.  */
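
/* An illustrative example of such a live operation (a sketch, not part of
   the original sources):

       for (i = 0; i < n; i++)
	 last = a[i];
       use (last);

   After vectorization "last" is live outside the loop, so its final value
   is extracted from the last vector of results, either with a
   BIT_FIELD_REF of the last lane or, for fully-masked loops, with an
   EXTRACT_LAST internal function call (see the code below).  */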
7923
7924bool
7925vectorizable_live_operation (stmt_vec_info stmt_info,
7926			     gimple_stmt_iterator *gsi,
7927			     slp_tree slp_node, slp_instance slp_node_instance,
7928			     int slp_index, bool vec_stmt_p,
7929			     stmt_vector_for_cost *)
7930{
7931  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7932  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7933  imm_use_iterator imm_iter;
7934  tree lhs, lhs_type, bitsize, vec_bitsize;
7935  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7936  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7937  int ncopies;
7938  gimple *use_stmt;
7939  auto_vec<tree> vec_oprnds;
7940  int vec_entry = 0;
7941  poly_uint64 vec_index = 0;
7942
7943  gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7944
7945  /* Due to how we generate code for SLP_TREE_TWO_OPERATORS we cannot
7946     vectorize live operations out of it.  */
7947  if (slp_node && SLP_TREE_TWO_OPERATORS (slp_node))
7948    return false;
7949
7950  /* If a stmt of a reduction is live, vectorize it via
7951     vect_create_epilog_for_reduction.  vectorizable_reduction assessed
7952     validity so just trigger the transform here.  */
7953  if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7954    {
7955      if (!vec_stmt_p)
7956	return true;
7957      if (slp_node)
7958	{
7959	  /* For reduction chains the meta-info is attached to
7960	     the group leader.  */
7961	  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7962	    stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7963	  /* For SLP reductions we vectorize the epilogue for
7964	     all involved stmts together.  */
7965	  else if (slp_index != 0)
7966	    return true;
7967	}
7968      stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7969      gcc_assert (reduc_info->is_reduc_info);
7970      if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7971	  || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7972	return true;
7973      vect_create_epilog_for_reduction (stmt_info, slp_node,
7974					slp_node_instance);
7975      return true;
7976    }
7977
7978  /* FORNOW.  CHECKME.  */
7979  if (nested_in_vect_loop_p (loop, stmt_info))
7980    return false;
7981
7982  /* If STMT is not relevant and it is a simple assignment and its inputs are
7983     invariant then it can remain in place, unvectorized.  The original last
7984     scalar value that it computes will be used.  */
7985  if (!STMT_VINFO_RELEVANT_P (stmt_info))
7986    {
7987      gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7988      if (dump_enabled_p ())
7989	dump_printf_loc (MSG_NOTE, vect_location,
7990			 "statement is simple and uses invariant.  Leaving in "
7991			 "place.\n");
7992      return true;
7993    }
7994
7995  if (slp_node)
7996    ncopies = 1;
7997  else
7998    ncopies = vect_get_num_copies (loop_vinfo, vectype);
7999
8000  if (slp_node)
8001    {
8002      gcc_assert (slp_index >= 0);
8003
8004      int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8005      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8006
8007      /* Get the last occurrence of the scalar index from the concatenation of
8008	 all the slp vectors. Calculate which slp vector it is and the index
8009	 within.  */
8010      poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8011
8012      /* Calculate which vector contains the result, and which lane of
8013	 that vector we need.  */
8014      if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8015	{
8016	  if (dump_enabled_p ())
8017	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8018			     "Cannot determine which vector holds the"
8019			     " final result.\n");
8020	  return false;
8021	}
8022    }
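
  /* Worked (illustrative) instance of the computation above, with assumed
     numbers: num_scalar = 6, num_vec = 2, nunits = 4 and slp_index = 2 give

	 pos       = 2*4 - 6 + 2 = 4
	 vec_entry = 4 / 4 = 1,  vec_index = 4 % 4 = 0

     i.e. lane 0 of the second SLP vector holds the final result.  */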
8023
8024  if (!vec_stmt_p)
8025    {
8026      /* No transformation required.  */
8027      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8028	{
8029	  if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8030					       OPTIMIZE_FOR_SPEED))
8031	    {
8032	      if (dump_enabled_p ())
8033		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8034				 "can't use a fully-masked loop because "
8035				 "the target doesn't support extract last "
8036				 "reduction.\n");
8037	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8038	    }
8039	  else if (slp_node)
8040	    {
8041	      if (dump_enabled_p ())
8042		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8043				 "can't use a fully-masked loop because an "
8044				 "SLP statement is live after the loop.\n");
8045	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8046	    }
8047	  else if (ncopies > 1)
8048	    {
8049	      if (dump_enabled_p ())
8050		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8051				 "can't use a fully-masked loop because"
8052				 " ncopies is greater than 1.\n");
8053	      LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8054	    }
8055	  else
8056	    {
8057	      gcc_assert (ncopies == 1 && !slp_node);
8058	      vect_record_loop_mask (loop_vinfo,
8059				     &LOOP_VINFO_MASKS (loop_vinfo),
8060				     1, vectype, NULL);
8061	    }
8062	}
8063      return true;
8064    }
8065
8066  /* Use the lhs of the original scalar statement.  */
8067  gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8068
8069  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8070	: gimple_get_lhs (stmt);
8071  lhs_type = TREE_TYPE (lhs);
8072
8073  bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8074	     ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8075	     : TYPE_SIZE (TREE_TYPE (vectype)));
8076  vec_bitsize = TYPE_SIZE (vectype);
8077
8078  /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8079  tree vec_lhs, bitstart;
8080  if (slp_node)
8081    {
8082      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8083
8084      /* Get the correct slp vectorized stmt.  */
8085      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8086      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8087	vec_lhs = gimple_phi_result (phi);
8088      else
8089	vec_lhs = gimple_get_lhs (vec_stmt);
8090
8091      /* Get entry to use.  */
8092      bitstart = bitsize_int (vec_index);
8093      bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8094    }
8095  else
8096    {
8097      enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8098      vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8099      gcc_checking_assert (ncopies == 1
8100			   || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8101
8102      /* For multiple copies, get the last copy.  */
8103      for (int i = 1; i < ncopies; ++i)
8104	vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8105
8106      /* Get the last lane in the vector.  */
8107      bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8108    }
8109
8110  /* To ensure that VEC_LHS, as used by the lane-extraction stmts, satisfies
8111     the loop-closed PHI requirement, insert one phi node for it.  It looks like:
8112	 loop;
8113       BB:
8114	 # lhs' = PHI <lhs>
8115     ==>
8116	 loop;
8117       BB:
8118	 # vec_lhs' = PHI <vec_lhs>
8119	 new_tree = lane_extract <vec_lhs', ...>;
8120	 lhs' = new_tree;  */
8121
8122  basic_block exit_bb = single_exit (loop)->dest;
8123  gcc_assert (single_pred_p (exit_bb));
8124
8125  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8126  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8127  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8128
8129  gimple_seq stmts = NULL;
8130  tree new_tree;
8131  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8132    {
8133      /* Emit:
8134
8135	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8136
8137	 where VEC_LHS is the vectorized live-out result and MASK is
8138	 the loop mask for the final iteration.  */
8139      gcc_assert (ncopies == 1 && !slp_node);
8140      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8141      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8142				      vectype, 0);
8143      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8144				      mask, vec_lhs_phi);
8145
8146      /* Convert the extracted vector element to the required scalar type.  */
8147      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8148    }
8149  else
8150    {
8151      tree bftype = TREE_TYPE (vectype);
8152      if (VECTOR_BOOLEAN_TYPE_P (vectype))
8153	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8154      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8155      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8156				       &stmts, true, NULL_TREE);
8157    }
8158
8159  if (stmts)
8160    {
8161      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8162      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8163
8164      /* Remove the existing phi node that uses LHS and assign NEW_TREE to its result instead.  */
8165      tree lhs_phi = NULL_TREE;
8166      gimple_stmt_iterator gsi;
8167      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8168	{
8169	  gimple *phi = gsi_stmt (gsi);
8170	  if (gimple_phi_arg_def (phi, 0) == lhs)
8171	    {
8172	      remove_phi_node (&gsi, false);
8173	      lhs_phi = gimple_phi_result (phi);
8174	      gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8175	      gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8176	      break;
8177	    }
8178	}
8179    }
8180
8181  /* Replace uses of LHS with the newly computed result.  If the use stmt is
8182     a single-argument PHI, just replace all uses of the PHI result.  This is
8183     necessary because the LCSSA PHI defining LHS may precede the new stmt.  */
8184  use_operand_p use_p;
8185  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8186    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8187	&& !is_gimple_debug (use_stmt))
8188    {
8189      if (gimple_code (use_stmt) == GIMPLE_PHI
8190	  && gimple_phi_num_args (use_stmt) == 1)
8191	{
8192	  replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8193	}
8194      else
8195	{
8196	  FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8197	    SET_USE (use_p, new_tree);
8198	}
8199      update_stmt (use_stmt);
8200    }
8201
8202  return true;
8203}
8204
8205/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
8206
8207static void
8208vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8209{
8210  ssa_op_iter op_iter;
8211  imm_use_iterator imm_iter;
8212  def_operand_p def_p;
8213  gimple *ustmt;
8214
8215  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8216    {
8217      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8218	{
8219	  basic_block bb;
8220
8221	  if (!is_gimple_debug (ustmt))
8222	    continue;
8223
8224	  bb = gimple_bb (ustmt);
8225
8226	  if (!flow_bb_inside_loop_p (loop, bb))
8227	    {
8228	      if (gimple_debug_bind_p (ustmt))
8229		{
8230		  if (dump_enabled_p ())
8231		    dump_printf_loc (MSG_NOTE, vect_location,
8232                                     "killing debug use\n");
8233
8234		  gimple_debug_bind_reset_value (ustmt);
8235		  update_stmt (ustmt);
8236		}
8237	      else
8238		gcc_unreachable ();
8239	    }
8240	}
8241    }
8242}
8243
8244/* Given the loop represented by LOOP_VINFO, return true if the computation of
8245   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, and false
8246   otherwise.  */
8247
8248static bool
8249loop_niters_no_overflow (loop_vec_info loop_vinfo)
8250{
8251  /* Constant case.  */
8252  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8253    {
8254      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8255      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8256
8257      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8258      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8259      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8260	return true;
8261    }
8262
8263  widest_int max;
8264  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8265  /* Check the upper bound of loop niters.  */
8266  if (get_max_loop_iterations (loop, &max))
8267    {
8268      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8269      signop sgn = TYPE_SIGN (type);
8270      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8271      if (max < type_max)
8272	return true;
8273    }
8274  return false;
8275}
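
/* Illustrative instance (assumed values, not from the original sources):
   if LOOP_VINFO_NITERS has a 32-bit unsigned type and
   get_max_loop_iterations reports an upper bound of, say, 1000 latch
   iterations, then 1000 < 2^32 - 1, so NITERSM1 + 1 cannot wrap and the
   function above returns true.  */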
8276
8277/* Return a mask type with half as many elements as OLD_TYPE,
8278   given that it should have mode NEW_MODE.  */
8279
8280tree
8281vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8282{
8283  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8284  return build_truth_vector_type_for_mode (nunits, new_mode);
8285}
8286
8287/* Return a mask type with twice as many elements as OLD_TYPE,
8288   given that it should have mode NEW_MODE.  */
8289
8290tree
8291vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8292{
8293  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8294  return build_truth_vector_type_for_mode (nunits, new_mode);
8295}
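
/* For example (an illustrative sketch): starting from a mask type with
   16 elements, vect_halve_mask_nunits yields an 8-element mask type and
   vect_double_mask_nunits a 32-element one, in both cases built for the
   requested NEW_MODE via build_truth_vector_type_for_mode.  */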
8296
8297/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8298   contain a sequence of NVECTORS masks that each control a vector of type
8299   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
8300   these vector masks with the vector version of SCALAR_MASK.  */
8301
8302void
8303vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8304		       unsigned int nvectors, tree vectype, tree scalar_mask)
8305{
8306  gcc_assert (nvectors != 0);
8307  if (masks->length () < nvectors)
8308    masks->safe_grow_cleared (nvectors);
8309  rgroup_masks *rgm = &(*masks)[nvectors - 1];
8310  /* The number of scalars per iteration and the number of vectors are
8311     both compile-time constants.  */
8312  unsigned int nscalars_per_iter
8313    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8314		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8315
8316  if (scalar_mask)
8317    {
8318      scalar_cond_masked_key cond (scalar_mask, nvectors);
8319      loop_vinfo->scalar_cond_masked_set.add (cond);
8320    }
8321
8322  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8323    {
8324      rgm->max_nscalars_per_iter = nscalars_per_iter;
8325      rgm->mask_type = truth_type_for (vectype);
8326    }
8327}
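
/* Illustrative instance (assumed numbers): for a loop with vectorization
   factor 16, recording NVECTORS = 2 masks for a V8HI vectype updates the
   rgroup_masks entry masks[1] with

       nscalars_per_iter = 2 * 8 / 16 = 1

   and a mask type of truth_type_for (V8HI).  */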
8328
8329/* Given a complete set of masks MASKS, extract mask number INDEX
8330   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8331   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8332
8333   See the comment above vec_loop_masks for more details about the mask
8334   arrangement.  */
8335
8336tree
8337vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8338		    unsigned int nvectors, tree vectype, unsigned int index)
8339{
8340  rgroup_masks *rgm = &(*masks)[nvectors - 1];
8341  tree mask_type = rgm->mask_type;
8342
8343  /* Populate the rgroup's mask array, if this is the first time we've
8344     used it.  */
8345  if (rgm->masks.is_empty ())
8346    {
8347      rgm->masks.safe_grow_cleared (nvectors);
8348      for (unsigned int i = 0; i < nvectors; ++i)
8349	{
8350	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8351	  /* Provide a dummy definition until the real one is available.  */
8352	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8353	  rgm->masks[i] = mask;
8354	}
8355    }
8356
8357  tree mask = rgm->masks[index];
8358  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8359		TYPE_VECTOR_SUBPARTS (vectype)))
8360    {
8361      /* A loop mask for data type X can be reused for data type Y
8362	 if X has N times more elements than Y and if Y's elements
8363	 are N times bigger than X's.  In this case each sequence
8364	 of N elements in the loop mask will be all-zero or all-one.
8365	 We can then view-convert the mask so that each sequence of
8366	 N elements is replaced by a single element.  */
8367      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8368			      TYPE_VECTOR_SUBPARTS (vectype)));
8369      gimple_seq seq = NULL;
8370      mask_type = truth_type_for (vectype);
8371      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8372      if (seq)
8373	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8374    }
8375  return mask;
8376}
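
/* Illustrative instance of the re-use case above (a sketch with assumed
   types): a mask recorded for 16 x 8-bit data can serve 8 x 16-bit data,
   since each pair of mask elements is known to be all-zero or all-one;
   the code above then emits a VIEW_CONVERT_EXPR from the 16-element mask
   type to the 8-element truth type of the requested vectype.  */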
8377
8378/* Scale profiling counters by estimation for LOOP which is vectorized
8379   by factor VF.  */
8380
8381static void
8382scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8383{
8384  edge preheader = loop_preheader_edge (loop);
8385  /* Reduce loop iterations by the vectorization factor.  */
8386  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8387  profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8388
8389  if (freq_h.nonzero_p ())
8390    {
8391      profile_probability p;
8392
8393      /* Avoid dropping loop body profile counter to 0 because of zero count
8394	 in loop's preheader.  */
8395      if (!(freq_e == profile_count::zero ()))
8396        freq_e = freq_e.force_nonzero ();
8397      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8398      scale_loop_frequencies (loop, p);
8399    }
8400
8401  edge exit_e = single_exit (loop);
8402  exit_e->probability = profile_probability::always ()
8403				 .apply_scale (1, new_est_niter + 1);
8404
8405  edge exit_l = single_pred_edge (loop->latch);
8406  profile_probability prob = exit_l->probability;
8407  exit_l->probability = exit_e->probability.invert ();
8408  if (prob.initialized_p () && exit_l->probability.initialized_p ())
8409    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8410}
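
/* Illustrative instance (assumed numbers): for VF = 4 and an estimated
   100 scalar iterations, niter_for_unrolled_loop gives roughly 25 vector
   iterations; the body frequencies are then scaled so that the header
   count approaches preheader_count * (25 + 1), and the exit edge
   probability becomes 1 / (25 + 1).  */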
8411
8412/* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
8413   latch-edge values that were originally defined by it.  */
8414
8415static void
8416maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8417				     stmt_vec_info def_stmt_info)
8418{
8419  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8420  if (!def || TREE_CODE (def) != SSA_NAME)
8421    return;
8422  stmt_vec_info phi_info;
8423  imm_use_iterator iter;
8424  use_operand_p use_p;
8425  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8426    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8427      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8428	  && (phi_info = loop_vinfo->lookup_stmt (phi))
8429	  && STMT_VINFO_RELEVANT_P (phi_info)
8430	  && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8431	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8432	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8433	{
8434	  loop_p loop = gimple_bb (phi)->loop_father;
8435	  edge e = loop_latch_edge (loop);
8436	  if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8437	    {
8438	      stmt_vec_info phi_vec_info = STMT_VINFO_VEC_STMT (phi_info);
8439	      stmt_vec_info def_vec_info = STMT_VINFO_VEC_STMT (def_stmt_info);
8440	      do
8441		{
8442		  add_phi_arg (as_a <gphi *> (phi_vec_info->stmt),
8443			       gimple_get_lhs (def_vec_info->stmt), e,
8444			       gimple_phi_arg_location (phi, e->dest_idx));
8445		  phi_vec_info = STMT_VINFO_RELATED_STMT (phi_vec_info);
8446		  def_vec_info = STMT_VINFO_RELATED_STMT (def_vec_info);
8447		}
8448	      while (phi_vec_info);
8449	      gcc_assert (!def_vec_info);
8450	    }
8451	}
8452}
8453
8454/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8455   When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8456   stmt_vec_info.  */
8457
8458static bool
8459vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8460			  gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8461{
8462  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8463  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8464
8465  if (dump_enabled_p ())
8466    dump_printf_loc (MSG_NOTE, vect_location,
8467		     "------>vectorizing statement: %G", stmt_info->stmt);
8468
8469  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8470    vect_loop_kill_debug_uses (loop, stmt_info);
8471
8472  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8473      && !STMT_VINFO_LIVE_P (stmt_info))
8474    return false;
8475
8476  if (STMT_VINFO_VECTYPE (stmt_info))
8477    {
8478      poly_uint64 nunits
8479	= TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8480      if (!STMT_SLP_TYPE (stmt_info)
8481	  && maybe_ne (nunits, vf)
8482	  && dump_enabled_p ())
8483	/* For SLP, VF is set according to the unrolling factor, not the
8484	   vector size, hence this message is not valid for SLP.  */
8485	dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8486    }
8487
8488  /* Pure SLP statements have already been vectorized.  We still need
8489     to apply loop vectorization to hybrid SLP statements.  */
8490  if (PURE_SLP_STMT (stmt_info))
8491    return false;
8492
8493  if (dump_enabled_p ())
8494    dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8495
8496  if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8497    *seen_store = stmt_info;
8498
8499  return true;
8500}
8501
8502/* Helper function to pass to simplify_replace_tree so that trees that have
8503   an entry in the hash_map are replaced with their corresponding values.  */
8504
8505static tree
8506find_in_mapping (tree t, void *context)
8507{
8508  hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8509
8510  tree *value = mapping->get (t);
8511  return value ? *value : t;
8512}
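
/* For instance (illustrative only): with a mapping that contains
   x_1 -> x_7, find_in_mapping (x_1, &mapping) returns x_7, while any tree
   without an entry, say y_2, is returned unchanged.  */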
8513
8514/* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
8515   original loop that has now been vectorized.
8516
8517   The inits of the data_references need to be advanced with the number of
8518   iterations of the main loop.  This has been computed in vect_do_peeling and
8519   is stored in parameter ADVANCE.  We first restore the data_references'
8520   initial offsets with the values recorded in ORIG_DRS_INIT.
8521
8522   Since the loop_vec_info of this EPILOGUE was constructed for the original
8523   loop, its stmt_vec_infos all point to the original statements.  These need
8524   to be updated to point to their corresponding copies as well as the SSA_NAMES
8525   in their PATTERN_DEF_SEQs and RELATED_STMTs.
8526
8527   The data_reference's connections also need to be updated.  Their
8528   corresponding dr_vec_info need to be reconnected to the EPILOGUE's
8529   stmt_vec_infos, their statements need to point to their corresponding copy;
8530   if they are gather loads or scatter stores then their reference needs to be
8531   updated to point to its corresponding copy; and finally we set
8532   'base_misaligned' to false, as we have already peeled for alignment in the
8533   prologue of the main loop.  */
8534
8535static void
8536update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8537{
8538  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8539  auto_vec<gimple *> stmt_worklist;
8540  hash_map<tree,tree> mapping;
8541  gimple *orig_stmt, *new_stmt;
8542  gimple_stmt_iterator epilogue_gsi;
8543  gphi_iterator epilogue_phi_gsi;
8544  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8545  basic_block *epilogue_bbs = get_loop_body (epilogue);
8546  unsigned i;
8547
8548  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8549
8550  /* Advance the data_references with the number of iterations of the previous
8551     loop and its prologue.  */
8552  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8553
8554
8555  /* The EPILOGUE loop is a copy of the original loop so they share the same
8556     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
8557     point to the copied statements.  We also create a mapping of all LHSs in
8558     the original loop to all the LHSs in the EPILOGUE and create worklists to
8559     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
8560  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8561    {
8562      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8563	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8564	{
8565	  new_stmt = epilogue_phi_gsi.phi ();
8566
8567	  gcc_assert (gimple_uid (new_stmt) > 0);
8568	  stmt_vinfo
8569	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8570
8571	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8572	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8573
8574	  mapping.put (gimple_phi_result (orig_stmt),
8575		       gimple_phi_result (new_stmt));
8576	  /* PHI nodes cannot have patterns or related statements.  */
8577	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8578		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8579	}
8580
8581      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8582	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8583	{
8584	  new_stmt = gsi_stmt (epilogue_gsi);
8585
8586	  gcc_assert (gimple_uid (new_stmt) > 0);
8587	  stmt_vinfo
8588	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8589
8590	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8591	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8592
8593	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
8594	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8595
8596	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8597	    {
8598	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8599	      for (gimple_stmt_iterator gsi = gsi_start (seq);
8600		   !gsi_end_p (gsi); gsi_next (&gsi))
8601		stmt_worklist.safe_push (gsi_stmt (gsi));
8602	    }
8603
8604	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8605	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8606	    {
8607	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8608	      stmt_worklist.safe_push (stmt);
8609	      /* Set BB such that the assert in
8610		'get_initial_def_for_reduction' is able to determine that
8611		the BB of the related stmt is inside this loop.  */
8612	      gimple_set_bb (stmt,
8613			     gimple_bb (new_stmt));
8614	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8615	      gcc_assert (related_vinfo == NULL
8616			  || related_vinfo == stmt_vinfo);
8617	    }
8618	}
8619    }
8620
8621  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8622     using the original main loop and thus need to be updated to refer to the
8623     cloned variables used in the epilogue.  */
8624  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8625    {
8626      gimple *stmt = stmt_worklist[i];
8627      tree *new_op;
8628
8629      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8630	{
8631	  tree op = gimple_op (stmt, j);
8632	  if ((new_op = mapping.get(op)))
8633	    gimple_set_op (stmt, j, *new_op);
8634	  else
8635	    {
8636	      /* PR92429: The last argument of simplify_replace_tree disables
8637		 folding when replacing arguments.  This is required as
8638		 otherwise you might end up with different statements than the
8639		 ones analyzed in vect_loop_analyze, leading to different
8640		 vectorization.  */
8641	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8642					  &find_in_mapping, &mapping, false);
8643	      gimple_set_op (stmt, j, op);
8644	    }
8645	}
8646    }
8647
8648  struct data_reference *dr;
8649  vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8650  FOR_EACH_VEC_ELT (datarefs, i, dr)
8651    {
8652      orig_stmt = DR_STMT (dr);
8653      gcc_assert (gimple_uid (orig_stmt) > 0);
8654      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8655      /* Data references for gather loads and scatter stores do not use the
8656	 updated offset we set using ADVANCE.  Instead we have to make sure the
8657	 reference in each data reference points to the corresponding copy of
8658	 the original in the epilogue.  */
8659      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8660	  == VMAT_GATHER_SCATTER)
8661	{
8662	  DR_REF (dr)
8663	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8664				     &find_in_mapping, &mapping);
8665	  DR_BASE_ADDRESS (dr)
8666	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8667				     &find_in_mapping, &mapping);
8668	}
8669      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8670      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8671      /* The vector size of the epilogue is smaller than that of the main loop
8672	 so the alignment is either the same or lower.  This means the DR will
8673	 by definition be aligned.  */
8674      STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8675    }
8676
8677  epilogue_vinfo->shared->datarefs_copy.release ();
8678  epilogue_vinfo->shared->save_datarefs ();
8679}
8680
8681/* Function vect_transform_loop.
8682
8683   The analysis phase has determined that the loop is vectorizable.
8684   Vectorize the loop - create vectorized stmts to replace the scalar
8685   stmts in the loop, and update the loop exit condition.
8686   Returns the scalar epilogue loop, if any.  */
8687
8688class loop *
8689vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8690{
8691  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8692  class loop *epilogue = NULL;
8693  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8694  int nbbs = loop->num_nodes;
8695  int i;
8696  tree niters_vector = NULL_TREE;
8697  tree step_vector = NULL_TREE;
8698  tree niters_vector_mult_vf = NULL_TREE;
8699  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8700  unsigned int lowest_vf = constant_lower_bound (vf);
8701  gimple *stmt;
8702  bool check_profitability = false;
8703  unsigned int th;
8704
8705  DUMP_VECT_SCOPE ("vec_transform_loop");
8706
8707  loop_vinfo->shared->check_datarefs ();
8708
8709  /* Use the more conservative vectorization threshold.  If the number
8710     of iterations is constant, assume the cost check has been performed
8711     by our caller.  If the threshold makes all loops profitable that
8712     run at least the (estimated) vectorization factor number of times,
8713     checking is pointless, too.  */
8714  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8715  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8716    {
8717      if (dump_enabled_p ())
8718	dump_printf_loc (MSG_NOTE, vect_location,
8719			 "Profitability threshold is %d loop iterations.\n",
8720			 th);
8721      check_profitability = true;
8722    }
8723
8724  /* Make sure there exists a single-predecessor exit bb.  Do this before
8725     versioning.   */
8726  edge e = single_exit (loop);
8727  if (! single_pred_p (e->dest))
8728    {
8729      split_loop_exit_edge (e, true);
8730      if (dump_enabled_p ())
8731	dump_printf (MSG_NOTE, "split exit edge\n");
8732    }
8733
8734  /* Version the loop first, if required, so the profitability check
8735     comes first.  */
8736
8737  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8738    {
8739      class loop *sloop
8740	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8741      sloop->force_vectorize = false;
8742      check_profitability = false;
8743    }
8744
8745  /* Make sure there exists a single-predecessor exit bb also on the
8746     scalar loop copy.  Do this after versioning but before peeling
8747     so the CFG structure is fine for both the scalar and the if-converted
8748     loop and slpeel_duplicate_current_defs_from_edges sees matched
8749     loop-closed PHI nodes on the exit.  */
8750  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8751    {
8752      e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8753      if (! single_pred_p (e->dest))
8754	{
8755	  split_loop_exit_edge (e, true);
8756	  if (dump_enabled_p ())
8757	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8758	}
8759    }
8760
8761  tree niters = vect_build_loop_niters (loop_vinfo);
8762  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8763  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8764  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8765  tree advance;
8766  drs_init_vec orig_drs_init;
8767
8768  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8769			      &step_vector, &niters_vector_mult_vf, th,
8770			      check_profitability, niters_no_overflow,
8771			      &advance);
8772
8773  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8774      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8775    scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8776			    LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8777
8778  if (niters_vector == NULL_TREE)
8779    {
8780      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8781	  && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8782	  && known_eq (lowest_vf, vf))
8783	{
8784	  niters_vector
8785	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8786			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8787	  step_vector = build_one_cst (TREE_TYPE (niters));
8788	}
8789      else
8790	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8791				     &step_vector, niters_no_overflow);
8792    }
8793
8794  /* 1) Make sure the loop header has exactly two entries
8795     2) Make sure we have a preheader basic block.  */
8796
8797  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8798
8799  split_edge (loop_preheader_edge (loop));
8800
8801  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8802      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8803    /* This will deal with any possible peeling.  */
8804    vect_prepare_for_masked_peels (loop_vinfo);
8805
8806  /* Schedule the SLP instances first, then handle loop vectorization
8807     below.  */
8808  if (!loop_vinfo->slp_instances.is_empty ())
8809    {
8810      DUMP_VECT_SCOPE ("scheduling SLP instances");
8811      vect_schedule_slp (loop_vinfo);
8812    }
8813
8814  /* FORNOW: the vectorizer supports only loops whose body consists
8815     of one basic block (header + empty latch).  When the vectorizer
8816     supports more involved loop forms, the order in which the BBs are
8817     traversed will need to be reconsidered.  */
8818
8819  for (i = 0; i < nbbs; i++)
8820    {
8821      basic_block bb = bbs[i];
8822      stmt_vec_info stmt_info;
8823
8824      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8825	   gsi_next (&si))
8826	{
8827	  gphi *phi = si.phi ();
8828	  if (dump_enabled_p ())
8829	    dump_printf_loc (MSG_NOTE, vect_location,
8830			     "------>vectorizing phi: %G", phi);
8831	  stmt_info = loop_vinfo->lookup_stmt (phi);
8832	  if (!stmt_info)
8833	    continue;
8834
8835	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8836	    vect_loop_kill_debug_uses (loop, stmt_info);
8837
8838	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8839	      && !STMT_VINFO_LIVE_P (stmt_info))
8840	    continue;
8841
8842	  if (STMT_VINFO_VECTYPE (stmt_info)
8843	      && (maybe_ne
8844		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8845	      && dump_enabled_p ())
8846	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8847
8848	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8849	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8850	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8851	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8852	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8853	      && ! PURE_SLP_STMT (stmt_info))
8854	    {
8855	      if (dump_enabled_p ())
8856		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8857	      vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8858	    }
8859	}
8860
8861      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8862	   gsi_next (&si))
8863	{
8864	  gphi *phi = si.phi ();
8865	  stmt_info = loop_vinfo->lookup_stmt (phi);
8866	  if (!stmt_info)
8867	    continue;
8868
8869	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
8870	      && !STMT_VINFO_LIVE_P (stmt_info))
8871	    continue;
8872
8873	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8874	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8875	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8876	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8877	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8878	      && ! PURE_SLP_STMT (stmt_info))
8879	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
8880	}
8881
8882      for (gimple_stmt_iterator si = gsi_start_bb (bb);
8883	   !gsi_end_p (si);)
8884	{
8885	  stmt = gsi_stmt (si);
8886	  /* During vectorization remove existing clobber stmts.  */
8887	  if (gimple_clobber_p (stmt))
8888	    {
8889	      unlink_stmt_vdef (stmt);
8890	      gsi_remove (&si, true);
8891	      release_defs (stmt);
8892	    }
8893	  else
8894	    {
8895	      stmt_info = loop_vinfo->lookup_stmt (stmt);
8896
8897	      /* vector stmts created in the outer-loop during vectorization of
8898		 stmts in an inner-loop may not have a stmt_info, and do not
8899		 need to be vectorized.  */
8900	      stmt_vec_info seen_store = NULL;
8901	      if (stmt_info)
8902		{
8903		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8904		    {
8905		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8906		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8907			   !gsi_end_p (subsi); gsi_next (&subsi))
8908			{
8909			  stmt_vec_info pat_stmt_info
8910			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8911			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8912						    &si, &seen_store);
8913			}
8914		      stmt_vec_info pat_stmt_info
8915			= STMT_VINFO_RELATED_STMT (stmt_info);
8916		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8917						    &si, &seen_store))
8918			maybe_set_vectorized_backedge_value (loop_vinfo,
8919							     pat_stmt_info);
8920		    }
8921		  else
8922		    {
8923		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8924						    &seen_store))
8925			maybe_set_vectorized_backedge_value (loop_vinfo,
8926							     stmt_info);
8927		    }
8928		}
8929	      gsi_next (&si);
8930	      if (seen_store)
8931		{
8932		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8933		    /* Interleaving.  The vectorization of the grouped store
8934		       chain was completed - free all the stores in the
8935		       chain.  */
8936		    vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8937		  else
8938		    /* Free the attached stmt_vec_info and remove the stmt.  */
8939		    loop_vinfo->remove_stmt (stmt_info);
8940		}
8941	    }
8942	}
8943
8944      /* Stub out scalar statements that must not survive vectorization.
8945	 Doing this here helps with grouped statements, or statements that
8946	 are involved in patterns.  */
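      /* For example, an IFN_MASK_LOAD call with a scalar LHS is replaced by
	 an assignment of zero to that LHS, and a conditional internal
	 function call with a scalar LHS is replaced by an assignment of its
	 "else" argument.  */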
8947      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8948	   !gsi_end_p (gsi); gsi_next (&gsi))
8949	{
8950	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8951	  if (!call || !gimple_call_internal_p (call))
8952	    continue;
8953	  internal_fn ifn = gimple_call_internal_fn (call);
8954	  if (ifn == IFN_MASK_LOAD)
8955	    {
8956	      tree lhs = gimple_get_lhs (call);
8957	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8958		{
8959		  tree zero = build_zero_cst (TREE_TYPE (lhs));
8960		  gimple *new_stmt = gimple_build_assign (lhs, zero);
8961		  gsi_replace (&gsi, new_stmt, true);
8962		}
8963	    }
8964	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
8965	    {
8966	      tree lhs = gimple_get_lhs (call);
8967	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8968		{
8969		  tree else_arg
8970		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
8971		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
8972		  gsi_replace (&gsi, new_stmt, true);
8973		}
8974	    }
8975	}
8976    }				/* BBs in loop */
8977
8978  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8979     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8980  if (integer_onep (step_vector))
8981    niters_no_overflow = true;
8982  vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8983			   niters_vector_mult_vf, !niters_no_overflow);
8984
8985  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8986  scale_profile_for_vect_loop (loop, assumed_vf);
8987
8988  /* True if the final iteration might not handle a full vector's
8989     worth of scalar iterations.  */
8990  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8991  /* The minimum number of iterations performed by the epilogue.  This
8992     is 1 when peeling for gaps because we always need a final scalar
8993     iteration.  */
8994  int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8995  /* +1 to convert latch counts to loop iteration counts,
8996     -min_epilogue_iters to remove iterations that cannot be performed
8997     by the vector code.  */
8998  int bias_for_lowest = 1 - min_epilogue_iters;
8999  int bias_for_assumed = bias_for_lowest;
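  /* BIAS_FOR_LOWEST is used together with LOWEST_VF for the guaranteed
     upper bounds below, while BIAS_FOR_ASSUMED is used together with
     ASSUMED_VF for the iteration-count estimate.  */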
9000  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9001  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9002    {
9003      /* When the amount of peeling is known at compile time, the first
9004	 iteration will have exactly alignment_npeels active elements.
9005	 In the worst case it will have at least one.  */
9006      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9007      bias_for_lowest += lowest_vf - min_first_active;
9008      bias_for_assumed += assumed_vf - min_first_active;
9009    }
9010  /* In these calculations the "- 1" converts loop iteration counts
9011     back to latch counts.  */
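  /* For example, for a fully-masked loop with LOWEST_VF == 4, no peeling for
     gaps or alignment, and an upper bound of 9 latch iterations (10 scalar
     iterations), BIAS_FOR_LOWEST is 1 and the new upper bound is
     CEIL ((9 + 1) / 4) - 1 == 2, i.e. at most 3 vector iterations.  */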
9012  if (loop->any_upper_bound)
9013    loop->nb_iterations_upper_bound
9014      = (final_iter_may_be_partial
9015	 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9016			  lowest_vf) - 1
9017	 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9018			   lowest_vf) - 1);
9019  if (loop->any_likely_upper_bound)
9020    loop->nb_iterations_likely_upper_bound
9021      = (final_iter_may_be_partial
9022	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9023			  + bias_for_lowest, lowest_vf) - 1
9024	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9025			   + bias_for_lowest, lowest_vf) - 1);
9026  if (loop->any_estimate)
9027    loop->nb_iterations_estimate
9028      = (final_iter_may_be_partial
9029	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9030			  assumed_vf) - 1
9031	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9032			   assumed_vf) - 1);
9033
9034  if (dump_enabled_p ())
9035    {
9036      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9037	{
9038	  dump_printf_loc (MSG_NOTE, vect_location,
9039			   "LOOP VECTORIZED\n");
9040	  if (loop->inner)
9041	    dump_printf_loc (MSG_NOTE, vect_location,
9042			     "OUTER LOOP VECTORIZED\n");
9043	  dump_printf (MSG_NOTE, "\n");
9044	}
9045      else
9046	dump_printf_loc (MSG_NOTE, vect_location,
9047			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9048			 GET_MODE_NAME (loop_vinfo->vector_mode));
9049    }
9050
9051  /* Loops vectorized with a variable factor won't benefit from
9052     unrolling/peeling.  */
9053  if (!vf.is_constant ())
9054    {
9055      loop->unroll = 1;
9056      if (dump_enabled_p ())
9057	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9058			 " variable-length vectorization factor\n");
9059    }
9060  /* Free SLP instances here because otherwise stmt reference counting
9061     won't work.  */
9062  slp_instance instance;
9063  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9064    vect_free_slp_instance (instance, true);
9065  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9066  /* Clear the safelen field since its value is invalid after vectorization:
9067     the vectorized loop can have loop-carried dependences.  */
9068  loop->safelen = 0;
9069
9070  if (epilogue)
9071    {
9072      update_epilogue_loop_vinfo (epilogue, advance);
9073
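      /* Carry the relevant loop flags over to the epilogue so that the
	 caller can try to vectorize it as well, possibly with a different
	 vector mode.  */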
9074      epilogue->simduid = loop->simduid;
9075      epilogue->force_vectorize = loop->force_vectorize;
9076      epilogue->dont_vectorize = false;
9077    }
9078
9079  return epilogue;
9080}
9081
9082/* The code below performs a simple optimization: it reverts if-conversion for
9083   masked stores, i.e. if the mask of a store is all-zero the store is not
9084   performed, and neither are the stored-value producers where possible.
9085   For example,
9086     for (i=0; i<n; i++)
9087       if (c[i])
9088	{
9089	  p1[i] += 1;
9090	  p2[i] = p3[i] + 2;
9091	}
9092   this transformation will produce the following semi-hammock:
9093
9094   if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9095     {
9096       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9097       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9098       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9099       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9100       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9101       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9102     }
9103*/
9104
9105void
9106optimize_mask_stores (class loop *loop)
9107{
9108  basic_block *bbs = get_loop_body (loop);
9109  unsigned nbbs = loop->num_nodes;
9110  unsigned i;
9111  basic_block bb;
9112  class loop *bb_loop;
9113  gimple_stmt_iterator gsi;
9114  gimple *stmt;
9115  auto_vec<gimple *> worklist;
9116  auto_purge_vect_location sentinel;
9117
9118  vect_location = find_loop_location (loop);
9119  /* Pick up all masked stores in the loop, if any.  */
9120  for (i = 0; i < nbbs; i++)
9121    {
9122      bb = bbs[i];
9123      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9124	   gsi_next (&gsi))
9125	{
9126	  stmt = gsi_stmt (gsi);
9127	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9128	    worklist.safe_push (stmt);
9129	}
9130    }
9131
9132  free (bbs);
9133  if (worklist.is_empty ())
9134    return;
9135
9136  /* Loop has masked stores.  */
9137  while (!worklist.is_empty ())
9138    {
9139      gimple *last, *last_store;
9140      edge e, efalse;
9141      tree mask;
9142      basic_block store_bb, join_bb;
9143      gimple_stmt_iterator gsi_to;
9144      tree vdef, new_vdef;
9145      gphi *phi;
9146      tree vectype;
9147      tree zero;
9148
9149      last = worklist.pop ();
9150      mask = gimple_call_arg (last, 2);
9151      bb = gimple_bb (last);
9152      /* Create STORE_BB and an if-then structure in the CFG; STORE_BB
9153	 belongs to the same loop as BB.  That loop can be different from LOOP
9154	 when a two-level loop nest is vectorized and the masked store belongs
9155	 to the inner loop.  */
9156      e = split_block (bb, last);
9157      bb_loop = bb->loop_father;
9158      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9159      join_bb = e->dest;
9160      store_bb = create_empty_bb (bb);
9161      add_bb_to_loop (store_bb, bb_loop);
9162      e->flags = EDGE_TRUE_VALUE;
9163      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9164      /* Put STORE_BB on the likely path.  */
9165      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
9166      store_bb->count = efalse->count ();
9167      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
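      /* The resulting CFG is

	   bb  (ends with  if (mask == { 0, ... }))
	   |  \
	   |   store_bb  (the masked stores are sunk here)
	   |  /
	   join_bb  */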
9168      if (dom_info_available_p (CDI_DOMINATORS))
9169	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9170      if (dump_enabled_p ())
9171	dump_printf_loc (MSG_NOTE, vect_location,
9172			 "Create new block %d to sink mask stores.",
9173			 store_bb->index);
9174      /* Create vector comparison with boolean result.  */
9175      vectype = TREE_TYPE (mask);
9176      zero = build_zero_cst (vectype);
9177      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9178      gsi = gsi_last_bb (bb);
9179      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9180      /* Create new PHI node for vdef of the last masked store:
9181	 .MEM_2 = VDEF <.MEM_1>
9182	 will be converted to
9183	 .MEM_3 = VDEF <.MEM_1>
9184	 and new PHI node will be created in join bb
9185	 .MEM_2 = PHI <.MEM_1, .MEM_3>
9186      */
9187      vdef = gimple_vdef (last);
9188      new_vdef = make_ssa_name (gimple_vop (cfun), last);
9189      gimple_set_vdef (last, new_vdef);
9190      phi = create_phi_node (vdef, join_bb);
9191      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9192
9193      /* Put all masked stores with the same mask into STORE_BB if possible.  */
9194      while (true)
9195	{
9196	  gimple_stmt_iterator gsi_from;
9197	  gimple *stmt1 = NULL;
9198
9199	  /* Move masked store to STORE_BB.  */
9200	  last_store = last;
9201	  gsi = gsi_for_stmt (last);
9202	  gsi_from = gsi;
9203	  /* Shift GSI to the previous stmt for further traversal.  */
9204	  gsi_prev (&gsi);
9205	  gsi_to = gsi_start_bb (store_bb);
9206	  gsi_move_before (&gsi_from, &gsi_to);
9207	  /* Reset GSI_TO to the start of STORE_BB, which is now non-empty.  */
9208	  gsi_to = gsi_start_bb (store_bb);
9209	  if (dump_enabled_p ())
9210	    dump_printf_loc (MSG_NOTE, vect_location,
9211			     "Move stmt to created bb\n%G", last);
9212	  /* Move all stored value producers if possible.  */
9213	  while (!gsi_end_p (gsi))
9214	    {
9215	      tree lhs;
9216	      imm_use_iterator imm_iter;
9217	      use_operand_p use_p;
9218	      bool res;
9219
9220	      /* Skip debug statements.  */
9221	      if (is_gimple_debug (gsi_stmt (gsi)))
9222		{
9223		  gsi_prev (&gsi);
9224		  continue;
9225		}
9226	      stmt1 = gsi_stmt (gsi);
9227	      /* Do not consider statements writing to memory or having
9228		 volatile operands.  */
9229	      if (gimple_vdef (stmt1)
9230		  || gimple_has_volatile_ops (stmt1))
9231		break;
9232	      gsi_from = gsi;
9233	      gsi_prev (&gsi);
9234	      lhs = gimple_get_lhs (stmt1);
9235	      if (!lhs)
9236		break;
9237
9238	      /* The LHS of a vectorized stmt must be an SSA_NAME.  */
9239	      if (TREE_CODE (lhs) != SSA_NAME)
9240		break;
9241
9242	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9243		{
9244		  /* Remove dead scalar statement.  */
9245		  if (has_zero_uses (lhs))
9246		    {
9247		      gsi_remove (&gsi_from, true);
9248		      continue;
9249		    }
9250		}
9251
9252	      /* Check that LHS does not have uses outside of STORE_BB.  */
9253	      res = true;
9254	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9255		{
9256		  gimple *use_stmt;
9257		  use_stmt = USE_STMT (use_p);
9258		  if (is_gimple_debug (use_stmt))
9259		    continue;
9260		  if (gimple_bb (use_stmt) != store_bb)
9261		    {
9262		      res = false;
9263		      break;
9264		    }
9265		}
9266	      if (!res)
9267		break;
9268
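	      /* If STMT1 reads memory, it can only be moved when it sees the
		 same memory state as LAST_STORE, i.e. when no other memory
		 definition intervenes between them.  */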
9269	      if (gimple_vuse (stmt1)
9270		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
9271		break;
9272
9273	      /* Can move STMT1 to STORE_BB.  */
9274	      if (dump_enabled_p ())
9275		dump_printf_loc (MSG_NOTE, vect_location,
9276				 "Move stmt to created bb\n%G", stmt1);
9277	      gsi_move_before (&gsi_from, &gsi_to);
9278	      /* Shift GSI_TO for further insertion.  */
9279	      gsi_prev (&gsi_to);
9280	    }
9281	  /* Put other masked stores with the same mask into STORE_BB.  */
9282	  if (worklist.is_empty ()
9283	      || gimple_call_arg (worklist.last (), 2) != mask
9284	      || worklist.last () != stmt1)
9285	    break;
9286	  last = worklist.pop ();
9287	}
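      /* On the path that bypasses STORE_BB the memory state is the one seen
	 before the first of the sunk stores, i.e. the VUSE of the earliest
	 store moved (LAST_STORE).  */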
9288      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9289    }
9290}
9291
9292/* Decide whether it is possible to use a zero-based induction variable
9293   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
9294   return the value that the induction variable must be able to hold
9295   in order to ensure that the loop ends with an all-false mask.
9296   Return -1 otherwise.  */
9297widest_int
9298vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9299{
9300  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9301  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9302  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9303
9304  /* Calculate the value that the induction variable must be able
9305     to hit in order to ensure that we end the loop with an all-false mask.
9306     This involves adding the maximum number of inactive trailing scalar
9307     iterations.  */
9308  widest_int iv_limit = -1;
9309  if (max_loop_iterations (loop, &iv_limit))
9310    {
9311      if (niters_skip)
9312	{
9313	  /* Add the maximum number of skipped iterations to the
9314	     maximum iteration count.  */
9315	  if (TREE_CODE (niters_skip) == INTEGER_CST)
9316	    iv_limit += wi::to_widest (niters_skip);
9317	  else
9318	    iv_limit += max_vf - 1;
9319	}
9320      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9321	/* Make a conservatively-correct assumption.  */
9322	iv_limit += max_vf - 1;
9323
9324      /* IV_LIMIT is the maximum number of latch iterations, which is also
9325	 the maximum in-range IV value.  Round this value down to the previous
9326	 vector alignment boundary and then add an extra full iteration.  */
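      /* For example, with a constant VF of 4 (so MAX_VF == 4), at most 9
	 latch iterations and NITERS_SKIP == 2, IV_LIMIT becomes 9 + 2 == 11,
	 which is rounded down to 8 and then increased to 12.  */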
9327      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9328      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9329    }
9330  return iv_limit;
9331}
9332
9333