1/* Data References Analysis and Manipulation Utilities for Vectorization.
2   Copyright (C) 2003-2020 Free Software Foundation, Inc.
3   Contributed by Dorit Naishlos <dorit@il.ibm.com>
4   and Ira Rosen <irar@il.ibm.com>
5
6This file is part of GCC.
7
8GCC is free software; you can redistribute it and/or modify it under
9the terms of the GNU General Public License as published by the Free
10Software Foundation; either version 3, or (at your option) any later
11version.
12
13GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14WARRANTY; without even the implied warranty of MERCHANTABILITY or
15FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16for more details.
17
18You should have received a copy of the GNU General Public License
19along with GCC; see the file COPYING3.  If not see
20<http://www.gnu.org/licenses/>.  */
21
22#include "config.h"
23#include "system.h"
24#include "coretypes.h"
25#include "backend.h"
26#include "target.h"
27#include "rtl.h"
28#include "tree.h"
29#include "gimple.h"
30#include "predict.h"
31#include "memmodel.h"
32#include "tm_p.h"
33#include "ssa.h"
34#include "optabs-tree.h"
35#include "cgraph.h"
36#include "dumpfile.h"
37#include "alias.h"
38#include "fold-const.h"
39#include "stor-layout.h"
40#include "tree-eh.h"
41#include "gimplify.h"
42#include "gimple-iterator.h"
43#include "gimplify-me.h"
44#include "tree-ssa-loop-ivopts.h"
45#include "tree-ssa-loop-manip.h"
46#include "tree-ssa-loop.h"
47#include "cfgloop.h"
48#include "tree-scalar-evolution.h"
49#include "tree-vectorizer.h"
50#include "expr.h"
51#include "builtins.h"
52#include "tree-cfg.h"
53#include "tree-hash-traits.h"
54#include "vec-perm-indices.h"
55#include "internal-fn.h"
56
57/* Return true if load- or store-lanes optab OPTAB is implemented for
58   COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
59
60static bool
61vect_lanes_optab_supported_p (const char *name, convert_optab optab,
62			      tree vectype, unsigned HOST_WIDE_INT count)
63{
64  machine_mode mode, array_mode;
65  bool limit_p;
66
67  mode = TYPE_MODE (vectype);
68  if (!targetm.array_mode (mode, count).exists (&array_mode))
69    {
70      poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
71      limit_p = !targetm.array_mode_supported_p (mode, count);
72      if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
73	{
74	  if (dump_enabled_p ())
75	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
76			     "no array mode for %s[%wu]\n",
77			     GET_MODE_NAME (mode), count);
78	  return false;
79	}
80    }
81
82  if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83    {
84      if (dump_enabled_p ())
85	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86                         "cannot use %s<%s><%s>\n", name,
87                         GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88      return false;
89    }
90
91  if (dump_enabled_p ())
92    dump_printf_loc (MSG_NOTE, vect_location,
93                     "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94                     GET_MODE_NAME (mode));
95
96  return true;
97}
98
99
100/* Return the smallest scalar part of STMT_INFO.
101   This is used to determine the vectype of the stmt.  We generally set the
102   vectype according to the type of the result (lhs).  For stmts whose
103   result-type is different than the type of the arguments (e.g., demotion,
104   promotion), vectype will be reset appropriately (later).  Note that we have
105   to visit the smallest datatype in this function, because that determines the
106   VF.  If the smallest datatype in the loop is present only as the rhs of a
107   promotion operation - we'd miss it.
108   Such a case, where a variable of this datatype does not appear in the lhs
109   anywhere in the loop, can only occur if it's an invariant: e.g.:
110   'int_x = (int) short_inv', which we'd expect to have been optimized away by
111   invariant motion.  However, we cannot rely on invariant motion to always
112   take invariants out of the loop, and so in the case of promotion we also
113   have to check the rhs.
114   LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115   types.  */
116
117tree
118vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
119			       HOST_WIDE_INT *lhs_size_unit,
120			       HOST_WIDE_INT *rhs_size_unit)
121{
122  tree scalar_type = gimple_expr_type (stmt_info->stmt);
123  HOST_WIDE_INT lhs, rhs;
124
125  /* During the analysis phase, this function is called on arbitrary
126     statements that might not have scalar results.  */
127  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
128    return scalar_type;
129
130  lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
131
132  gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
133  if (assign
134      && (gimple_assign_cast_p (assign)
135	  || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
136	  || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
137	  || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
138	  || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
139	  || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
140    {
141      tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
142
143      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
144      if (rhs < lhs)
145        scalar_type = rhs_type;
146    }
147  else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
148    {
149      unsigned int i = 0;
150      if (gimple_call_internal_p (call))
151	{
152	  internal_fn ifn = gimple_call_internal_fn (call);
153	  if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
154	    /* gimple_expr_type already picked the type of the loaded
155	       or stored data.  */
156	    i = ~0U;
157	  else if (internal_fn_mask_index (ifn) == 0)
158	    i = 1;
159	}
160      if (i < gimple_call_num_args (call))
161	{
162	  tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
163	  if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
164	    {
165	      rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
166	      if (rhs < lhs)
167		scalar_type = rhs_type;
168	    }
169	}
170    }
171
172  *lhs_size_unit = lhs;
173  *rhs_size_unit = rhs;
174  return scalar_type;
175}
176
177
178/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
179   tested at run-time.  Return TRUE if DDR was successfully inserted.
180   Return false if versioning is not supported.  */
181
182static opt_result
183vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
184{
185  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
186
187  if ((unsigned) param_vect_max_version_for_alias_checks == 0)
188    return opt_result::failure_at (vect_location,
189				   "will not create alias checks, as"
190				   " --param vect-max-version-for-alias-checks"
191				   " == 0\n");
192
193  opt_result res
194    = runtime_alias_check_p (ddr, loop,
195			     optimize_loop_nest_for_speed_p (loop));
196  if (!res)
197    return res;
198
199  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
200  return opt_result::success ();
201}
202
203/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
204
205static void
206vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
207{
208  vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
209  for (unsigned int i = 0; i < checks.length(); ++i)
210    if (checks[i] == value)
211      return;
212
213  if (dump_enabled_p ())
214    dump_printf_loc (MSG_NOTE, vect_location,
215		     "need run-time check that %T is nonzero\n",
216		     value);
217  LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
218}
219
220/* Return true if we know that the order of vectorized DR_INFO_A and
221   vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
222   DR_INFO_B.  At least one of the accesses is a write.  */
223
224static bool
225vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
226{
227  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
228  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
229
230  /* Single statements are always kept in their original order.  */
231  if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
232      && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
233    return true;
234
235  /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
236     SLP group are emitted at the position of the last scalar load and
237     all loads in an interleaving group are emitted at the position
238     of the first scalar load.
239     Stores in a group are emitted at the position of the last scalar store.
240     Compute that position and check whether the resulting order matches
241     the current one.
242     We have not yet decided between SLP and interleaving so we have
243     to conservatively assume both.  */
244  stmt_vec_info il_a;
245  stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
246  if (last_a)
247    {
248      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
249	   s = DR_GROUP_NEXT_ELEMENT (s))
250	last_a = get_later_stmt (last_a, s);
251      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
252	{
253	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
254	       s = DR_GROUP_NEXT_ELEMENT (s))
255	    if (get_later_stmt (il_a, s) == il_a)
256	      il_a = s;
257	}
258      else
259	il_a = last_a;
260    }
261  else
262    last_a = il_a = stmtinfo_a;
263  stmt_vec_info il_b;
264  stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
265  if (last_b)
266    {
267      for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
268	   s = DR_GROUP_NEXT_ELEMENT (s))
269	last_b = get_later_stmt (last_b, s);
270      if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
271	{
272	  for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
273	       s = DR_GROUP_NEXT_ELEMENT (s))
274	    if (get_later_stmt (il_b, s) == il_b)
275	      il_b = s;
276	}
277      else
278	il_b = last_b;
279    }
280  else
281    last_b = il_b = stmtinfo_b;
282  bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
283  return (/* SLP */
284	  (get_later_stmt (last_a, last_b) == last_a) == a_after_b
285	  /* Interleaving */
286	  && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
287	  /* Mixed */
288	  && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
289	  && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
290}
291
292/* A subroutine of vect_analyze_data_ref_dependence.  Handle
293   DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
294   distances.  These distances are conservatively correct but they don't
295   reflect a guaranteed dependence.
296
297   Return true if this function does all the work necessary to avoid
298   an alias or false if the caller should use the dependence distances
299   to limit the vectorization factor in the usual way.  LOOP_DEPTH is
300   the depth of the loop described by LOOP_VINFO and the other arguments
301   are as for vect_analyze_data_ref_dependence.  */
302
303static bool
304vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
305				       loop_vec_info loop_vinfo,
306				       int loop_depth, unsigned int *max_vf)
307{
308  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
309  lambda_vector dist_v;
310  unsigned int i;
311  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
312    {
313      int dist = dist_v[loop_depth];
314      if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
315	{
316	  /* If the user asserted safelen >= DIST consecutive iterations
317	     can be executed concurrently, assume independence.
318
319	     ??? An alternative would be to add the alias check even
320	     in this case, and vectorize the fallback loop with the
321	     maximum VF set to safelen.  However, if the user has
322	     explicitly given a length, it's less likely that that
323	     would be a win.  */
324	  if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
325	    {
326	      if ((unsigned int) loop->safelen < *max_vf)
327		*max_vf = loop->safelen;
328	      LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
329	      continue;
330	    }
331
332	  /* For dependence distances of 2 or more, we have the option
333	     of limiting VF or checking for an alias at runtime.
334	     Prefer to check at runtime if we can, to avoid limiting
335	     the VF unnecessarily when the bases are in fact independent.
336
337	     Note that the alias checks will be removed if the VF ends up
338	     being small enough.  */
339	  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
340	  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
341	  return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
342		  && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
343		  && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
344	}
345    }
346  return true;
347}
348
349
350/* Function vect_analyze_data_ref_dependence.
351
352   FIXME: I needed to change the sense of the returned flag.
353
354   Return FALSE if there (might) exist a dependence between a memory-reference
355   DRA and a memory-reference DRB.  When versioning for alias may check a
356   dependence at run-time, return TRUE.  Adjust *MAX_VF according to
357   the data dependence.  */
358
359static opt_result
360vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
361				  loop_vec_info loop_vinfo,
362				  unsigned int *max_vf)
363{
364  unsigned int i;
365  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
366  struct data_reference *dra = DDR_A (ddr);
367  struct data_reference *drb = DDR_B (ddr);
368  dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
369  dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
370  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
371  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
372  lambda_vector dist_v;
373  unsigned int loop_depth;
374
375  /* In loop analysis all data references should be vectorizable.  */
376  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
377      || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
378    gcc_unreachable ();
379
380  /* Independent data accesses.  */
381  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
382    return opt_result::success ();
383
384  if (dra == drb
385      || (DR_IS_READ (dra) && DR_IS_READ (drb)))
386    return opt_result::success ();
387
388  /* We do not have to consider dependences between accesses that belong
389     to the same group, unless the stride could be smaller than the
390     group size.  */
391  if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
392      && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
393	  == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
394      && !STMT_VINFO_STRIDED_P (stmtinfo_a))
395    return opt_result::success ();
396
397  /* Even if we have an anti-dependence then, as the vectorized loop covers at
398     least two scalar iterations, there is always also a true dependence.
399     As the vectorizer does not re-order loads and stores we can ignore
400     the anti-dependence if TBAA can disambiguate both DRs similar to the
401     case with known negative distance anti-dependences (positive
402     distance anti-dependences would violate TBAA constraints).  */
403  if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
404       || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
405      && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
406				 get_alias_set (DR_REF (drb))))
407    return opt_result::success ();
408
409  /* Unknown data dependence.  */
410  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
411    {
412      /* If user asserted safelen consecutive iterations can be
413	 executed concurrently, assume independence.  */
414      if (loop->safelen >= 2)
415	{
416	  if ((unsigned int) loop->safelen < *max_vf)
417	    *max_vf = loop->safelen;
418	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
419	  return opt_result::success ();
420	}
421
422      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
423	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
424	return opt_result::failure_at
425	  (stmtinfo_a->stmt,
426	   "versioning for alias not supported for: "
427	   "can't determine dependence between %T and %T\n",
428	   DR_REF (dra), DR_REF (drb));
429
430      if (dump_enabled_p ())
431	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
432			 "versioning for alias required: "
433			 "can't determine dependence between %T and %T\n",
434			 DR_REF (dra), DR_REF (drb));
435
436      /* Add to list of ddrs that need to be tested at run-time.  */
437      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
438    }
439
440  /* Known data dependence.  */
441  if (DDR_NUM_DIST_VECTS (ddr) == 0)
442    {
443      /* If user asserted safelen consecutive iterations can be
444	 executed concurrently, assume independence.  */
445      if (loop->safelen >= 2)
446	{
447	  if ((unsigned int) loop->safelen < *max_vf)
448	    *max_vf = loop->safelen;
449	  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
450	  return opt_result::success ();
451	}
452
453      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
454	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
455	return opt_result::failure_at
456	  (stmtinfo_a->stmt,
457	   "versioning for alias not supported for: "
458	   "bad dist vector for %T and %T\n",
459	   DR_REF (dra), DR_REF (drb));
460
461      if (dump_enabled_p ())
462	dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
463			 "versioning for alias required: "
464			 "bad dist vector for %T and %T\n",
465			 DR_REF (dra), DR_REF (drb));
466      /* Add to list of ddrs that need to be tested at run-time.  */
467      return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
468    }
469
470  loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
471
472  if (DDR_COULD_BE_INDEPENDENT_P (ddr)
473      && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
474						loop_depth, max_vf))
475    return opt_result::success ();
476
477  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
478    {
479      int dist = dist_v[loop_depth];
480
481      if (dump_enabled_p ())
482	dump_printf_loc (MSG_NOTE, vect_location,
483                         "dependence distance  = %d.\n", dist);
484
485      if (dist == 0)
486	{
487	  if (dump_enabled_p ())
488	    dump_printf_loc (MSG_NOTE, vect_location,
489			     "dependence distance == 0 between %T and %T\n",
490			     DR_REF (dra), DR_REF (drb));
491
492	  /* When we perform grouped accesses and perform implicit CSE
493	     by detecting equal accesses and doing disambiguation with
494	     runtime alias tests like for
495	        .. = a[i];
496		.. = a[i+1];
497		a[i] = ..;
498		a[i+1] = ..;
499		*p = ..;
500		.. = a[i];
501		.. = a[i+1];
502	     where we will end up loading { a[i], a[i+1] } once, make
503	     sure that inserting group loads before the first load and
504	     stores after the last store will do the right thing.
505	     Similar for groups like
506	        a[i] = ...;
507		... = a[i];
508		a[i+1] = ...;
509	     where loads from the group interleave with the store.  */
510	  if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
511	    return opt_result::failure_at (stmtinfo_a->stmt,
512					   "READ_WRITE dependence"
513					   " in interleaving.\n");
514
515	  if (loop->safelen < 2)
516	    {
517	      tree indicator = dr_zero_step_indicator (dra);
518	      if (!indicator || integer_zerop (indicator))
519		return opt_result::failure_at (stmtinfo_a->stmt,
520					       "access also has a zero step\n");
521	      else if (TREE_CODE (indicator) != INTEGER_CST)
522		vect_check_nonzero_value (loop_vinfo, indicator);
523	    }
524	  continue;
525	}
526
527      if (dist > 0 && DDR_REVERSED_P (ddr))
528	{
529	  /* If DDR_REVERSED_P the order of the data-refs in DDR was
530	     reversed (to make distance vector positive), and the actual
531	     distance is negative.  */
532	  if (dump_enabled_p ())
533	    dump_printf_loc (MSG_NOTE, vect_location,
534	                     "dependence distance negative.\n");
535	  /* When doing outer loop vectorization, we need to check if there is
536	     a backward dependence at the inner loop level if the dependence
537	     at the outer loop is reversed.  See PR81740.  */
538	  if (nested_in_vect_loop_p (loop, stmtinfo_a)
539	      || nested_in_vect_loop_p (loop, stmtinfo_b))
540	    {
541	      unsigned inner_depth = index_in_loop_nest (loop->inner->num,
542							 DDR_LOOP_NEST (ddr));
543	      if (dist_v[inner_depth] < 0)
544		return opt_result::failure_at (stmtinfo_a->stmt,
545					       "not vectorized, dependence "
546					       "between data-refs %T and %T\n",
547					       DR_REF (dra), DR_REF (drb));
548	    }
549	  /* Record a negative dependence distance to later limit the
550	     amount of stmt copying / unrolling we can perform.
551	     Only need to handle read-after-write dependence.  */
552	  if (DR_IS_READ (drb)
553	      && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
554		  || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
555	    STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
556	  continue;
557	}
558
559      unsigned int abs_dist = abs (dist);
560      if (abs_dist >= 2 && abs_dist < *max_vf)
561	{
562	  /* The dependence distance requires reduction of the maximal
563	     vectorization factor.  */
564	  *max_vf = abs_dist;
565	  if (dump_enabled_p ())
566	    dump_printf_loc (MSG_NOTE, vect_location,
567	                     "adjusting maximal vectorization factor to %i\n",
568	                     *max_vf);
569	}
570
571      if (abs_dist >= *max_vf)
572	{
573	  /* Dependence distance does not create dependence, as far as
574	     vectorization is concerned, in this case.  */
575	  if (dump_enabled_p ())
576	    dump_printf_loc (MSG_NOTE, vect_location,
577	                     "dependence distance >= VF.\n");
578	  continue;
579	}
580
581      return opt_result::failure_at (stmtinfo_a->stmt,
582				     "not vectorized, possible dependence "
583				     "between data-refs %T and %T\n",
584				     DR_REF (dra), DR_REF (drb));
585    }
586
587  return opt_result::success ();
588}
589
590/* Function vect_analyze_data_ref_dependences.
591
592   Examine all the data references in the loop, and make sure there do not
593   exist any data dependences between them.  Set *MAX_VF according to
594   the maximum vectorization factor the data dependences allow.  */
595
596opt_result
597vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
598				   unsigned int *max_vf)
599{
600  unsigned int i;
601  struct data_dependence_relation *ddr;
602
603  DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
604
605  if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
606    {
607      LOOP_VINFO_DDRS (loop_vinfo)
608	.create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
609		 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
610      /* We need read-read dependences to compute
611	 STMT_VINFO_SAME_ALIGN_REFS.  */
612      bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
613					  &LOOP_VINFO_DDRS (loop_vinfo),
614					  LOOP_VINFO_LOOP_NEST (loop_vinfo),
615					  true);
616      gcc_assert (res);
617    }
618
619  LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
620
621  /* For epilogues we either have no aliases or alias versioning
622     was applied to original loop.  Therefore we may just get max_vf
623     using VF of original loop.  */
624  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
625    *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
626  else
627    FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
628      {
629	opt_result res
630	  = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
631	if (!res)
632	  return res;
633      }
634
635  return opt_result::success ();
636}
637
638
639/* Function vect_slp_analyze_data_ref_dependence.
640
641   Return TRUE if there (might) exist a dependence between a memory-reference
642   DRA and a memory-reference DRB for VINFO.  When versioning for alias
643   may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
644   according to the data dependence.  */
645
646static bool
647vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
648				      struct data_dependence_relation *ddr)
649{
650  struct data_reference *dra = DDR_A (ddr);
651  struct data_reference *drb = DDR_B (ddr);
652  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
653  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
654
655  /* We need to check dependences of statements marked as unvectorizable
656     as well, they still can prohibit vectorization.  */
657
658  /* Independent data accesses.  */
659  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
660    return false;
661
662  if (dra == drb)
663    return false;
664
665  /* Read-read is OK.  */
666  if (DR_IS_READ (dra) && DR_IS_READ (drb))
667    return false;
668
669  /* If dra and drb are part of the same interleaving chain consider
670     them independent.  */
671  if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
672      && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
673	  == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
674    return false;
675
676  /* Unknown data dependence.  */
677  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
678    {
679      if  (dump_enabled_p ())
680	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
681			 "can't determine dependence between %T and %T\n",
682			 DR_REF (dra), DR_REF (drb));
683    }
684  else if (dump_enabled_p ())
685    dump_printf_loc (MSG_NOTE, vect_location,
686		     "determined dependence between %T and %T\n",
687		     DR_REF (dra), DR_REF (drb));
688
689  return true;
690}
691
692
693/* Analyze dependences involved in the transform of SLP NODE.  STORES
694   contain the vector of scalar stores of this instance if we are
695   disambiguating the loads.  */
696
697static bool
698vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
699				   vec<stmt_vec_info> stores,
700				   stmt_vec_info last_store_info)
701{
702  /* This walks over all stmts involved in the SLP load/store done
703     in NODE verifying we can sink them up to the last stmt in the
704     group.  */
705  stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
706  vec_info *vinfo = last_access_info->vinfo;
707  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
708    {
709      stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
710      if (access_info == last_access_info)
711	continue;
712      data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
713      ao_ref ref;
714      bool ref_initialized_p = false;
715      for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
716	   gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
717	{
718	  gimple *stmt = gsi_stmt (gsi);
719	  if (! gimple_vuse (stmt)
720	      || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
721	    continue;
722
723	  /* If we couldn't record a (single) data reference for this
724	     stmt we have to resort to the alias oracle.  */
725	  stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
726	  data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
727	  if (!dr_b)
728	    {
729	      /* We are moving a store or sinking a load - this means
730	         we cannot use TBAA for disambiguation.  */
731	      if (!ref_initialized_p)
732		ao_ref_init (&ref, DR_REF (dr_a));
733	      if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
734		  || ref_maybe_used_by_stmt_p (stmt, &ref, false))
735		return false;
736	      continue;
737	    }
738
739	  bool dependent = false;
740	  /* If we run into a store of this same instance (we've just
741	     marked those) then delay dependence checking until we run
742	     into the last store because this is where it will have
743	     been sunk to (and we verify if we can do that as well).  */
744	  if (gimple_visited_p (stmt))
745	    {
746	      if (stmt_info != last_store_info)
747		continue;
748	      unsigned i;
749	      stmt_vec_info store_info;
750	      FOR_EACH_VEC_ELT (stores, i, store_info)
751		{
752		  data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
753		  ddr_p ddr = initialize_data_dependence_relation
754				(dr_a, store_dr, vNULL);
755		  dependent
756		    = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
757		  free_dependence_relation (ddr);
758		  if (dependent)
759		    break;
760		}
761	    }
762	  else
763	    {
764	      ddr_p ddr = initialize_data_dependence_relation (dr_a,
765							       dr_b, vNULL);
766	      dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
767	      free_dependence_relation (ddr);
768	    }
769	  if (dependent)
770	    return false;
771	}
772    }
773  return true;
774}
775
776
777/* Function vect_analyze_data_ref_dependences.
778
779   Examine all the data references in the basic-block, and make sure there
780   do not exist any data dependences between them.  Set *MAX_VF according to
781   the maximum vectorization factor the data dependences allow.  */
782
783bool
784vect_slp_analyze_instance_dependence (slp_instance instance)
785{
786  DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
787
788  /* The stores of this instance are at the root of the SLP tree.  */
789  slp_tree store = SLP_INSTANCE_TREE (instance);
790  if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
791    store = NULL;
792
793  /* Verify we can sink stores to the vectorized stmt insert location.  */
794  stmt_vec_info last_store_info = NULL;
795  if (store)
796    {
797      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
798	return false;
799
800      /* Mark stores in this instance and remember the last one.  */
801      last_store_info = vect_find_last_scalar_stmt_in_slp (store);
802      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
803	gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
804    }
805
806  bool res = true;
807
808  /* Verify we can sink loads to the vectorized stmt insert location,
809     special-casing stores of this instance.  */
810  slp_tree load;
811  unsigned int i;
812  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
813    if (! vect_slp_analyze_node_dependences (instance, load,
814					     store
815					     ? SLP_TREE_SCALAR_STMTS (store)
816					     : vNULL, last_store_info))
817      {
818	res = false;
819	break;
820      }
821
822  /* Unset the visited flag.  */
823  if (store)
824    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
825      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
826
827  return res;
828}
829
830/* Record the base alignment guarantee given by DRB, which occurs
831   in STMT_INFO.  */
832
833static void
834vect_record_base_alignment (stmt_vec_info stmt_info,
835			    innermost_loop_behavior *drb)
836{
837  vec_info *vinfo = stmt_info->vinfo;
838  bool existed;
839  innermost_loop_behavior *&entry
840    = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
841  if (!existed || entry->base_alignment < drb->base_alignment)
842    {
843      entry = drb;
844      if (dump_enabled_p ())
845	dump_printf_loc (MSG_NOTE, vect_location,
846			 "recording new base alignment for %T\n"
847			 "  alignment:    %d\n"
848			 "  misalignment: %d\n"
849			 "  based on:     %G",
850			 drb->base_address,
851			 drb->base_alignment,
852			 drb->base_misalignment,
853			 stmt_info->stmt);
854    }
855}
856
857/* If the region we're going to vectorize is reached, all unconditional
858   data references occur at least once.  We can therefore pool the base
859   alignment guarantees from each unconditional reference.  Do this by
860   going through all the data references in VINFO and checking whether
861   the containing statement makes the reference unconditionally.  If so,
862   record the alignment of the base address in VINFO so that it can be
863   used for all other references with the same base.  */
864
865void
866vect_record_base_alignments (vec_info *vinfo)
867{
868  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
869  class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
870  data_reference *dr;
871  unsigned int i;
872  FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
873    {
874      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
875      stmt_vec_info stmt_info = dr_info->stmt;
876      if (!DR_IS_CONDITIONAL_IN_STMT (dr)
877	  && STMT_VINFO_VECTORIZABLE (stmt_info)
878	  && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
879	{
880	  vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
881
882	  /* If DR is nested in the loop that is being vectorized, we can also
883	     record the alignment of the base wrt the outer loop.  */
884	  if (loop && nested_in_vect_loop_p (loop, stmt_info))
885	    vect_record_base_alignment
886	      (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
887	}
888    }
889}
890
891/* Return the target alignment for the vectorized form of DR_INFO.  */
892
893static poly_uint64
894vect_calculate_target_alignment (dr_vec_info *dr_info)
895{
896  tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
897  return targetm.vectorize.preferred_vector_alignment (vectype);
898}
899
900/* Function vect_compute_data_ref_alignment
901
902   Compute the misalignment of the data reference DR_INFO.
903
904   Output:
905   1. DR_MISALIGNMENT (DR_INFO) is defined.
906
907   FOR NOW: No analysis is actually performed. Misalignment is calculated
908   only for trivial cases. TODO.  */
909
910static void
911vect_compute_data_ref_alignment (dr_vec_info *dr_info)
912{
913  stmt_vec_info stmt_info = dr_info->stmt;
914  vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
915  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
916  class loop *loop = NULL;
917  tree ref = DR_REF (dr_info->dr);
918  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
919
920  if (dump_enabled_p ())
921    dump_printf_loc (MSG_NOTE, vect_location,
922                     "vect_compute_data_ref_alignment:\n");
923
924  if (loop_vinfo)
925    loop = LOOP_VINFO_LOOP (loop_vinfo);
926
927  /* Initialize misalignment to unknown.  */
928  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
929
930  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
931    return;
932
933  innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
934  bool step_preserves_misalignment_p;
935
936  poly_uint64 vector_alignment
937    = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
938  DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
939
940  /* If the main loop has peeled for alignment we have no way of knowing
941     whether the data accesses in the epilogues are aligned.  We can't at
942     compile time answer the question whether we have entered the main loop or
943     not.  Fixes PR 92351.  */
944  if (loop_vinfo)
945    {
946      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
947      if (orig_loop_vinfo
948	  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0)
949	return;
950    }
951
952  unsigned HOST_WIDE_INT vect_align_c;
953  if (!vector_alignment.is_constant (&vect_align_c))
954    return;
955
956  /* No step for BB vectorization.  */
957  if (!loop)
958    {
959      gcc_assert (integer_zerop (drb->step));
960      step_preserves_misalignment_p = true;
961    }
962
963  /* In case the dataref is in an inner-loop of the loop that is being
964     vectorized (LOOP), we use the base and misalignment information
965     relative to the outer-loop (LOOP).  This is ok only if the misalignment
966     stays the same throughout the execution of the inner-loop, which is why
967     we have to check that the stride of the dataref in the inner-loop evenly
968     divides by the vector alignment.  */
969  else if (nested_in_vect_loop_p (loop, stmt_info))
970    {
971      step_preserves_misalignment_p
972	= (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
973
974      if (dump_enabled_p ())
975	{
976	  if (step_preserves_misalignment_p)
977	    dump_printf_loc (MSG_NOTE, vect_location,
978			     "inner step divides the vector alignment.\n");
979	  else
980	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
981			     "inner step doesn't divide the vector"
982			     " alignment.\n");
983	}
984    }
985
986  /* Similarly we can only use base and misalignment information relative to
987     an innermost loop if the misalignment stays the same throughout the
988     execution of the loop.  As above, this is the case if the stride of
989     the dataref evenly divides by the alignment.  */
990  else
991    {
992      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
993      step_preserves_misalignment_p
994	= multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
995
996      if (!step_preserves_misalignment_p && dump_enabled_p ())
997	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
998			 "step doesn't divide the vector alignment.\n");
999    }
1000
1001  unsigned int base_alignment = drb->base_alignment;
1002  unsigned int base_misalignment = drb->base_misalignment;
1003
1004  /* Calculate the maximum of the pooled base address alignment and the
1005     alignment that we can compute for DR itself.  */
1006  innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
1007  if (entry && base_alignment < (*entry)->base_alignment)
1008    {
1009      base_alignment = (*entry)->base_alignment;
1010      base_misalignment = (*entry)->base_misalignment;
1011    }
1012
1013  if (drb->offset_alignment < vect_align_c
1014      || !step_preserves_misalignment_p
1015      /* We need to know whether the step wrt the vectorized loop is
1016	 negative when computing the starting misalignment below.  */
1017      || TREE_CODE (drb->step) != INTEGER_CST)
1018    {
1019      if (dump_enabled_p ())
1020	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021			 "Unknown alignment for access: %T\n", ref);
1022      return;
1023    }
1024
1025  if (base_alignment < vect_align_c)
1026    {
1027      unsigned int max_alignment;
1028      tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1029      if (max_alignment < vect_align_c
1030	  || !vect_can_force_dr_alignment_p (base,
1031					     vect_align_c * BITS_PER_UNIT))
1032	{
1033	  if (dump_enabled_p ())
1034	    dump_printf_loc (MSG_NOTE, vect_location,
1035			     "can't force alignment of ref: %T\n", ref);
1036	  return;
1037	}
1038
1039      /* Force the alignment of the decl.
1040	 NOTE: This is the only change to the code we make during
1041	 the analysis phase, before deciding to vectorize the loop.  */
1042      if (dump_enabled_p ())
1043	dump_printf_loc (MSG_NOTE, vect_location,
1044			 "force alignment of %T\n", ref);
1045
1046      dr_info->base_decl = base;
1047      dr_info->base_misaligned = true;
1048      base_misalignment = 0;
1049    }
1050  poly_int64 misalignment
1051    = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1052
1053  /* If this is a backward running DR then first access in the larger
1054     vectype actually is N-1 elements before the address in the DR.
1055     Adjust misalign accordingly.  */
1056  if (tree_int_cst_sgn (drb->step) < 0)
1057    /* PLUS because STEP is negative.  */
1058    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1059		     * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));
1060
1061  unsigned int const_misalignment;
1062  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1063    {
1064      if (dump_enabled_p ())
1065	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1066			 "Non-constant misalignment for access: %T\n", ref);
1067      return;
1068    }
1069
1070  SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1071
1072  if (dump_enabled_p ())
1073    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1074		     "misalign = %d bytes of ref %T\n",
1075		     DR_MISALIGNMENT (dr_info), ref);
1076
1077  return;
1078}
1079
1080/* Function vect_update_misalignment_for_peel.
1081   Sets DR_INFO's misalignment
1082   - to 0 if it has the same alignment as DR_PEEL_INFO,
1083   - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1084   - to -1 (unknown) otherwise.
1085
1086   DR_INFO - the data reference whose misalignment is to be adjusted.
1087   DR_PEEL_INFO - the data reference whose misalignment is being made
1088		  zero in the vector loop by the peel.
1089   NPEEL - the number of iterations in the peel loop if the misalignment
1090           of DR_PEEL_INFO is known at compile time.  */
1091
1092static void
1093vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1094				   dr_vec_info *dr_peel_info, int npeel)
1095{
1096  unsigned int i;
1097  vec<dr_p> same_aligned_drs;
1098  struct data_reference *current_dr;
1099  stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1100
1101  /* It can be assumed that if dr_info has the same alignment as dr_peel,
1102     it is aligned in the vector loop.  */
1103  same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1104  FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1105    {
1106      if (current_dr != dr_info->dr)
1107        continue;
1108      gcc_assert (!known_alignment_for_access_p (dr_info)
1109		  || !known_alignment_for_access_p (dr_peel_info)
1110		  || (DR_MISALIGNMENT (dr_info)
1111		      == DR_MISALIGNMENT (dr_peel_info)));
1112      SET_DR_MISALIGNMENT (dr_info, 0);
1113      return;
1114    }
1115
1116  unsigned HOST_WIDE_INT alignment;
1117  if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1118      && known_alignment_for_access_p (dr_info)
1119      && known_alignment_for_access_p (dr_peel_info))
1120    {
1121      int misal = DR_MISALIGNMENT (dr_info);
1122      misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1123      misal &= alignment - 1;
1124      SET_DR_MISALIGNMENT (dr_info, misal);
1125      return;
1126    }
1127
1128  if (dump_enabled_p ())
1129    dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1130		     "to unknown (-1).\n");
1131  SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1132}
1133
1134
1135/* Function verify_data_ref_alignment
1136
1137   Return TRUE if DR_INFO can be handled with respect to alignment.  */
1138
1139static opt_result
1140verify_data_ref_alignment (dr_vec_info *dr_info)
1141{
1142  enum dr_alignment_support supportable_dr_alignment
1143    = vect_supportable_dr_alignment (dr_info, false);
1144  if (!supportable_dr_alignment)
1145    return opt_result::failure_at
1146      (dr_info->stmt->stmt,
1147       DR_IS_READ (dr_info->dr)
1148	? "not vectorized: unsupported unaligned load: %T\n"
1149	: "not vectorized: unsupported unaligned store: %T\n",
1150       DR_REF (dr_info->dr));
1151
1152  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1153    dump_printf_loc (MSG_NOTE, vect_location,
1154		     "Vectorizing an unaligned access.\n");
1155
1156  return opt_result::success ();
1157}
1158
1159/* Function vect_verify_datarefs_alignment
1160
1161   Return TRUE if all data references in the loop can be
1162   handled with respect to alignment.  */
1163
1164opt_result
1165vect_verify_datarefs_alignment (loop_vec_info vinfo)
1166{
1167  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1168  struct data_reference *dr;
1169  unsigned int i;
1170
1171  FOR_EACH_VEC_ELT (datarefs, i, dr)
1172    {
1173      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1174      stmt_vec_info stmt_info = dr_info->stmt;
1175
1176      if (!STMT_VINFO_RELEVANT_P (stmt_info))
1177	continue;
1178
1179      /* For interleaving, only the alignment of the first access matters.   */
1180      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1181	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1182	continue;
1183
1184      /* Strided accesses perform only component accesses, alignment is
1185	 irrelevant for them.  */
1186      if (STMT_VINFO_STRIDED_P (stmt_info)
1187	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1188	continue;
1189
1190      opt_result res = verify_data_ref_alignment (dr_info);
1191      if (!res)
1192	return res;
1193    }
1194
1195  return opt_result::success ();
1196}
1197
1198/* Given an memory reference EXP return whether its alignment is less
1199   than its size.  */
1200
1201static bool
1202not_size_aligned (tree exp)
1203{
1204  if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205    return true;
1206
1207  return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208	  > get_object_alignment (exp));
1209}
1210
1211/* Function vector_alignment_reachable_p
1212
1213   Return true if vector alignment for DR_INFO is reachable by peeling
1214   a few loop iterations.  Return false otherwise.  */
1215
1216static bool
1217vector_alignment_reachable_p (dr_vec_info *dr_info)
1218{
1219  stmt_vec_info stmt_info = dr_info->stmt;
1220  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223    {
1224      /* For interleaved access we peel only if number of iterations in
1225	 the prolog loop ({VF - misalignment}), is a multiple of the
1226	 number of the interleaved accesses.  */
1227      int elem_size, mis_in_elements;
1228
1229      /* FORNOW: handle only known alignment.  */
1230      if (!known_alignment_for_access_p (dr_info))
1231	return false;
1232
1233      poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234      poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235      elem_size = vector_element_size (vector_size, nelements);
1236      mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238      if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239	return false;
1240    }
1241
1242  /* If misalignment is known at the compile time then allow peeling
1243     only if natural alignment is reachable through peeling.  */
1244  if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245    {
1246      HOST_WIDE_INT elmsize =
1247		int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248      if (dump_enabled_p ())
1249	{
1250	  dump_printf_loc (MSG_NOTE, vect_location,
1251	                   "data size = %wd. misalignment = %d.\n", elmsize,
1252			   DR_MISALIGNMENT (dr_info));
1253	}
1254      if (DR_MISALIGNMENT (dr_info) % elmsize)
1255	{
1256	  if (dump_enabled_p ())
1257	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258	                     "data size does not divide the misalignment.\n");
1259	  return false;
1260	}
1261    }
1262
1263  if (!known_alignment_for_access_p (dr_info))
1264    {
1265      tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266      bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267      if (dump_enabled_p ())
1268	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269	                 "Unknown misalignment, %snaturally aligned\n",
1270			 is_packed ? "not " : "");
1271      return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272    }
1273
1274  return true;
1275}
1276
1277
1278/* Calculate the cost of the memory access represented by DR_INFO.  */
1279
1280static void
1281vect_get_data_access_cost (dr_vec_info *dr_info,
1282                           unsigned int *inside_cost,
1283                           unsigned int *outside_cost,
1284			   stmt_vector_for_cost *body_cost_vec,
1285			   stmt_vector_for_cost *prologue_cost_vec)
1286{
1287  stmt_vec_info stmt_info = dr_info->stmt;
1288  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289  int ncopies;
1290
1291  if (PURE_SLP_STMT (stmt_info))
1292    ncopies = 1;
1293  else
1294    ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296  if (DR_IS_READ (dr_info->dr))
1297    vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298			prologue_cost_vec, body_cost_vec, false);
1299  else
1300    vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302  if (dump_enabled_p ())
1303    dump_printf_loc (MSG_NOTE, vect_location,
1304                     "vect_get_data_access_cost: inside_cost = %d, "
1305                     "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306}
1307
1308
1309typedef struct _vect_peel_info
1310{
1311  dr_vec_info *dr_info;
1312  int npeel;
1313  unsigned int count;
1314} *vect_peel_info;
1315
1316typedef struct _vect_peel_extended_info
1317{
1318  struct _vect_peel_info peel_info;
1319  unsigned int inside_cost;
1320  unsigned int outside_cost;
1321} *vect_peel_extended_info;
1322
1323
1324/* Peeling hashtable helpers.  */
1325
1326struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327{
1328  static inline hashval_t hash (const _vect_peel_info *);
1329  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330};
1331
1332inline hashval_t
1333peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334{
1335  return (hashval_t) peel_info->npeel;
1336}
1337
1338inline bool
1339peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340{
1341  return (a->npeel == b->npeel);
1342}
1343
1344
1345/* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1346
1347static void
1348vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349			  loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350                          int npeel)
1351{
1352  struct _vect_peel_info elem, *slot;
1353  _vect_peel_info **new_slot;
1354  bool supportable_dr_alignment
1355    = vect_supportable_dr_alignment (dr_info, true);
1356
1357  elem.npeel = npeel;
1358  slot = peeling_htab->find (&elem);
1359  if (slot)
1360    slot->count++;
1361  else
1362    {
1363      slot = XNEW (struct _vect_peel_info);
1364      slot->npeel = npeel;
1365      slot->dr_info = dr_info;
1366      slot->count = 1;
1367      new_slot = peeling_htab->find_slot (slot, INSERT);
1368      *new_slot = slot;
1369    }
1370
1371  if (!supportable_dr_alignment
1372      && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373    slot->count += VECT_MAX_COST;
1374}
1375
1376
1377/* Traverse peeling hash table to find peeling option that aligns maximum
1378   number of data accesses.  */
1379
1380int
1381vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382				     _vect_peel_extended_info *max)
1383{
1384  vect_peel_info elem = *slot;
1385
1386  if (elem->count > max->peel_info.count
1387      || (elem->count == max->peel_info.count
1388          && max->peel_info.npeel > elem->npeel))
1389    {
1390      max->peel_info.npeel = elem->npeel;
1391      max->peel_info.count = elem->count;
1392      max->peel_info.dr_info = elem->dr_info;
1393    }
1394
1395  return 1;
1396}
1397
1398/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399   data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1400   we assume DR0_INFO's misalignment will be zero after peeling.  */
1401
1402static void
1403vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404				dr_vec_info *dr0_info,
1405				unsigned int *inside_cost,
1406				unsigned int *outside_cost,
1407				stmt_vector_for_cost *body_cost_vec,
1408				stmt_vector_for_cost *prologue_cost_vec,
1409				unsigned int npeel,
1410				bool unknown_misalignment)
1411{
1412  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413  unsigned i;
1414  data_reference *dr;
1415
1416  FOR_EACH_VEC_ELT (datarefs, i, dr)
1417    {
1418      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419      stmt_vec_info stmt_info = dr_info->stmt;
1420      if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421	continue;
1422
1423      /* For interleaving, only the alignment of the first access
1424         matters.  */
1425      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427	continue;
1428
1429      /* Strided accesses perform only component accesses, alignment is
1430         irrelevant for them.  */
1431      if (STMT_VINFO_STRIDED_P (stmt_info)
1432	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433	continue;
1434
1435      int save_misalignment;
1436      save_misalignment = DR_MISALIGNMENT (dr_info);
1437      if (npeel == 0)
1438	;
1439      else if (unknown_misalignment && dr_info == dr0_info)
1440	SET_DR_MISALIGNMENT (dr_info, 0);
1441      else
1442	vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443      vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444				 body_cost_vec, prologue_cost_vec);
1445      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446    }
1447}
1448
1449/* Traverse peeling hash table and calculate cost for each peeling option.
1450   Find the one with the lowest cost.  */
1451
1452int
1453vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454				   _vect_peel_extended_info *min)
1455{
1456  vect_peel_info elem = *slot;
1457  int dummy;
1458  unsigned int inside_cost = 0, outside_cost = 0;
1459  stmt_vec_info stmt_info = elem->dr_info->stmt;
1460  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461  stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462		       epilogue_cost_vec;
1463
1464  prologue_cost_vec.create (2);
1465  body_cost_vec.create (2);
1466  epilogue_cost_vec.create (2);
1467
1468  vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469				  &outside_cost, &body_cost_vec,
1470				  &prologue_cost_vec, elem->npeel, false);
1471
1472  body_cost_vec.release ();
1473
1474  outside_cost += vect_get_known_peeling_cost
1475    (loop_vinfo, elem->npeel, &dummy,
1476     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477     &prologue_cost_vec, &epilogue_cost_vec);
1478
1479  /* Prologue and epilogue costs are added to the target model later.
1480     These costs depend only on the scalar iteration cost, the
1481     number of peeling iterations finally chosen, and the number of
1482     misaligned statements.  So discard the information found here.  */
1483  prologue_cost_vec.release ();
1484  epilogue_cost_vec.release ();
1485
1486  if (inside_cost < min->inside_cost
1487      || (inside_cost == min->inside_cost
1488	  && outside_cost < min->outside_cost))
1489    {
1490      min->inside_cost = inside_cost;
1491      min->outside_cost = outside_cost;
1492      min->peel_info.dr_info = elem->dr_info;
1493      min->peel_info.npeel = elem->npeel;
1494      min->peel_info.count = elem->count;
1495    }
1496
1497  return 1;
1498}
1499
1500
1501/* Choose best peeling option by traversing peeling hash table and either
1502   choosing an option with the lowest cost (if cost model is enabled) or the
1503   option that aligns as many accesses as possible.  */
1504
1505static struct _vect_peel_extended_info
1506vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507				       loop_vec_info loop_vinfo)
1508{
1509   struct _vect_peel_extended_info res;
1510
1511   res.peel_info.dr_info = NULL;
1512
1513   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514     {
1515       res.inside_cost = INT_MAX;
1516       res.outside_cost = INT_MAX;
1517       peeling_htab->traverse <_vect_peel_extended_info *,
1518	   		       vect_peeling_hash_get_lowest_cost> (&res);
1519     }
1520   else
1521     {
1522       res.peel_info.count = 0;
1523       peeling_htab->traverse <_vect_peel_extended_info *,
1524	   		       vect_peeling_hash_get_most_frequent> (&res);
1525       res.inside_cost = 0;
1526       res.outside_cost = 0;
1527     }
1528
1529   return res;
1530}
1531
1532/* Return true if the new peeling NPEEL is supported.  */
1533
1534static bool
1535vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536			  unsigned npeel)
1537{
1538  unsigned i;
1539  struct data_reference *dr = NULL;
1540  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541  enum dr_alignment_support supportable_dr_alignment;
1542
1543  /* Ensure that all data refs can be vectorized after the peel.  */
1544  FOR_EACH_VEC_ELT (datarefs, i, dr)
1545    {
1546      int save_misalignment;
1547
1548      if (dr == dr0_info->dr)
1549	continue;
1550
1551      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552      stmt_vec_info stmt_info = dr_info->stmt;
1553      /* For interleaving, only the alignment of the first access
1554	 matters.  */
1555      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557	continue;
1558
1559      /* Strided accesses perform only component accesses, alignment is
1560	 irrelevant for them.  */
1561      if (STMT_VINFO_STRIDED_P (stmt_info)
1562	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563	continue;
1564
1565      save_misalignment = DR_MISALIGNMENT (dr_info);
1566      vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567      supportable_dr_alignment
1568	= vect_supportable_dr_alignment (dr_info, false);
1569      SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571      if (!supportable_dr_alignment)
1572	return false;
1573    }
1574
1575  return true;
1576}
1577
1578/* Function vect_enhance_data_refs_alignment
1579
1580   This pass will use loop versioning and loop peeling in order to enhance
1581   the alignment of data references in the loop.
1582
   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass are not supposed to be
   vectorized.  This restriction will be relaxed.
1587
   This pass will require a cost model to guide it in deciding whether to
   apply peeling, versioning, or a combination of the two.  For example, the
   scheme that Intel uses when given a loop with several memory accesses is
   as follows: choose one memory access ('p') whose alignment you want to
   force by doing peeling.  Then, either (1) generate a loop in which 'p' is
   aligned and all other accesses are not necessarily aligned, or (2) use
   loop versioning to generate one loop in which all accesses are aligned,
   and another loop in which only 'p' is necessarily aligned.
1596
1597   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601   Devising a cost model is the most critical aspect of this work.  It will
1602   guide us on which access to peel for, whether to use loop versioning, how
1603   many versions to create, etc.  The cost model will probably consist of
1604   generic considerations as well as target specific considerations (on
1605   powerpc for example, misaligned stores are more painful than misaligned
1606   loads).
1607
1608   Here are the general steps involved in alignment enhancements:
1609
1610     -- original loop, before alignment analysis:
1611	for (i=0; i<N; i++){
1612	  x = q[i];			# DR_MISALIGNMENT(q) = unknown
1613	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1614	}
1615
1616     -- After vect_compute_data_refs_alignment:
1617	for (i=0; i<N; i++){
1618	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1619	  p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1620	}
1621
1622     -- Possibility 1: we do loop versioning:
1623     if (p is aligned) {
1624	for (i=0; i<N; i++){	# loop 1A
1625	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1626	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1627	}
1628     }
1629     else {
1630	for (i=0; i<N; i++){	# loop 1B
1631	  x = q[i];			# DR_MISALIGNMENT(q) = 3
1632	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1633	}
1634     }
1635
1636     -- Possibility 2: we do loop peeling:
1637     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1638	x = q[i];
1639	p[i] = y;
1640     }
1641     for (i = 3; i < N; i++){	# loop 2A
1642	x = q[i];			# DR_MISALIGNMENT(q) = 0
1643	p[i] = y;			# DR_MISALIGNMENT(p) = unknown
1644     }
1645
1646     -- Possibility 3: combination of loop peeling and versioning:
1647     for (i = 0; i < 3; i++){	# (scalar loop, not to be vectorized).
1648	x = q[i];
1649	p[i] = y;
1650     }
1651     if (p is aligned) {
1652	for (i = 3; i<N; i++){	# loop 3A
1653	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1654	  p[i] = y;			# DR_MISALIGNMENT(p) = 0
1655	}
1656     }
1657     else {
1658	for (i = 3; i<N; i++){	# loop 3B
1659	  x = q[i];			# DR_MISALIGNMENT(q) = 0
1660	  p[i] = y;			# DR_MISALIGNMENT(p) = unaligned
1661	}
1662     }
1663
1664     These loops are later passed to loop_transform to be vectorized.  The
1665     vectorizer will use the alignment information to guide the transformation
1666     (whether to generate regular loads/stores, or with special handling for
1667     misalignment).  */
1668
1669opt_result
1670vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671{
1672  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674  enum dr_alignment_support supportable_dr_alignment;
1675  dr_vec_info *first_store = NULL;
1676  dr_vec_info *dr0_info = NULL;
1677  struct data_reference *dr;
1678  unsigned int i, j;
1679  bool do_peeling = false;
1680  bool do_versioning = false;
1681  unsigned int npeel = 0;
1682  bool one_misalignment_known = false;
1683  bool one_misalignment_unknown = false;
1684  bool one_dr_unsupportable = false;
1685  dr_vec_info *unsupportable_dr_info = NULL;
1686  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1687  unsigned possible_npeel_number = 1;
1688  tree vectype;
1689  unsigned int mis, same_align_drs_max = 0;
1690  hash_table<peel_info_hasher> peeling_htab (1);
1691
1692  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1693
1694  /* Reset data so we can safely be called multiple times.  */
1695  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1696  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1697
1698  /* While cost model enhancements are expected in the future, the high level
1699     view of the code at this time is as follows:
1700
1701     A) If there is a misaligned access then see if peeling to align
1702        this access can make all data references satisfy
1703        vect_supportable_dr_alignment.  If so, update data structures
1704        as needed and return true.
1705
1706     B) If peeling wasn't possible and there is a data reference with an
1707        unknown misalignment that does not satisfy vect_supportable_dr_alignment
1708        then see if loop versioning checks can be used to make all data
1709        references satisfy vect_supportable_dr_alignment.  If so, update
1710        data structures as needed and return true.
1711
1712     C) If neither peeling nor versioning were successful then return false if
1713        any data reference does not satisfy vect_supportable_dr_alignment.
1714
1715     D) Return true (all data references satisfy vect_supportable_dr_alignment).
1716
1717     Note, Possibility 3 above (which is peeling and versioning together) is not
1718     being done at this time.  */
1719
1720  /* (1) Peeling to force alignment.  */
1721
1722  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1723     Considerations:
1724     + How many accesses will become aligned due to the peeling
1725     - How many accesses will become unaligned due to the peeling,
1726       and the cost of misaligned accesses.
1727     - The cost of peeling (the extra runtime checks, the increase
1728       in code size).  */
1729
1730  FOR_EACH_VEC_ELT (datarefs, i, dr)
1731    {
1732      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1733      stmt_vec_info stmt_info = dr_info->stmt;
1734
1735      if (!STMT_VINFO_RELEVANT_P (stmt_info))
1736	continue;
1737
1738      /* For interleaving, only the alignment of the first access
1739         matters.  */
1740      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1741	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1742	continue;
1743
1744      /* For scatter-gather or invariant accesses there is nothing
1745	 to enhance.  */
1746      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1747	  || integer_zerop (DR_STEP (dr)))
1748	continue;
1749
1750      /* Strided accesses perform only component accesses, alignment is
1751	 irrelevant for them.  */
1752      if (STMT_VINFO_STRIDED_P (stmt_info)
1753	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1754	continue;
1755
1756      supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1757      do_peeling = vector_alignment_reachable_p (dr_info);
1758      if (do_peeling)
1759        {
1760          if (known_alignment_for_access_p (dr_info))
1761            {
1762	      unsigned int npeel_tmp = 0;
1763	      bool negative = tree_int_cst_compare (DR_STEP (dr),
1764						    size_zero_node) < 0;
1765
1766	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	      /* If known_alignment_for_access_p then we have set
	         DR_MISALIGNMENT, which is only done if we know the
	         misalignment at compile time, so it is safe to assume the
	         target alignment is constant.  */
1771	      unsigned int target_align =
1772		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1773	      unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1774	      mis = (negative
1775		     ? DR_MISALIGNMENT (dr_info)
1776		     : -DR_MISALIGNMENT (dr_info));
1777	      if (DR_MISALIGNMENT (dr_info) != 0)
1778		npeel_tmp = (mis & (target_align - 1)) / dr_size;
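	      /* E.g., assuming for illustration a target alignment of 16
		 bytes, a 4-byte element and a misalignment of 4 with a
		 positive step: MIS is -4, (-4 & 15) is 12, so NPEEL_TMP is
		 12 / 4 = 3 -- peeling 3 scalar iterations advances the
		 address by 12 bytes and reaches the 16-byte boundary.  */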
1779
              /* For multiple types, it is possible that the bigger type access
                 will have more than one peeling option.  E.g., a loop with two
                 types: one of size (vector size / 4), and the other one of
                 size (vector size / 8).  The vectorization factor will be 8.
                 If both accesses are misaligned by 3, the first one needs one
                 scalar iteration to be aligned, and the second one needs 5.
                 But the first one will also be aligned by peeling 5 scalar
                 iterations, and in that case both accesses will be aligned.
                 Hence, in addition to the immediate peeling amount, we also
                 want to try adding a full vector size, as long as we don't
                 exceed the vectorization factor.
                 We do this automatically for the cost model, since we
                 calculate the cost for every peeling option.  */
1793              if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1794		{
1795		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1796					  ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1797		  possible_npeel_number
1798		    = vect_get_num_vectors (nscalars, vectype);
1799
		  /* NPEEL_TMP is 0 when there is no misalignment, but in
		     that case also allow peeling NELEMENTS as an extra
		     option.  */
1802		  if (DR_MISALIGNMENT (dr_info) == 0)
1803		    possible_npeel_number++;
1804		}
1805
1806	      /* Save info about DR in the hash table.  Also include peeling
1807	         amounts according to the explanation above.  */
1808              for (j = 0; j < possible_npeel_number; j++)
1809                {
1810                  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1811					    dr_info, npeel_tmp);
1812		  npeel_tmp += target_align / dr_size;
1813                }
1814
1815	      one_misalignment_known = true;
1816            }
1817          else
1818            {
              /* If we don't know any misalignment values, we prefer
                 peeling for the data-ref that has the maximum number of
                 data-refs with the same alignment, unless the target prefers
                 to align stores over loads.  */
1823	      unsigned same_align_drs
1824		= STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1825	      if (!dr0_info
1826		  || same_align_drs_max < same_align_drs)
1827		{
1828		  same_align_drs_max = same_align_drs;
1829		  dr0_info = dr_info;
1830		}
1831	      /* For data-refs with the same number of related
1832		 accesses prefer the one where the misalign
1833		 computation will be invariant in the outermost loop.  */
1834	      else if (same_align_drs_max == same_align_drs)
1835		{
1836		  class loop *ivloop0, *ivloop;
1837		  ivloop0 = outermost_invariant_loop_for_expr
1838		    (loop, DR_BASE_ADDRESS (dr0_info->dr));
1839		  ivloop = outermost_invariant_loop_for_expr
1840		    (loop, DR_BASE_ADDRESS (dr));
1841		  if ((ivloop && !ivloop0)
1842		      || (ivloop && ivloop0
1843			  && flow_loop_nested_p (ivloop, ivloop0)))
1844		    dr0_info = dr_info;
1845		}
1846
1847	      one_misalignment_unknown = true;
1848
1849	      /* Check for data refs with unsupportable alignment that
1850	         can be peeled.  */
1851	      if (!supportable_dr_alignment)
1852	      {
1853		one_dr_unsupportable = true;
1854		unsupportable_dr_info = dr_info;
1855	      }
1856
1857	      if (!first_store && DR_IS_WRITE (dr))
1858		first_store = dr_info;
1859            }
1860        }
1861      else
1862        {
1863          if (!aligned_access_p (dr_info))
1864            {
1865              if (dump_enabled_p ())
1866                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867                                 "vector alignment may not be reachable\n");
1868              break;
1869            }
1870        }
1871    }
1872
1873  /* Check if we can possibly peel the loop.  */
1874  if (!vect_can_advance_ivs_p (loop_vinfo)
1875      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1876      || loop->inner)
1877    do_peeling = false;
1878
1879  struct _vect_peel_extended_info peel_for_known_alignment;
1880  struct _vect_peel_extended_info peel_for_unknown_alignment;
1881  struct _vect_peel_extended_info best_peel;
1882
1883  peel_for_unknown_alignment.inside_cost = INT_MAX;
1884  peel_for_unknown_alignment.outside_cost = INT_MAX;
1885  peel_for_unknown_alignment.peel_info.count = 0;
1886
1887  if (do_peeling
1888      && one_misalignment_unknown)
1889    {
      /* Check if the target prefers stores over loads, i.e., whether
         misaligned stores are more expensive than misaligned loads (taking
         drs with the same alignment into account).  */
1893      unsigned int load_inside_cost = 0;
1894      unsigned int load_outside_cost = 0;
1895      unsigned int store_inside_cost = 0;
1896      unsigned int store_outside_cost = 0;
1897      unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1898
1899      stmt_vector_for_cost dummy;
1900      dummy.create (2);
1901      vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1902				      &load_inside_cost,
1903				      &load_outside_cost,
1904				      &dummy, &dummy, estimated_npeels, true);
1905      dummy.release ();
1906
1907      if (first_store)
1908	{
1909	  dummy.create (2);
1910	  vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1911					  &store_inside_cost,
1912					  &store_outside_cost,
1913					  &dummy, &dummy,
1914					  estimated_npeels, true);
1915	  dummy.release ();
1916	}
1917      else
1918	{
1919	  store_inside_cost = INT_MAX;
1920	  store_outside_cost = INT_MAX;
1921	}
1922
1923      if (load_inside_cost > store_inside_cost
1924	  || (load_inside_cost == store_inside_cost
1925	      && load_outside_cost > store_outside_cost))
1926	{
1927	  dr0_info = first_store;
1928	  peel_for_unknown_alignment.inside_cost = store_inside_cost;
1929	  peel_for_unknown_alignment.outside_cost = store_outside_cost;
1930	}
1931      else
1932	{
1933	  peel_for_unknown_alignment.inside_cost = load_inside_cost;
1934	  peel_for_unknown_alignment.outside_cost = load_outside_cost;
1935	}
1936
1937      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1938      prologue_cost_vec.create (2);
1939      epilogue_cost_vec.create (2);
1940
1941      int dummy2;
1942      peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1943	(loop_vinfo, estimated_npeels, &dummy2,
1944	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1945	 &prologue_cost_vec, &epilogue_cost_vec);
1946
1947      prologue_cost_vec.release ();
1948      epilogue_cost_vec.release ();
1949
1950      peel_for_unknown_alignment.peel_info.count = 1
1951	+ STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1952    }
1953
1954  peel_for_unknown_alignment.peel_info.npeel = 0;
1955  peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1956
1957  best_peel = peel_for_unknown_alignment;
1958
1959  peel_for_known_alignment.inside_cost = INT_MAX;
1960  peel_for_known_alignment.outside_cost = INT_MAX;
1961  peel_for_known_alignment.peel_info.count = 0;
1962  peel_for_known_alignment.peel_info.dr_info = NULL;
1963
1964  if (do_peeling && one_misalignment_known)
1965    {
      /* Peeling is possible, but no data access requires alignment in order
         to be supported, so we try to choose the best possible peeling from
         the hash table.  */
1969      peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1970	(&peeling_htab, loop_vinfo);
1971    }
1972
1973  /* Compare costs of peeling for known and unknown alignment. */
1974  if (peel_for_known_alignment.peel_info.dr_info != NULL
1975      && peel_for_unknown_alignment.inside_cost
1976      >= peel_for_known_alignment.inside_cost)
1977    {
1978      best_peel = peel_for_known_alignment;
1979
1980      /* If the best peeling for known alignment has NPEEL == 0, perform no
1981         peeling at all except if there is an unsupportable dr that we can
1982         align.  */
1983      if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1984	do_peeling = false;
1985    }
1986
1987  /* If there is an unsupportable data ref, prefer this over all choices so far
1988     since we'd have to discard a chosen peeling except when it accidentally
1989     aligned the unsupportable data ref.  */
1990  if (one_dr_unsupportable)
1991    dr0_info = unsupportable_dr_info;
1992  else if (do_peeling)
1993    {
1994      /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1995	 TODO: Use nopeel_outside_cost or get rid of it?  */
1996      unsigned nopeel_inside_cost = 0;
1997      unsigned nopeel_outside_cost = 0;
1998
1999      stmt_vector_for_cost dummy;
2000      dummy.create (2);
2001      vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
2002				      &nopeel_outside_cost, &dummy, &dummy,
2003				      0, false);
2004      dummy.release ();
2005
2006      /* Add epilogue costs.  As we do not peel for alignment here, no prologue
2007	 costs will be recorded.  */
2008      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2009      prologue_cost_vec.create (2);
2010      epilogue_cost_vec.create (2);
2011
2012      int dummy2;
2013      nopeel_outside_cost += vect_get_known_peeling_cost
2014	(loop_vinfo, 0, &dummy2,
2015	 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2016	 &prologue_cost_vec, &epilogue_cost_vec);
2017
2018      prologue_cost_vec.release ();
2019      epilogue_cost_vec.release ();
2020
2021      npeel = best_peel.peel_info.npeel;
2022      dr0_info = best_peel.peel_info.dr_info;
2023
      /* If not peeling at all is no more expensive than the best peeling we
	 have so far, don't perform any peeling.  */
2026      if (nopeel_inside_cost <= best_peel.inside_cost)
2027	do_peeling = false;
2028    }
2029
2030  if (do_peeling)
2031    {
2032      stmt_vec_info stmt_info = dr0_info->stmt;
2033      vectype = STMT_VINFO_VECTYPE (stmt_info);
2034
2035      if (known_alignment_for_access_p (dr0_info))
2036        {
2037	  bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2038						size_zero_node) < 0;
2039          if (!npeel)
2040            {
              /* Since it's known at compile time, compute the number of
                 iterations in the peeled loop (the peeling factor) for use in
                 updating DR_MISALIGNMENT values.  The peeling factor is the
                 target alignment minus the misalignment, counted in
                 elements.  */
2046	      mis = (negative
2047		     ? DR_MISALIGNMENT (dr0_info)
2048		     : -DR_MISALIGNMENT (dr0_info));
	      /* If known_alignment_for_access_p then we have set
	         DR_MISALIGNMENT, which is only done if we know the
	         misalignment at compile time, so it is safe to assume the
	         target alignment is constant.  */
2053	      unsigned int target_align =
2054		DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2055	      npeel = ((mis & (target_align - 1))
2056		       / vect_get_scalar_dr_size (dr0_info));
2057            }
2058
2059	  /* For interleaved data access every iteration accesses all the
2060	     members of the group, therefore we divide the number of iterations
2061	     by the group size.  */
2062	  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2063	    npeel /= DR_GROUP_SIZE (stmt_info);
2064
2065          if (dump_enabled_p ())
2066            dump_printf_loc (MSG_NOTE, vect_location,
2067                             "Try peeling by %d\n", npeel);
2068        }
2069
2070      /* Ensure that all datarefs can be vectorized after the peel.  */
2071      if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2072	do_peeling = false;
2073
2074      /* Check if all datarefs are supportable and log.  */
2075      if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2076        {
2077          opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2078          if (!stat)
2079            do_peeling = false;
2080          else
2081	    return stat;
2082        }
2083
2084      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2085      if (do_peeling)
2086        {
2087          unsigned max_allowed_peel
2088	    = param_vect_max_peeling_for_alignment;
2089	  if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP)
2090	    max_allowed_peel = 0;
2091          if (max_allowed_peel != (unsigned)-1)
2092            {
2093              unsigned max_peel = npeel;
2094              if (max_peel == 0)
2095                {
2096		  poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2097		  unsigned HOST_WIDE_INT target_align_c;
2098		  if (target_align.is_constant (&target_align_c))
2099		    max_peel =
2100		      target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2101		  else
2102		    {
2103		      do_peeling = false;
2104		      if (dump_enabled_p ())
2105			dump_printf_loc (MSG_NOTE, vect_location,
2106			  "Disable peeling, max peels set and vector"
2107			  " alignment unknown\n");
2108		    }
2109                }
2110              if (max_peel > max_allowed_peel)
2111                {
2112                  do_peeling = false;
2113                  if (dump_enabled_p ())
2114                    dump_printf_loc (MSG_NOTE, vect_location,
2115                        "Disable peeling, max peels reached: %d\n", max_peel);
2116                }
2117            }
2118        }
2119
2120      /* Cost model #2 - if peeling may result in a remaining loop not
2121	 iterating enough to be vectorized then do not peel.  Since this
2122	 is a cost heuristic rather than a correctness decision, use the
2123	 most likely runtime value for variable vectorization factors.  */
2124      if (do_peeling
2125	  && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2126	{
2127	  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2128	  unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2129	  if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2130	      < assumed_vf + max_peel)
2131	    do_peeling = false;
2132	}
2133
2134      if (do_peeling)
2135        {
2136          /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2137             If the misalignment of DR_i is identical to that of dr0 then set
2138             DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2139             dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2140             by the peeling factor times the element size of DR_i (MOD the
2141             vectorization factor times the size).  Otherwise, the
2142             misalignment of DR_i must be set to unknown.  */
2143	  FOR_EACH_VEC_ELT (datarefs, i, dr)
2144	    if (dr != dr0_info->dr)
2145	      {
2146		/* Strided accesses perform only component accesses, alignment
2147		   is irrelevant for them.  */
2148		dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2149		stmt_info = dr_info->stmt;
2150		if (STMT_VINFO_STRIDED_P (stmt_info)
2151		    && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2152		  continue;
2153
2154		vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2155	      }
2156
2157          LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2158          if (npeel)
2159            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2160          else
2161            LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2162	      = DR_MISALIGNMENT (dr0_info);
2163	  SET_DR_MISALIGNMENT (dr0_info, 0);
2164	  if (dump_enabled_p ())
2165            {
2166              dump_printf_loc (MSG_NOTE, vect_location,
2167                               "Alignment of access forced using peeling.\n");
2168              dump_printf_loc (MSG_NOTE, vect_location,
2169                               "Peeling for alignment will be applied.\n");
2170            }
2171
2172	  /* The inside-loop cost will be accounted for in vectorizable_load
2173	     and vectorizable_store correctly with adjusted alignments.
	     Drop the body_cost_vec on the floor here.  */
2175	  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2176	  gcc_assert (stat);
2177          return stat;
2178        }
2179    }
2180
2181  /* (2) Versioning to force alignment.  */
2182
  /* Try versioning if:
     1) the loop is being optimized for speed and the cost model is not
	cheap,
     2) there is at least one unsupported misaligned data ref with an unknown
        misalignment, and
     3) all misaligned data refs with a known misalignment are supported, and
     4) the number of runtime alignment checks is within reason.  */
2189
2190  do_versioning
2191    = (optimize_loop_nest_for_speed_p (loop)
2192       && !loop->inner /* FORNOW */
2193       && flag_vect_cost_model != VECT_COST_MODEL_CHEAP);
2194
2195  if (do_versioning)
2196    {
2197      FOR_EACH_VEC_ELT (datarefs, i, dr)
2198        {
2199	  dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2200	  stmt_vec_info stmt_info = dr_info->stmt;
2201
2202	  /* For interleaving, only the alignment of the first access
2203	     matters.  */
2204	  if (aligned_access_p (dr_info)
2205	      || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2206		  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2207	    continue;
2208
2209	  if (STMT_VINFO_STRIDED_P (stmt_info))
2210	    {
2211	      /* Strided loads perform only component accesses, alignment is
2212		 irrelevant for them.  */
2213	      if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2214		continue;
2215	      do_versioning = false;
2216	      break;
2217	    }
2218
2219	  supportable_dr_alignment
2220	    = vect_supportable_dr_alignment (dr_info, false);
2221
2222          if (!supportable_dr_alignment)
2223            {
2224              int mask;
2225              tree vectype;
2226
2227              if (known_alignment_for_access_p (dr_info)
2228                  || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2229		  >= (unsigned) param_vect_max_version_for_alignment_checks)
2230                {
2231                  do_versioning = false;
2232                  break;
2233                }
2234
2235	      vectype = STMT_VINFO_VECTYPE (stmt_info);
2236	      gcc_assert (vectype);
2237
2238	      /* At present we don't support versioning for alignment
2239		 with variable VF, since there's no guarantee that the
2240		 VF is a power of two.  We could relax this if we added
2241		 a way of enforcing a power-of-two size.  */
2242	      unsigned HOST_WIDE_INT size;
2243	      if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2244		{
2245		  do_versioning = false;
2246		  break;
2247		}
2248
	      /* Forcing alignment in the first iteration is no good if
		 we don't keep it across iterations.  For now, just disable
		 versioning in this case.
		 ???  We could actually unroll the loop to achieve the required
		 overall step alignment, and forcing the alignment could be
		 done by doing some iterations of the non-vectorized loop.  */
2255	      if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2256			       * DR_STEP_ALIGNMENT (dr),
2257			       DR_TARGET_ALIGNMENT (dr_info)))
2258		{
2259		  do_versioning = false;
2260		  break;
2261		}
2262
2263              /* The rightmost bits of an aligned address must be zeros.
2264                 Construct the mask needed for this test.  For example,
2265                 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2266                 mask must be 15 = 0xf. */
2267	      mask = size - 1;
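	      /* Illustrative sketch of the runtime test this sets up: with
		 mask = 0xf the versioned condition effectively checks
		 ((addr_1 | addr_2 | ... | addr_n) & 0xf) == 0, i.e. that
		 every potentially misaligned address is 16-byte aligned.  */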
2268
2269	      /* FORNOW: use the same mask to test all potentially unaligned
2270		 references in the loop.  */
2271	      if (LOOP_VINFO_PTR_MASK (loop_vinfo)
2272		  && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask)
2273		{
2274		  do_versioning = false;
2275		  break;
2276		}
2277
2278              LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2279	      LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2280            }
2281        }
2282
2283      /* Versioning requires at least one misaligned data reference.  */
2284      if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2285        do_versioning = false;
2286      else if (!do_versioning)
2287        LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2288    }
2289
2290  if (do_versioning)
2291    {
2292      vec<stmt_vec_info> may_misalign_stmts
2293        = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2294      stmt_vec_info stmt_info;
2295
2296      /* It can now be assumed that the data references in the statements
2297         in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2298         of the loop being vectorized.  */
2299      FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2300        {
2301	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2302	  SET_DR_MISALIGNMENT (dr_info, 0);
2303	  if (dump_enabled_p ())
2304            dump_printf_loc (MSG_NOTE, vect_location,
2305                             "Alignment of access forced using versioning.\n");
2306        }
2307
2308      if (dump_enabled_p ())
2309        dump_printf_loc (MSG_NOTE, vect_location,
2310                         "Versioning for alignment will be applied.\n");
2311
2312      /* Peeling and versioning can't be done together at this time.  */
2313      gcc_assert (! (do_peeling && do_versioning));
2314
2315      opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2316      gcc_assert (stat);
2317      return stat;
2318    }
2319
2320  /* This point is reached if neither peeling nor versioning is being done.  */
2321  gcc_assert (! (do_peeling || do_versioning));
2322
2323  opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2324  return stat;
2325}
2326
2327
2328/* Function vect_find_same_alignment_drs.
2329
   Update the same-alignment relations in VINFO for the two data references
   in DDR, taking the chosen vectorization factor into account.  */
2332
2333static void
2334vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2335{
2336  struct data_reference *dra = DDR_A (ddr);
2337  struct data_reference *drb = DDR_B (ddr);
2338  dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2339  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2340  stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2341  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2342
2343  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2344    return;
2345
2346  if (dra == drb)
2347    return;
2348
2349  if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2350      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2351    return;
2352
2353  if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2354      || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2355      || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2356    return;
2357
2358  /* Two references with distance zero have the same alignment.  */
2359  poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2360			  - wi::to_poly_offset (DR_INIT (drb)));
2361  if (maybe_ne (diff, 0))
2362    {
2363      /* Get the wider of the two alignments.  */
2364      poly_uint64 align_a =
2365	exact_div (vect_calculate_target_alignment (dr_info_a),
2366		   BITS_PER_UNIT);
2367      poly_uint64 align_b =
2368	exact_div (vect_calculate_target_alignment (dr_info_b),
2369		   BITS_PER_UNIT);
2370      unsigned HOST_WIDE_INT align_a_c, align_b_c;
2371      if (!align_a.is_constant (&align_a_c)
2372	  || !align_b.is_constant (&align_b_c))
2373	return;
2374
2375      unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2376
2377      /* Require the gap to be a multiple of the larger vector alignment.  */
2378      if (!multiple_p (diff, max_align))
2379	return;
2380    }
2381
2382  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2383  STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2384  if (dump_enabled_p ())
2385    dump_printf_loc (MSG_NOTE, vect_location,
2386		     "accesses have the same alignment: %T and %T\n",
2387		     DR_REF (dra), DR_REF (drb));
2388}
2389
2390
2391/* Function vect_analyze_data_refs_alignment
2392
2393   Analyze the alignment of the data-references in the loop.
2394   Return FALSE if a data reference is found that cannot be vectorized.  */
2395
2396opt_result
2397vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2398{
2399  DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2400
2401  /* Mark groups of data references with same alignment using
2402     data dependence information.  */
2403  vec<ddr_p> ddrs = vinfo->shared->ddrs;
2404  struct data_dependence_relation *ddr;
2405  unsigned int i;
2406
2407  FOR_EACH_VEC_ELT (ddrs, i, ddr)
2408    vect_find_same_alignment_drs (vinfo, ddr);
2409
2410  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2411  struct data_reference *dr;
2412
2413  vect_record_base_alignments (vinfo);
2414  FOR_EACH_VEC_ELT (datarefs, i, dr)
2415    {
2416      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2417      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2418	vect_compute_data_ref_alignment (dr_info);
2419    }
2420
2421  return opt_result::success ();
2422}
2423
2424
2425/* Analyze alignment of DRs of stmts in NODE.  */
2426
2427static bool
2428vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2429{
2430  /* We vectorize from the first scalar stmt in the node unless
     the node is permuted, in which case we start from the first
2432     element in the group.  */
2433  stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2434  dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2435  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2436    first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2437
2438  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2439  vect_compute_data_ref_alignment (dr_info);
2440  /* For creating the data-ref pointer we need alignment of the
2441     first element anyway.  */
2442  if (dr_info != first_dr_info)
2443    vect_compute_data_ref_alignment (first_dr_info);
2444  if (! verify_data_ref_alignment (dr_info))
2445    {
2446      if (dump_enabled_p ())
2447	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2448			 "not vectorized: bad data alignment in basic "
2449			 "block.\n");
2450      return false;
2451    }
2452
2453  return true;
2454}
2455
/* Function vect_slp_analyze_and_verify_instance_alignment
2457
2458   Analyze the alignment of the data-references in the SLP instance.
2459   Return FALSE if a data reference is found that cannot be vectorized.  */
2460
2461bool
2462vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2463{
2464  DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2465
2466  slp_tree node;
2467  unsigned i;
2468  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2469    if (! vect_slp_analyze_and_verify_node_alignment (node))
2470      return false;
2471
2472  node = SLP_INSTANCE_TREE (instance);
2473  if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2474      && ! vect_slp_analyze_and_verify_node_alignment
2475	     (SLP_INSTANCE_TREE (instance)))
2476    return false;
2477
2478  return true;
2479}
2480
2481
2482/* Analyze groups of accesses: check that DR_INFO belongs to a group of
2483   accesses of legal size, step, etc.  Detect gaps, single element
2484   interleaving, and other special cases. Set grouped access info.
2485   Collect groups of strided stores for further use in SLP analysis.
2486   Worker for vect_analyze_group_access.  */
2487
2488static bool
2489vect_analyze_group_access_1 (dr_vec_info *dr_info)
2490{
2491  data_reference *dr = dr_info->dr;
2492  tree step = DR_STEP (dr);
2493  tree scalar_type = TREE_TYPE (DR_REF (dr));
2494  HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2495  stmt_vec_info stmt_info = dr_info->stmt;
2496  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2497  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2498  HOST_WIDE_INT dr_step = -1;
2499  HOST_WIDE_INT groupsize, last_accessed_element = 1;
2500  bool slp_impossible = false;
2501
2502  /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2503     size of the interleaving group (including gaps).  */
2504  if (tree_fits_shwi_p (step))
2505    {
2506      dr_step = tree_to_shwi (step);
2507      /* Check that STEP is a multiple of type size.  Otherwise there is
2508         a non-element-sized gap at the end of the group which we
2509	 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2510	 ???  As we can handle non-constant step fine here we should
2511	 simply remove uses of DR_GROUP_GAP between the last and first
2512	 element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2513	 simply not include that gap.  */
2514      if ((dr_step % type_size) != 0)
2515	{
2516	  if (dump_enabled_p ())
2517	    dump_printf_loc (MSG_NOTE, vect_location,
2518			     "Step %T is not a multiple of the element size"
2519			     " for %T\n",
2520			     step, DR_REF (dr));
2521	  return false;
2522	}
2523      groupsize = absu_hwi (dr_step) / type_size;
2524    }
2525  else
2526    groupsize = 0;
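  /* E.g., a constant step of 32 bytes with 4-byte elements gives a
     GROUPSIZE of 8; if only some of those 8 slots are actually accessed
     in the loop the group has gaps (illustrative numbers).  */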
2527
  /* A non-consecutive access is possible only if it is part of an
     interleaving group.  */
2529  if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2530    {
      /* Check whether this DR is part of an interleaving group of which it
	 is the single element accessed in the loop.  */
2533
2534      /* Gaps are supported only for loads. STEP must be a multiple of the type
2535	 size.  */
2536      if (DR_IS_READ (dr)
2537	  && (dr_step % type_size) == 0
2538	  && groupsize > 0)
2539	{
2540	  DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2541	  DR_GROUP_SIZE (stmt_info) = groupsize;
2542	  DR_GROUP_GAP (stmt_info) = groupsize - 1;
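	  /* E.g., a load of a[4*i] with 4-byte elements has a step of 16
	     bytes, so GROUPSIZE is 4 and the gap after the single accessed
	     element is 3 (illustrative numbers).  */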
2543	  if (dump_enabled_p ())
2544	    dump_printf_loc (MSG_NOTE, vect_location,
2545			     "Detected single element interleaving %T"
2546			     " step %T\n",
2547			     DR_REF (dr), step);
2548
2549	  return true;
2550	}
2551
2552      if (dump_enabled_p ())
2553	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2554			 "not consecutive access %G", stmt_info->stmt);
2555
2556      if (bb_vinfo)
2557	{
2558	  /* Mark the statement as unvectorizable.  */
2559	  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2560	  return true;
2561	}
2562
2563      if (dump_enabled_p ())
2564	dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2565      STMT_VINFO_STRIDED_P (stmt_info) = true;
2566      return true;
2567    }
2568
2569  if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2570    {
2571      /* First stmt in the interleaving chain. Check the chain.  */
2572      stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2573      struct data_reference *data_ref = dr;
2574      unsigned int count = 1;
2575      tree prev_init = DR_INIT (data_ref);
2576      HOST_WIDE_INT diff, gaps = 0;
2577
2578      /* By construction, all group members have INTEGER_CST DR_INITs.  */
2579      while (next)
2580        {
2581          /* We never have the same DR multiple times.  */
2582          gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2583				DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2584
2585	  data_ref = STMT_VINFO_DATA_REF (next);
2586
2587	  /* All group members have the same STEP by construction.  */
2588	  gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2589
2590          /* Check that the distance between two accesses is equal to the type
2591             size. Otherwise, we have gaps.  */
2592          diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2593                  - TREE_INT_CST_LOW (prev_init)) / type_size;
2594	  if (diff != 1)
2595	    {
2596	      /* FORNOW: SLP of accesses with gaps is not supported.  */
2597	      slp_impossible = true;
2598	      if (DR_IS_WRITE (data_ref))
2599		{
2600                  if (dump_enabled_p ())
2601                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2602                                     "interleaved store with gaps\n");
2603		  return false;
2604		}
2605
2606              gaps += diff - 1;
2607	    }
2608
2609	  last_accessed_element += diff;
2610
2611          /* Store the gap from the previous member of the group. If there is no
2612             gap in the access, DR_GROUP_GAP is always 1.  */
2613	  DR_GROUP_GAP (next) = diff;
2614
2615	  prev_init = DR_INIT (data_ref);
2616	  next = DR_GROUP_NEXT_ELEMENT (next);
2617	  /* Count the number of data-refs in the chain.  */
2618	  count++;
2619        }
2620
2621      if (groupsize == 0)
2622        groupsize = count + gaps;
2623
      /* This could be UINT_MAX, but since we generate the code in a very
         inefficient way we have to cap it earlier.  See PR78699 for an
         example.  */
2626      if (groupsize > 4096)
2627	{
2628	  if (dump_enabled_p ())
2629	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630			     "group is too large\n");
2631	  return false;
2632	}
2633
2634      /* Check that the size of the interleaving is equal to count for stores,
2635         i.e., that there are no gaps.  */
2636      if (groupsize != count
2637	  && !DR_IS_READ (dr))
2638        {
2639	  groupsize = count;
2640	  STMT_VINFO_STRIDED_P (stmt_info) = true;
2641	}
2642
2643      /* If there is a gap after the last load in the group it is the
2644	 difference between the groupsize and the last accessed
2645	 element.
2646	 When there is no gap, this difference should be 0.  */
2647      DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
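      /* E.g., with GROUPSIZE 8 and the last accessed element being the
	 sixth, a trailing gap of 2 elements is recorded on the first
	 statement of the group (illustrative numbers).  */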
2648
2649      DR_GROUP_SIZE (stmt_info) = groupsize;
2650      if (dump_enabled_p ())
2651	{
2652	  dump_printf_loc (MSG_NOTE, vect_location,
2653			   "Detected interleaving ");
2654	  if (DR_IS_READ (dr))
2655	    dump_printf (MSG_NOTE, "load ");
2656	  else if (STMT_VINFO_STRIDED_P (stmt_info))
2657	    dump_printf (MSG_NOTE, "strided store ");
2658	  else
2659	    dump_printf (MSG_NOTE, "store ");
2660	  dump_printf (MSG_NOTE, "of size %u\n",
2661		       (unsigned)groupsize);
2662	  dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2663	  next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2664	  while (next)
2665	    {
2666	      if (DR_GROUP_GAP (next) != 1)
2667		dump_printf_loc (MSG_NOTE, vect_location,
2668				 "\t<gap of %d elements>\n",
2669				 DR_GROUP_GAP (next) - 1);
2670	      dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2671	      next = DR_GROUP_NEXT_ELEMENT (next);
2672	    }
2673	  if (DR_GROUP_GAP (stmt_info) != 0)
2674	    dump_printf_loc (MSG_NOTE, vect_location,
2675			     "\t<gap of %d elements>\n",
2676			     DR_GROUP_GAP (stmt_info));
2677	}
2678
      /* SLP: create an SLP data structure for every interleaving group of
	 stores for further analysis in vect_analyze_slp.  */
2681      if (DR_IS_WRITE (dr) && !slp_impossible)
2682	{
2683	  if (loop_vinfo)
2684	    LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2685	  if (bb_vinfo)
2686	    BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2687	}
2688    }
2689
2690  return true;
2691}
2692
2693/* Analyze groups of accesses: check that DR_INFO belongs to a group of
2694   accesses of legal size, step, etc.  Detect gaps, single element
2695   interleaving, and other special cases. Set grouped access info.
2696   Collect groups of strided stores for further use in SLP analysis.  */
2697
2698static bool
2699vect_analyze_group_access (dr_vec_info *dr_info)
2700{
2701  if (!vect_analyze_group_access_1 (dr_info))
2702    {
2703      /* Dissolve the group if present.  */
2704      stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2705      while (stmt_info)
2706	{
2707	  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2708	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2709	  DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2710	  stmt_info = next;
2711	}
2712      return false;
2713    }
2714  return true;
2715}
2716
2717/* Analyze the access pattern of the data-reference DR_INFO.
2718   In case of non-consecutive accesses call vect_analyze_group_access() to
2719   analyze groups of accesses.  */
2720
2721static bool
2722vect_analyze_data_ref_access (dr_vec_info *dr_info)
2723{
2724  data_reference *dr = dr_info->dr;
2725  tree step = DR_STEP (dr);
2726  tree scalar_type = TREE_TYPE (DR_REF (dr));
2727  stmt_vec_info stmt_info = dr_info->stmt;
2728  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2729  class loop *loop = NULL;
2730
2731  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2732    return true;
2733
2734  if (loop_vinfo)
2735    loop = LOOP_VINFO_LOOP (loop_vinfo);
2736
2737  if (loop_vinfo && !step)
2738    {
2739      if (dump_enabled_p ())
2740	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2741	                 "bad data-ref access in loop\n");
2742      return false;
2743    }
2744
2745  /* Allow loads with zero step in inner-loop vectorization.  */
2746  if (loop_vinfo && integer_zerop (step))
2747    {
2748      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2749      if (!nested_in_vect_loop_p (loop, stmt_info))
2750	return DR_IS_READ (dr);
2751      /* Allow references with zero step for outer loops marked
2752	 with pragma omp simd only - it guarantees absence of
2753	 loop-carried dependencies between inner loop iterations.  */
2754      if (loop->safelen < 2)
2755	{
2756	  if (dump_enabled_p ())
2757	    dump_printf_loc (MSG_NOTE, vect_location,
2758			     "zero step in inner loop of nest\n");
2759	  return false;
2760	}
2761    }
2762
2763  if (loop && nested_in_vect_loop_p (loop, stmt_info))
2764    {
2765      /* Interleaved accesses are not yet supported within outer-loop
2766        vectorization for references in the inner-loop.  */
2767      DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2768
2769      /* For the rest of the analysis we use the outer-loop step.  */
2770      step = STMT_VINFO_DR_STEP (stmt_info);
2771      if (integer_zerop (step))
2772	{
2773	  if (dump_enabled_p ())
2774	    dump_printf_loc (MSG_NOTE, vect_location,
2775	                     "zero step in outer loop.\n");
2776	  return DR_IS_READ (dr);
2777	}
2778    }
2779
2780  /* Consecutive?  */
2781  if (TREE_CODE (step) == INTEGER_CST)
2782    {
2783      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2784      if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2785	  || (dr_step < 0
2786	      && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2787	{
2788	  /* Mark that it is not interleaving.  */
2789	  DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2790	  return true;
2791	}
2792    }
2793
2794  if (loop && nested_in_vect_loop_p (loop, stmt_info))
2795    {
2796      if (dump_enabled_p ())
2797	dump_printf_loc (MSG_NOTE, vect_location,
2798	                 "grouped access in outer loop.\n");
2799      return false;
2800    }
2801
2802
2803  /* Assume this is a DR handled by non-constant strided load case.  */
2804  if (TREE_CODE (step) != INTEGER_CST)
2805    return (STMT_VINFO_STRIDED_P (stmt_info)
2806	    && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2807		|| vect_analyze_group_access (dr_info)));
2808
2809  /* Not consecutive access - check if it's a part of interleaving group.  */
2810  return vect_analyze_group_access (dr_info);
2811}
2812
/* Comparison function used to sort data-references DRA and DRB into
   chunks suitable for grouping.  */
2815
2816static int
2817dr_group_sort_cmp (const void *dra_, const void *drb_)
2818{
2819  data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2820  data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2821  int cmp;
2822
2823  /* Stabilize sort.  */
2824  if (dra == drb)
2825    return 0;
2826
2827  /* DRs in different loops never belong to the same group.  */
2828  loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2829  loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2830  if (loopa != loopb)
2831    return loopa->num < loopb->num ? -1 : 1;
2832
2833  /* Ordering of DRs according to base.  */
2834  cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2835			       DR_BASE_ADDRESS (drb));
2836  if (cmp != 0)
2837    return cmp;
2838
2839  /* And according to DR_OFFSET.  */
2840  cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2841  if (cmp != 0)
2842    return cmp;
2843
2844  /* Put reads before writes.  */
2845  if (DR_IS_READ (dra) != DR_IS_READ (drb))
2846    return DR_IS_READ (dra) ? -1 : 1;
2847
  /* Then sort by access size.  */
2849  cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2850			       TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2851  if (cmp != 0)
2852    return cmp;
2853
  /* And by step.  */
2855  cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2856  if (cmp != 0)
2857    return cmp;
2858
  /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2860  cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2861  if (cmp == 0)
2862    return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2863  return cmp;
2864}
2865
2866/* If OP is the result of a conversion, return the unconverted value,
2867   otherwise return null.  */
2868
2869static tree
2870strip_conversion (tree op)
2871{
2872  if (TREE_CODE (op) != SSA_NAME)
2873    return NULL_TREE;
2874  gimple *stmt = SSA_NAME_DEF_STMT (op);
2875  if (!is_gimple_assign (stmt)
2876      || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2877    return NULL_TREE;
2878  return gimple_assign_rhs1 (stmt);
2879}
2880
2881/* Return true if vectorizable_* routines can handle statements STMT1_INFO
2882   and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2883   be grouped in SLP mode.  */
2884
2885static bool
2886can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2887		   bool allow_slp_p)
2888{
2889  if (gimple_assign_single_p (stmt1_info->stmt))
2890    return gimple_assign_single_p (stmt2_info->stmt);
2891
2892  gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2893  if (call1 && gimple_call_internal_p (call1))
2894    {
2895      /* Check for two masked loads or two masked stores.  */
2896      gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2897      if (!call2 || !gimple_call_internal_p (call2))
2898	return false;
2899      internal_fn ifn = gimple_call_internal_fn (call1);
2900      if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2901	return false;
2902      if (ifn != gimple_call_internal_fn (call2))
2903	return false;
2904
2905      /* Check that the masks are the same.  Cope with casts of masks,
2906	 like those created by build_mask_conversion.  */
2907      tree mask1 = gimple_call_arg (call1, 2);
2908      tree mask2 = gimple_call_arg (call2, 2);
2909      if (!operand_equal_p (mask1, mask2, 0)
2910          && (ifn == IFN_MASK_STORE || !allow_slp_p))
2911	{
2912	  mask1 = strip_conversion (mask1);
2913	  if (!mask1)
2914	    return false;
2915	  mask2 = strip_conversion (mask2);
2916	  if (!mask2)
2917	    return false;
2918	  if (!operand_equal_p (mask1, mask2, 0))
2919	    return false;
2920	}
2921      return true;
2922    }
2923
2924  return false;
2925}
2926
2927/* Function vect_analyze_data_ref_accesses.
2928
2929   Analyze the access pattern of all the data references in the loop.
2930
2931   FORNOW: the only access pattern that is considered vectorizable is a
2932	   simple step 1 (consecutive) access.
2933
2934   FORNOW: handle only arrays and pointer accesses.  */
2935
2936opt_result
2937vect_analyze_data_ref_accesses (vec_info *vinfo)
2938{
2939  unsigned int i;
2940  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2941  struct data_reference *dr;
2942
2943  DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2944
2945  if (datarefs.is_empty ())
2946    return opt_result::success ();
2947
2948  /* Sort the array of datarefs to make building the interleaving chains
2949     linear.  Don't modify the original vector's order, it is needed for
2950     determining what dependencies are reversed.  */
2951  vec<data_reference_p> datarefs_copy = datarefs.copy ();
2952  datarefs_copy.qsort (dr_group_sort_cmp);
2953  hash_set<stmt_vec_info> to_fixup;
2954
2955  /* Build the interleaving chains.  */
2956  for (i = 0; i < datarefs_copy.length () - 1;)
2957    {
2958      data_reference_p dra = datarefs_copy[i];
2959      dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2960      stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2961      stmt_vec_info lastinfo = NULL;
2962      if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2963	  || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2964	{
2965	  ++i;
2966	  continue;
2967	}
2968      for (i = i + 1; i < datarefs_copy.length (); ++i)
2969	{
2970	  data_reference_p drb = datarefs_copy[i];
2971	  dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2972	  stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2973	  if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2974	      || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2975	    break;
2976
	  /* ???  Imperfect sorting (non-compatible types, non-modulo
	     accesses, same accesses) can lead to a group being artificially
	     split here as we don't just skip over those.  If it really
	     matters we can push those to a worklist and re-iterate
	     over them.  Then we can just skip ahead to the next DR here.  */
2982
2983	  /* DRs in a different loop should not be put into the same
2984	     interleaving group.  */
2985	  if (gimple_bb (DR_STMT (dra))->loop_father
2986	      != gimple_bb (DR_STMT (drb))->loop_father)
2987	    break;
2988
	  /* Check that the data-refs have the same first location (except
	     init) and that they are both either stores or loads (not one
	     load and one store, and not masked loads or stores).  */
2992	  if (DR_IS_READ (dra) != DR_IS_READ (drb)
2993	      || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2994					DR_BASE_ADDRESS (drb)) != 0
2995	      || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2996	      || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2997	    break;
2998
2999	  /* Check that the data-refs have the same constant size.  */
3000	  tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
3001	  tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
3002	  if (!tree_fits_uhwi_p (sza)
3003	      || !tree_fits_uhwi_p (szb)
3004	      || !tree_int_cst_equal (sza, szb))
3005	    break;
3006
3007	  /* Check that the data-refs have the same step.  */
3008	  if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
3009	    break;
3010
3011	  /* Check the types are compatible.
3012	     ???  We don't distinguish this during sorting.  */
3013	  if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
3014				   TREE_TYPE (DR_REF (drb))))
3015	    break;
3016
3017	  /* Check that the DR_INITs are compile-time constants.  */
3018	  if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3019	      || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3020	    break;
3021
3022	  /* Different .GOMP_SIMD_LANE calls still give the same lane,
3023	     just hold extra information.  */
3024	  if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a)
3025	      && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b)
3026	      && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0)
3027	    break;
3028
3029	  /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3030	  HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3031	  HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3032	  HOST_WIDE_INT init_prev
3033	    = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3034	  gcc_assert (init_a <= init_b
3035		      && init_a <= init_prev
3036		      && init_prev <= init_b);
3037
3038	  /* Do not place the same access in the interleaving chain twice.  */
3039	  if (init_b == init_prev)
3040	    {
3041	      gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3042			  < gimple_uid (DR_STMT (drb)));
3043	      /* Simply link in duplicates and fix up the chain below.  */
3044	    }
3045	  else
3046	    {
3047	      /* If init_b == init_a + the size of the type * k, we have an
3048		 interleaving, and DRA is accessed before DRB.  */
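	      /* For example (illustrative only, assuming 4-byte ints):
		 loads a[4*i] and a[4*i+2] have init_a = 0 and init_b = 8,
		 and 8 is a multiple of type_size_a = 4, so they can form
		 an interleaving group with a gap in the middle.  */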
3049	      HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3050	      if (type_size_a == 0
3051		  || (init_b - init_a) % type_size_a != 0)
3052		break;
3053
	      /* If we have a store, require the accesses to be adjacent.
		 This splits groups into chunks we support (we don't support
		 vectorization of stores with gaps).  */
3057	      if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3058		break;
3059
	      /* If the step is constant and nonzero, require it to be
		 greater than the difference between the data-refs' inits;
		 this splits groups into suitable sizes.  */
3063	      if (tree_fits_shwi_p (DR_STEP (dra)))
3064		{
3065		  HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3066		  if (step != 0 && step <= (init_b - init_a))
3067		    break;
3068		}
3069	    }
3070
3071	  if (dump_enabled_p ())
3072	    dump_printf_loc (MSG_NOTE, vect_location,
3073			     DR_IS_READ (dra)
3074			     ? "Detected interleaving load %T and %T\n"
3075			     : "Detected interleaving store %T and %T\n",
3076			     DR_REF (dra), DR_REF (drb));
3077
3078	  /* Link the found element into the group list.  */
3079	  if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3080	    {
3081	      DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3082	      lastinfo = stmtinfo_a;
3083	    }
3084	  DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3085	  DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3086	  lastinfo = stmtinfo_b;
3087
3088	  STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3089	    = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3090
3091	  if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3092	    dump_printf_loc (MSG_NOTE, vect_location,
3093			     "Load suitable for SLP vectorization only.\n");
3094
3095	  if (init_b == init_prev
3096	      && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3097	      && dump_enabled_p ())
3098	    dump_printf_loc (MSG_NOTE, vect_location,
3099			     "Queuing group with duplicate access for fixup\n");
3100	}
3101    }
3102
  /* Fix up groups with duplicate entries by splitting them.  */
3104  while (1)
3105    {
3106      hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3107      if (!(it != to_fixup.end ()))
3108	break;
3109      stmt_vec_info grp = *it;
3110      to_fixup.remove (grp);
3111
3112      /* Find the earliest duplicate group member.  */
3113      unsigned first_duplicate = -1u;
3114      stmt_vec_info next, g = grp;
3115      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3116	{
3117	  if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3118				  DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3119	      && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3120	    first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3121	  g = next;
3122	}
3123      if (first_duplicate == -1U)
3124	continue;
3125
      /* Then move all stmts after the first duplicate to a new group.
	 Note this is a heuristic, but one with the property that the
	 group GRP is fixed up completely.  */
3129      g = grp;
3130      stmt_vec_info newgroup = NULL, ng = grp;
3131      while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3132	{
3133	  if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3134	    {
3135	      DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3136	      if (!newgroup)
3137		newgroup = next;
3138	      else
3139		DR_GROUP_NEXT_ELEMENT (ng) = next;
3140	      ng = next;
3141	      DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3142	    }
3143	  else
3144	    g = DR_GROUP_NEXT_ELEMENT (g);
3145	}
3146      DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3147
      /* Fix up the new group, which may still contain duplicates.  */
3149      to_fixup.add (newgroup);
3150    }
3151
3152  FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3153    {
3154      dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3155      if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3156	  && !vect_analyze_data_ref_access (dr_info))
3157	{
3158	  if (dump_enabled_p ())
3159	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3160			     "not vectorized: complicated access pattern.\n");
3161
3162	  if (is_a <bb_vec_info> (vinfo))
3163	    {
3164	      /* Mark the statement as not vectorizable.  */
3165	      STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3166	      continue;
3167	    }
3168	  else
3169	    {
3170	      datarefs_copy.release ();
3171	      return opt_result::failure_at (dr_info->stmt->stmt,
3172					     "not vectorized:"
3173					     " complicated access pattern.\n");
3174	    }
3175	}
3176    }
3177
3178  datarefs_copy.release ();
3179  return opt_result::success ();
3180}
3181
3182/* Function vect_vfa_segment_size.
3183
3184   Input:
3185     DR_INFO: The data reference.
3186     LENGTH_FACTOR: segment length to consider.
3187
3188   Return a value suitable for the dr_with_seg_len::seg_len field.
3189   This is the "distance travelled" by the pointer from the first
3190   iteration in the segment to the last.  Note that it does not include
3191   the size of the access; in effect it only describes the first byte.  */
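
/* For instance (illustrative numbers only): with a DR_STEP of 4 bytes and a
   LENGTH_FACTOR of 8 iterations, the segment length is (8 - 1) * 4 = 28
   bytes; the bytes of the final access itself are accounted for separately
   by vect_vfa_access_size below.  */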
3192
3193static tree
3194vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3195{
3196  length_factor = size_binop (MINUS_EXPR,
3197			      fold_convert (sizetype, length_factor),
3198			      size_one_node);
3199  return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3200		     length_factor);
3201}
3202
3203/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3204   gives the worst-case number of bytes covered by the segment.  */
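
/* For example (illustrative only): a group of four contiguous 4-byte loads
   with no gap has an access size of 4 * 4 = 16 bytes, while an ungrouped
   scalar access simply contributes its own TYPE_SIZE_UNIT.  */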
3205
3206static unsigned HOST_WIDE_INT
3207vect_vfa_access_size (dr_vec_info *dr_info)
3208{
3209  stmt_vec_info stmt_vinfo = dr_info->stmt;
3210  tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3211  unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3212  unsigned HOST_WIDE_INT access_size = ref_size;
3213  if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3214    {
3215      gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3216      access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3217    }
3218  if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3219      && (vect_supportable_dr_alignment (dr_info, false)
3220	  == dr_explicit_realign_optimized))
3221    {
3222      /* We might access a full vector's worth.  */
3223      tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3224      access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3225    }
3226  return access_size;
3227}
3228
3229/* Get the minimum alignment for all the scalar accesses that DR_INFO
3230   describes.  */
3231
3232static unsigned int
3233vect_vfa_align (dr_vec_info *dr_info)
3234{
3235  return dr_alignment (dr_info->dr);
3236}
3237
/* Function vect_compile_time_alias.
3239
3240   Given data references A and B with equal base and offset, see whether
3241   the alias relation can be decided at compilation time.  Return 1 if
3242   it can and the references alias, 0 if it can and the references do
3243   not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3244   SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3245   of dr_with_seg_len::{seg_len,access_size} for A and B.  */
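
/* A minimal illustration (hypothetical values, nonnegative steps): if A
   starts at offset 0 with a 12-byte segment and a 4-byte access size, it
   covers bytes [0, 16); if B starts at offset 16 with the same lengths, it
   covers [16, 32).  The ranges are provably disjoint, so 0 is returned.  */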
3246
3247static int
3248vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3249			 tree segment_length_a, tree segment_length_b,
3250			 unsigned HOST_WIDE_INT access_size_a,
3251			 unsigned HOST_WIDE_INT access_size_b)
3252{
3253  poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3254  poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3255  poly_uint64 const_length_a;
3256  poly_uint64 const_length_b;
3257
3258  /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3259     bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3260     [a, a+12) */
3261  if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3262    {
3263      const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3264      offset_a -= const_length_a;
3265    }
3266  else
3267    const_length_a = tree_to_poly_uint64 (segment_length_a);
3268  if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3269    {
3270      const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3271      offset_b -= const_length_b;
3272    }
3273  else
3274    const_length_b = tree_to_poly_uint64 (segment_length_b);
3275
3276  const_length_a += access_size_a;
3277  const_length_b += access_size_b;
3278
3279  if (ranges_known_overlap_p (offset_a, const_length_a,
3280			      offset_b, const_length_b))
3281    return 1;
3282
3283  if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3284			       offset_b, const_length_b))
3285    return 0;
3286
3287  return -1;
3288}
3289
3290/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3291   in DDR is >= VF.  */
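
/* For instance (illustrative only): a minimum nonzero distance of 8
   iterations is fine for any VF of 8 or less, because the dependent
   accesses then land in different vector iterations, which still execute
   in their original order, so no runtime alias test is needed for the
   DDR.  */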
3292
3293static bool
3294dependence_distance_ge_vf (data_dependence_relation *ddr,
3295			   unsigned int loop_depth, poly_uint64 vf)
3296{
3297  if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3298      || DDR_NUM_DIST_VECTS (ddr) == 0)
3299    return false;
3300
3301  /* If the dependence is exact, we should have limited the VF instead.  */
3302  gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3303
3304  unsigned int i;
3305  lambda_vector dist_v;
3306  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3307    {
3308      HOST_WIDE_INT dist = dist_v[loop_depth];
3309      if (dist != 0
3310	  && !(dist > 0 && DDR_REVERSED_P (ddr))
3311	  && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3312	return false;
3313    }
3314
3315  if (dump_enabled_p ())
3316    dump_printf_loc (MSG_NOTE, vect_location,
3317		     "dependence distance between %T and %T is >= VF\n",
3318		     DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3319
3320  return true;
3321}
3322
3323/* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3324
3325static void
3326dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3327{
3328  dump_printf (dump_kind, "%s (%T) >= ",
3329	       lower_bound.unsigned_p ? "unsigned" : "abs",
3330	       lower_bound.expr);
3331  dump_dec (dump_kind, lower_bound.min_value);
3332}
3333
3334/* Record that the vectorized loop requires the vec_lower_bound described
3335   by EXPR, UNSIGNED_P and MIN_VALUE.  */
3336
3337static void
3338vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3339			poly_uint64 min_value)
3340{
3341  vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3342  for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3343    if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3344      {
3345	unsigned_p &= lower_bounds[i].unsigned_p;
3346	min_value = upper_bound (lower_bounds[i].min_value, min_value);
3347	if (lower_bounds[i].unsigned_p != unsigned_p
3348	    || maybe_lt (lower_bounds[i].min_value, min_value))
3349	  {
3350	    lower_bounds[i].unsigned_p = unsigned_p;
3351	    lower_bounds[i].min_value = min_value;
3352	    if (dump_enabled_p ())
3353	      {
3354		dump_printf_loc (MSG_NOTE, vect_location,
3355				 "updating run-time check to ");
3356		dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3357		dump_printf (MSG_NOTE, "\n");
3358	      }
3359	  }
3360	return;
3361      }
3362
3363  vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3364  if (dump_enabled_p ())
3365    {
3366      dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3367      dump_lower_bound (MSG_NOTE, lower_bound);
3368      dump_printf (MSG_NOTE, "\n");
3369    }
3370  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3371}
3372
3373/* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3374   will span fewer than GAP bytes.  */
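
/* For example (illustrative only): with an estimated vectorization factor
   of 4 and ungrouped 4-byte scalar accesses, a gap of up to 4 * 4 = 16
   bytes counts as small.  */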
3375
3376static bool
3377vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3378		  poly_int64 gap)
3379{
3380  stmt_vec_info stmt_info = dr_info->stmt;
3381  HOST_WIDE_INT count
3382    = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3383  if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3384    count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3385  return (estimated_poly_value (gap)
3386	  <= count * vect_get_scalar_dr_size (dr_info));
3387}
3388
3389/* Return true if we know that there is no alias between DR_INFO_A and
3390   DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3391   When returning true, set *LOWER_BOUND_OUT to this N.  */
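
/* As an illustration (hypothetical 4-byte accesses): for accesses at byte
   offsets 0 and 4 from the same base and with the same step, the lower
   bound is 4 + 4 - 0 = 8, i.e. the accesses cannot alias once
   abs (DR_STEP) is known to be at least 8 bytes.  */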
3392
3393static bool
3394vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3395				poly_uint64 *lower_bound_out)
3396{
3397  /* Check that there is a constant gap of known sign between DR_A
3398     and DR_B.  */
3399  data_reference *dr_a = dr_info_a->dr;
3400  data_reference *dr_b = dr_info_b->dr;
3401  poly_int64 init_a, init_b;
3402  if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3403      || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3404      || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3405      || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3406      || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3407      || !ordered_p (init_a, init_b))
3408    return false;
3409
3410  /* Sort DR_A and DR_B by the address they access.  */
3411  if (maybe_lt (init_b, init_a))
3412    {
3413      std::swap (init_a, init_b);
3414      std::swap (dr_info_a, dr_info_b);
3415      std::swap (dr_a, dr_b);
3416    }
3417
3418  /* If the two accesses could be dependent within a scalar iteration,
3419     make sure that we'd retain their order.  */
3420  if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3421      && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3422    return false;
3423
3424  /* There is no alias if abs (DR_STEP) is greater than or equal to
3425     the bytes spanned by the combination of the two accesses.  */
3426  *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3427  return true;
3428}
3429
3430/* Function vect_prune_runtime_alias_test_list.
3431
3432   Prune a list of ddrs to be tested at run-time by versioning for alias.
3433   Merge several alias checks into one if possible.
   Return FALSE if the resulting list of ddrs is longer than allowed by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
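
/* Each pair that survives the pruning below is later expanded by the
   generic runtime alias machinery into a guard that, conceptually, checks
   that the two byte segments do not overlap, roughly
     addr_a + len_a <= addr_b || addr_b + len_b <= addr_a,
   with the lengths taken from the dr_with_seg_len descriptors built
   here.  */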
3436
3437opt_result
3438vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3439{
3440  typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3441  hash_set <tree_pair_hash> compared_objects;
3442
3443  vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3444  vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3445    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3446  vec<vec_object_pair> &check_unequal_addrs
3447    = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3448  poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3449  tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3450
3451  ddr_p ddr;
3452  unsigned int i;
3453  tree length_factor;
3454
3455  DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3456
3457  /* Step values are irrelevant for aliasing if the number of vector
3458     iterations is equal to the number of scalar iterations (which can
3459     happen for fully-SLP loops).  */
3460  bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3461
3462  if (!vf_one_p)
3463    {
3464      /* Convert the checks for nonzero steps into bound tests.  */
3465      tree value;
3466      FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3467	vect_check_lower_bound (loop_vinfo, value, true, 1);
3468    }
3469
3470  if (may_alias_ddrs.is_empty ())
3471    return opt_result::success ();
3472
3473  comp_alias_ddrs.create (may_alias_ddrs.length ());
3474
3475  unsigned int loop_depth
3476    = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3477			  LOOP_VINFO_LOOP_NEST (loop_vinfo));
3478
3479  /* First, we collect all data ref pairs for aliasing checks.  */
3480  FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3481    {
3482      poly_uint64 lower_bound;
3483      tree segment_length_a, segment_length_b;
3484      unsigned HOST_WIDE_INT access_size_a, access_size_b;
3485      unsigned int align_a, align_b;
3486
3487      /* Ignore the alias if the VF we chose ended up being no greater
3488	 than the dependence distance.  */
3489      if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3490	continue;
3491
3492      if (DDR_OBJECT_A (ddr))
3493	{
3494	  vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3495	  if (!compared_objects.add (new_pair))
3496	    {
3497	      if (dump_enabled_p ())
3498		dump_printf_loc (MSG_NOTE, vect_location,
3499				 "checking that %T and %T"
3500				 " have different addresses\n",
3501				 new_pair.first, new_pair.second);
3502	      LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3503	    }
3504	  continue;
3505	}
3506
3507      dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3508      stmt_vec_info stmt_info_a = dr_info_a->stmt;
3509
3510      dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3511      stmt_vec_info stmt_info_b = dr_info_b->stmt;
3512
3513      bool preserves_scalar_order_p
3514	= vect_preserves_scalar_order_p (dr_info_a, dr_info_b);
3515      bool ignore_step_p
3516	  = (vf_one_p
3517	     && (preserves_scalar_order_p
3518		 || operand_equal_p (DR_STEP (dr_info_a->dr),
3519				     DR_STEP (dr_info_b->dr))));
3520
3521      /* Skip the pair if inter-iteration dependencies are irrelevant
3522	 and intra-iteration dependencies are guaranteed to be honored.  */
3523      if (ignore_step_p
3524	  && (preserves_scalar_order_p
3525	      || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3526						 &lower_bound)))
3527	{
3528	  if (dump_enabled_p ())
3529	    dump_printf_loc (MSG_NOTE, vect_location,
3530			     "no need for alias check between "
3531			     "%T and %T when VF is 1\n",
3532			     DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3533	  continue;
3534	}
3535
3536      /* See whether we can handle the alias using a bounds check on
3537	 the step, and whether that's likely to be the best approach.
3538	 (It might not be, for example, if the minimum step is much larger
3539	 than the number of bytes handled by one vector iteration.)  */
3540      if (!ignore_step_p
3541	  && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3542	  && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3543					     &lower_bound)
3544	  && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3545	      || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3546	{
3547	  bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3548	  if (dump_enabled_p ())
3549	    {
3550	      dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3551			       "%T and %T when the step %T is outside ",
3552			       DR_REF (dr_info_a->dr),
3553			       DR_REF (dr_info_b->dr),
3554			       DR_STEP (dr_info_a->dr));
3555	      if (unsigned_p)
3556		dump_printf (MSG_NOTE, "[0");
3557	      else
3558		{
3559		  dump_printf (MSG_NOTE, "(");
3560		  dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3561		}
3562	      dump_printf (MSG_NOTE, ", ");
3563	      dump_dec (MSG_NOTE, lower_bound);
3564	      dump_printf (MSG_NOTE, ")\n");
3565	    }
3566	  vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3567				  unsigned_p, lower_bound);
3568	  continue;
3569	}
3570
3571      stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3572      if (dr_group_first_a)
3573	{
3574	  stmt_info_a = dr_group_first_a;
3575	  dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3576	}
3577
3578      stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3579      if (dr_group_first_b)
3580	{
3581	  stmt_info_b = dr_group_first_b;
3582	  dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3583	}
3584
3585      if (ignore_step_p)
3586	{
3587	  segment_length_a = size_zero_node;
3588	  segment_length_b = size_zero_node;
3589	}
3590      else
3591	{
3592	  if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3593				DR_STEP (dr_info_b->dr), 0))
3594	    length_factor = scalar_loop_iters;
3595	  else
3596	    length_factor = size_int (vect_factor);
3597	  segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3598	  segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3599	}
3600      access_size_a = vect_vfa_access_size (dr_info_a);
3601      access_size_b = vect_vfa_access_size (dr_info_b);
3602      align_a = vect_vfa_align (dr_info_a);
3603      align_b = vect_vfa_align (dr_info_b);
3604
3605      /* See whether the alias is known at compilation time.  */
3606      if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr),
3607			   DR_BASE_ADDRESS (dr_info_b->dr), 0)
3608	  && operand_equal_p (DR_OFFSET (dr_info_a->dr),
3609			      DR_OFFSET (dr_info_b->dr), 0)
3610	  && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3611	  && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3612	  && poly_int_tree_p (segment_length_a)
3613	  && poly_int_tree_p (segment_length_b))
3614	{
3615	  int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3616					     segment_length_a,
3617					     segment_length_b,
3618					     access_size_a,
3619					     access_size_b);
3620	  if (res >= 0 && dump_enabled_p ())
3621	    {
3622	      dump_printf_loc (MSG_NOTE, vect_location,
3623			       "can tell at compile time that %T and %T",
3624			       DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3625	      if (res == 0)
3626		dump_printf (MSG_NOTE, " do not alias\n");
3627	      else
3628		dump_printf (MSG_NOTE, " alias\n");
3629	    }
3630
3631	  if (res == 0)
3632	    continue;
3633
3634	  if (res == 1)
3635	    return opt_result::failure_at (stmt_info_b->stmt,
3636					   "not vectorized:"
3637					   " compilation time alias: %G%G",
3638					   stmt_info_a->stmt,
3639					   stmt_info_b->stmt);
3640	}
3641
3642      dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a,
3643			    access_size_a, align_a);
3644      dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b,
3645			    access_size_b, align_b);
3646      /* Canonicalize the order to be the one that's needed for accurate
3647	 RAW, WAR and WAW flags, in cases where the data references are
3648	 well-ordered.  The order doesn't really matter otherwise,
3649	 but we might as well be consistent.  */
3650      if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a)
3651	std::swap (dr_a, dr_b);
3652
3653      dr_with_seg_len_pair_t dr_with_seg_len_pair
3654	(dr_a, dr_b, (preserves_scalar_order_p
3655		      ? dr_with_seg_len_pair_t::WELL_ORDERED
3656		      : dr_with_seg_len_pair_t::REORDERED));
3657
3658      comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3659    }
3660
3661  prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3662
3663  unsigned int count = (comp_alias_ddrs.length ()
3664			+ check_unequal_addrs.length ());
3665
3666  if (dump_enabled_p ())
3667    dump_printf_loc (MSG_NOTE, vect_location,
3668		     "improved number of alias checks from %d to %d\n",
3669		     may_alias_ddrs.length (), count);
3670  unsigned limit = param_vect_max_version_for_alias_checks;
3671  if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP)
3672    limit = param_vect_max_version_for_alias_checks * 6 / 10;
3673  if (count > limit)
3674    return opt_result::failure_at
3675      (vect_location,
3676       "number of versioning for alias run-time tests exceeds %d "
3677       "(--param vect-max-version-for-alias-checks)\n", limit);
3678
3679  return opt_result::success ();
3680}
3681
3682/* Check whether we can use an internal function for a gather load
3683   or scatter store.  READ_P is true for loads and false for stores.
3684   MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3685   the type of the memory elements being loaded or stored.  OFFSET_TYPE
3686   is the type of the offset that is being applied to the invariant
3687   base address.  SCALE is the amount by which the offset should
3688   be multiplied *after* it has been converted to address width.
3689
3690   Return true if the function is supported, storing the function id in
3691   *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
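
/* For example (illustrative only): if the target has no gather or scatter
   supporting a 16-bit offset for the given vector and memory types, the
   retry loop below doubles the offset precision (keeping its signedness)
   to 32 and then 64 bits, and gives up once the offset type is at least
   pointer-sized and at least as wide as the vector elements.  */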
3692
3693bool
3694vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
3695			  tree vectype, tree memory_type, tree offset_type,
3696			  int scale, internal_fn *ifn_out,
3697			  tree *offset_vectype_out)
3698{
3699  unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3700  unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3701  if (element_bits != memory_bits)
3702    /* For now the vector elements must be the same width as the
3703       memory elements.  */
3704    return false;
3705
3706  /* Work out which function we need.  */
3707  internal_fn ifn;
3708  if (read_p)
3709    ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3710  else
3711    ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3712
3713  for (;;)
3714    {
3715      tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type);
3716      if (!offset_vectype)
3717	return false;
3718
3719      /* Test whether the target supports this combination.  */
3720      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3721						  offset_vectype, scale))
3722	{
3723	  *ifn_out = ifn;
3724	  *offset_vectype_out = offset_vectype;
3725	  return true;
3726	}
3727
3728      if (TYPE_PRECISION (offset_type) >= POINTER_SIZE
3729	  && TYPE_PRECISION (offset_type) >= element_bits)
3730	return false;
3731
3732      offset_type = build_nonstandard_integer_type
3733	(TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type));
3734    }
3735}
3736
3737/* STMT_INFO is a call to an internal gather load or scatter store function.
3738   Describe the operation in INFO.  */
3739
3740static void
3741vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3742				   gather_scatter_info *info)
3743{
3744  gcall *call = as_a <gcall *> (stmt_info->stmt);
3745  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3746  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3747
3748  info->ifn = gimple_call_internal_fn (call);
3749  info->decl = NULL_TREE;
3750  info->base = gimple_call_arg (call, 0);
3751  info->offset = gimple_call_arg (call, 1);
3752  info->offset_dt = vect_unknown_def_type;
3753  info->offset_vectype = NULL_TREE;
3754  info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3755  info->element_type = TREE_TYPE (vectype);
3756  info->memory_type = TREE_TYPE (DR_REF (dr));
3757}
3758
3759/* Return true if a non-affine read or write in STMT_INFO is suitable for a
3760   gather load or scatter store.  Describe the operation in *INFO if so.  */
3761
3762bool
3763vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3764			   gather_scatter_info *info)
3765{
3766  HOST_WIDE_INT scale = 1;
3767  poly_int64 pbitpos, pbitsize;
3768  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3769  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3770  tree offtype = NULL_TREE;
3771  tree decl = NULL_TREE, base, off;
3772  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3773  tree memory_type = TREE_TYPE (DR_REF (dr));
3774  machine_mode pmode;
3775  int punsignedp, reversep, pvolatilep = 0;
3776  internal_fn ifn;
3777  tree offset_vectype;
3778  bool masked_p = false;
3779
3780  /* See whether this is already a call to a gather/scatter internal function.
3781     If not, see whether it's a masked load or store.  */
3782  gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3783  if (call && gimple_call_internal_p (call))
3784    {
3785      ifn = gimple_call_internal_fn (call);
3786      if (internal_gather_scatter_fn_p (ifn))
3787	{
3788	  vect_describe_gather_scatter_call (stmt_info, info);
3789	  return true;
3790	}
3791      masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3792    }
3793
3794  /* True if we should aim to use internal functions rather than
3795     built-in functions.  */
3796  bool use_ifn_p = (DR_IS_READ (dr)
3797		    ? supports_vec_gather_load_p ()
3798		    : supports_vec_scatter_store_p ());
3799
3800  base = DR_REF (dr);
3801  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3802     see if we can use the def stmt of the address.  */
3803  if (masked_p
3804      && TREE_CODE (base) == MEM_REF
3805      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3806      && integer_zerop (TREE_OPERAND (base, 1))
3807      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3808    {
3809      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3810      if (is_gimple_assign (def_stmt)
3811	  && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3812	base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3813    }
3814
3815  /* The gather and scatter builtins need address of the form
3816     loop_invariant + vector * {1, 2, 4, 8}
3817     or
3818     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3819     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3820     of loop invariants/SSA_NAMEs defined in the loop, with casts,
3821     multiplications and additions in it.  To get a vector, we need
3822     a single SSA_NAME that will be defined in the loop and will
3823     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
3825     SSA_NAME OFF and put the loop invariants into a tree BASE
3826     that can be gimplified before the loop.  */
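  /* As a sketch (illustrative source only, assuming 4-byte ints), for

	for (i = 0; i < n; ++i)
	  sum += p[idx[i] + 4];

     the multiplication by the element size becomes SCALE = 4, the
     loop-invariant "+ 4" is scaled and folded into BASE together with p,
     and the SSA_NAME holding idx[i] is what remains as OFF.  */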
3827  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3828			      &punsignedp, &reversep, &pvolatilep);
3829  if (reversep)
3830    return false;
3831
3832  poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3833
3834  if (TREE_CODE (base) == MEM_REF)
3835    {
3836      if (!integer_zerop (TREE_OPERAND (base, 1)))
3837	{
3838	  if (off == NULL_TREE)
3839	    off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3840	  else
3841	    off = size_binop (PLUS_EXPR, off,
3842			      fold_convert (sizetype, TREE_OPERAND (base, 1)));
3843	}
3844      base = TREE_OPERAND (base, 0);
3845    }
3846  else
3847    base = build_fold_addr_expr (base);
3848
3849  if (off == NULL_TREE)
3850    off = size_zero_node;
3851
  /* If base is not loop invariant, then if off is 0 we start with just the
     constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise we give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and using that as off, but for now punt.  */
3857  if (!expr_invariant_in_loop_p (loop, base))
3858    {
3859      if (!integer_zerop (off))
3860	return false;
3861      off = base;
3862      base = size_int (pbytepos);
3863    }
3864  /* Otherwise put base + constant offset into the loop invariant BASE
3865     and continue with OFF.  */
3866  else
3867    {
3868      base = fold_convert (sizetype, base);
3869      base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3870    }
3871
3872  /* OFF at this point may be either a SSA_NAME or some tree expression
3873     from get_inner_reference.  Try to peel off loop invariants from it
3874     into BASE as long as possible.  */
3875  STRIP_NOPS (off);
3876  while (offtype == NULL_TREE)
3877    {
3878      enum tree_code code;
3879      tree op0, op1, add = NULL_TREE;
3880
3881      if (TREE_CODE (off) == SSA_NAME)
3882	{
3883	  gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3884
3885	  if (expr_invariant_in_loop_p (loop, off))
3886	    return false;
3887
3888	  if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3889	    break;
3890
3891	  op0 = gimple_assign_rhs1 (def_stmt);
3892	  code = gimple_assign_rhs_code (def_stmt);
3893	  op1 = gimple_assign_rhs2 (def_stmt);
3894	}
3895      else
3896	{
3897	  if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3898	    return false;
3899	  code = TREE_CODE (off);
3900	  extract_ops_from_tree (off, &code, &op0, &op1);
3901	}
3902      switch (code)
3903	{
3904	case POINTER_PLUS_EXPR:
3905	case PLUS_EXPR:
3906	  if (expr_invariant_in_loop_p (loop, op0))
3907	    {
3908	      add = op0;
3909	      off = op1;
3910	    do_add:
3911	      add = fold_convert (sizetype, add);
3912	      if (scale != 1)
3913		add = size_binop (MULT_EXPR, add, size_int (scale));
3914	      base = size_binop (PLUS_EXPR, base, add);
3915	      continue;
3916	    }
3917	  if (expr_invariant_in_loop_p (loop, op1))
3918	    {
3919	      add = op1;
3920	      off = op0;
3921	      goto do_add;
3922	    }
3923	  break;
3924	case MINUS_EXPR:
3925	  if (expr_invariant_in_loop_p (loop, op1))
3926	    {
3927	      add = fold_convert (sizetype, op1);
3928	      add = size_binop (MINUS_EXPR, size_zero_node, add);
3929	      off = op0;
3930	      goto do_add;
3931	    }
3932	  break;
3933	case MULT_EXPR:
3934	  if (scale == 1 && tree_fits_shwi_p (op1))
3935	    {
3936	      int new_scale = tree_to_shwi (op1);
3937	      /* Only treat this as a scaling operation if the target
3938		 supports it for at least some offset type.  */
3939	      if (use_ifn_p
3940		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3941						masked_p, vectype, memory_type,
3942						signed_char_type_node,
3943						new_scale, &ifn,
3944						&offset_vectype)
3945		  && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3946						masked_p, vectype, memory_type,
3947						unsigned_char_type_node,
3948						new_scale, &ifn,
3949						&offset_vectype))
3950		break;
3951	      scale = new_scale;
3952	      off = op0;
3953	      continue;
3954	    }
3955	  break;
3956	case SSA_NAME:
3957	  off = op0;
3958	  continue;
3959	CASE_CONVERT:
3960	  if (!POINTER_TYPE_P (TREE_TYPE (op0))
3961	      && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3962	    break;
3963
3964	  /* Don't include the conversion if the target is happy with
3965	     the current offset type.  */
3966	  if (use_ifn_p
3967	      && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
3968					   masked_p, vectype, memory_type,
3969					   TREE_TYPE (off), scale, &ifn,
3970					   &offset_vectype))
3971	    break;
3972
3973	  if (TYPE_PRECISION (TREE_TYPE (op0))
3974	      == TYPE_PRECISION (TREE_TYPE (off)))
3975	    {
3976	      off = op0;
3977	      continue;
3978	    }
3979
3980	  if (TYPE_PRECISION (TREE_TYPE (op0))
3981	      < TYPE_PRECISION (TREE_TYPE (off)))
3982	    {
3983	      off = op0;
3984	      offtype = TREE_TYPE (off);
3985	      STRIP_NOPS (off);
3986	      continue;
3987	    }
3988	  break;
3989	default:
3990	  break;
3991	}
3992      break;
3993    }
3994
3995  /* If at the end OFF still isn't a SSA_NAME or isn't
3996     defined in the loop, punt.  */
3997  if (TREE_CODE (off) != SSA_NAME
3998      || expr_invariant_in_loop_p (loop, off))
3999    return false;
4000
4001  if (offtype == NULL_TREE)
4002    offtype = TREE_TYPE (off);
4003
4004  if (use_ifn_p)
4005    {
4006      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
4007				     vectype, memory_type, offtype, scale,
4008				     &ifn, &offset_vectype))
4009	return false;
4010    }
4011  else
4012    {
4013      if (DR_IS_READ (dr))
4014	{
4015	  if (targetm.vectorize.builtin_gather)
4016	    decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
4017	}
4018      else
4019	{
4020	  if (targetm.vectorize.builtin_scatter)
4021	    decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
4022	}
4023
4024      if (!decl)
4025	return false;
4026
4027      ifn = IFN_LAST;
4028      /* The offset vector type will be read from DECL when needed.  */
4029      offset_vectype = NULL_TREE;
4030    }
4031
4032  info->ifn = ifn;
4033  info->decl = decl;
4034  info->base = base;
4035  info->offset = off;
4036  info->offset_dt = vect_unknown_def_type;
4037  info->offset_vectype = offset_vectype;
4038  info->scale = scale;
4039  info->element_type = TREE_TYPE (vectype);
4040  info->memory_type = memory_type;
4041  return true;
4042}
4043
4044/* Find the data references in STMT, analyze them with respect to LOOP and
4045   append them to DATAREFS.  Return false if datarefs in this stmt cannot
4046   be handled.  */
4047
4048opt_result
4049vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4050			       vec<data_reference_p> *datarefs)
4051{
4052  /* We can ignore clobbers for dataref analysis - they are removed during
4053     loop vectorization and BB vectorization checks dependences with a
4054     stmt walk.  */
4055  if (gimple_clobber_p (stmt))
4056    return opt_result::success ();
4057
4058  if (gimple_has_volatile_ops (stmt))
4059    return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4060				   stmt);
4061
4062  if (stmt_can_throw_internal (cfun, stmt))
4063    return opt_result::failure_at (stmt,
4064				   "not vectorized:"
4065				   " statement can throw an exception: %G",
4066				   stmt);
4067
4068  auto_vec<data_reference_p, 2> refs;
4069  opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4070  if (!res)
4071    return res;
4072
4073  if (refs.is_empty ())
4074    return opt_result::success ();
4075
4076  if (refs.length () > 1)
4077    return opt_result::failure_at (stmt,
4078				   "not vectorized:"
4079				   " more than one data ref in stmt: %G", stmt);
4080
4081  if (gcall *call = dyn_cast <gcall *> (stmt))
4082    if (!gimple_call_internal_p (call)
4083	|| (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4084	    && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4085      return opt_result::failure_at (stmt,
4086				     "not vectorized: dr in a call %G", stmt);
4087
4088  data_reference_p dr = refs.pop ();
4089  if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4090      && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4091    return opt_result::failure_at (stmt,
4092				   "not vectorized:"
4093				   " statement is bitfield access %G", stmt);
4094
4095  if (DR_BASE_ADDRESS (dr)
4096      && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4097    return opt_result::failure_at (stmt,
4098				   "not vectorized:"
4099				   " base addr of dr is a constant\n");
4100
4101  /* Check whether this may be a SIMD lane access and adjust the
4102     DR to make it easier for us to handle it.  */
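  /* For instance (illustrative only): a private variable in an OpenMP simd
     loop is commonly turned into an array indexed by the lane, giving
     accesses such as a[_l] with _l = .GOMP_SIMD_LANE (simduid); the code
     below rewrites such a DR into a simple access with a step of one
     element and marks it as a SIMD lane access.  */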
4103  if (loop
4104      && loop->simduid
4105      && (!DR_BASE_ADDRESS (dr)
4106	  || !DR_OFFSET (dr)
4107	  || !DR_INIT (dr)
4108	  || !DR_STEP (dr)))
4109    {
4110      struct data_reference *newdr
4111	= create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4112			   DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4113      if (DR_BASE_ADDRESS (newdr)
4114	  && DR_OFFSET (newdr)
4115	  && DR_INIT (newdr)
4116	  && DR_STEP (newdr)
4117	  && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4118	  && integer_zerop (DR_STEP (newdr)))
4119	{
4120	  tree base_address = DR_BASE_ADDRESS (newdr);
4121	  tree off = DR_OFFSET (newdr);
4122	  tree step = ssize_int (1);
4123	  if (integer_zerop (off)
4124	      && TREE_CODE (base_address) == POINTER_PLUS_EXPR)
4125	    {
4126	      off = TREE_OPERAND (base_address, 1);
4127	      base_address = TREE_OPERAND (base_address, 0);
4128	    }
4129	  STRIP_NOPS (off);
4130	  if (TREE_CODE (off) == MULT_EXPR
4131	      && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4132	    {
4133	      step = TREE_OPERAND (off, 1);
4134	      off = TREE_OPERAND (off, 0);
4135	      STRIP_NOPS (off);
4136	    }
4137	  if (CONVERT_EXPR_P (off)
4138	      && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4139		  < TYPE_PRECISION (TREE_TYPE (off))))
4140	    off = TREE_OPERAND (off, 0);
4141	  if (TREE_CODE (off) == SSA_NAME)
4142	    {
4143	      gimple *def = SSA_NAME_DEF_STMT (off);
4144	      /* Look through widening conversion.  */
4145	      if (is_gimple_assign (def)
4146		  && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def)))
4147		{
4148		  tree rhs1 = gimple_assign_rhs1 (def);
4149		  if (TREE_CODE (rhs1) == SSA_NAME
4150		      && INTEGRAL_TYPE_P (TREE_TYPE (rhs1))
4151		      && (TYPE_PRECISION (TREE_TYPE (off))
4152			  > TYPE_PRECISION (TREE_TYPE (rhs1))))
4153		    def = SSA_NAME_DEF_STMT (rhs1);
4154		}
4155	      if (is_gimple_call (def)
4156		  && gimple_call_internal_p (def)
4157		  && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4158		{
4159		  tree arg = gimple_call_arg (def, 0);
4160		  tree reft = TREE_TYPE (DR_REF (newdr));
4161		  gcc_assert (TREE_CODE (arg) == SSA_NAME);
4162		  arg = SSA_NAME_VAR (arg);
4163		  if (arg == loop->simduid
4164		      /* For now.  */
4165		      && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4166		    {
4167		      DR_BASE_ADDRESS (newdr) = base_address;
4168		      DR_OFFSET (newdr) = ssize_int (0);
4169		      DR_STEP (newdr) = step;
4170		      DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4171		      DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step);
4172		      /* Mark as simd-lane access.  */
4173		      tree arg2 = gimple_call_arg (def, 1);
4174		      newdr->aux = (void *) (-1 - tree_to_uhwi (arg2));
4175		      free_data_ref (dr);
4176		      datarefs->safe_push (newdr);
4177		      return opt_result::success ();
4178		    }
4179		}
4180	    }
4181	}
4182      free_data_ref (newdr);
4183    }
4184
4185  datarefs->safe_push (dr);
4186  return opt_result::success ();
4187}
4188
4189/* Function vect_analyze_data_refs.
4190
   Find all the data references in the loop or basic block.
4192
4193   The general structure of the analysis of data refs in the vectorizer is as
4194   follows:
4195   1- vect_analyze_data_refs(loop/bb): call
4196      compute_data_dependences_for_loop/bb to find and analyze all data-refs
4197      in the loop/bb and their dependences.
   2- vect_analyze_data_ref_dependences (): apply dependence testing using
      the ddrs.
   3- vect_analyze_data_refs_alignment (): check that ref_stmt.alignment
      is ok.
   4- vect_analyze_data_ref_accesses (): check that ref_stmt.step is ok.  */
4203
4204opt_result
4205vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal)
4206{
4207  class loop *loop = NULL;
4208  unsigned int i;
4209  struct data_reference *dr;
4210  tree scalar_type;
4211
4212  DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4213
4214  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4215    loop = LOOP_VINFO_LOOP (loop_vinfo);
4216
4217  /* Go through the data-refs, check that the analysis succeeded.  Update
4218     pointer from stmt_vec_info struct to DR and vectype.  */
4219
4220  vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4221  FOR_EACH_VEC_ELT (datarefs, i, dr)
4222    {
4223      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4224      poly_uint64 vf;
4225
4226      gcc_assert (DR_REF (dr));
4227      stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4228      gcc_assert (!stmt_info->dr_aux.dr);
4229      stmt_info->dr_aux.dr = dr;
4230      stmt_info->dr_aux.stmt = stmt_info;
4231
4232      /* Check that analysis of the data-ref succeeded.  */
4233      if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4234	  || !DR_STEP (dr))
4235        {
4236	  bool maybe_gather
4237	    = DR_IS_READ (dr)
4238	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4239	      && (targetm.vectorize.builtin_gather != NULL
4240		  || supports_vec_gather_load_p ());
4241	  bool maybe_scatter
4242	    = DR_IS_WRITE (dr)
4243	      && !TREE_THIS_VOLATILE (DR_REF (dr))
4244	      && (targetm.vectorize.builtin_scatter != NULL
4245		  || supports_vec_scatter_store_p ());
4246
	  /* If the target supports vector gather loads or scatter stores,
	     see whether they can be used.  */
4249	  if (is_a <loop_vec_info> (vinfo)
4250	      && !nested_in_vect_loop_p (loop, stmt_info))
4251	    {
4252	      if (maybe_gather || maybe_scatter)
4253		{
4254		  if (maybe_gather)
4255		    gatherscatter = GATHER;
4256		  else
4257		    gatherscatter = SCATTER;
4258		}
4259	    }
4260
4261	  if (gatherscatter == SG_NONE)
4262	    {
4263	      if (dump_enabled_p ())
4264		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4265				 "not vectorized: data ref analysis "
4266				 "failed %G", stmt_info->stmt);
4267	      if (is_a <bb_vec_info> (vinfo))
4268		{
4269		  /* In BB vectorization the ref can still participate
4270		     in dependence analysis, we just can't vectorize it.  */
4271		  STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4272		  continue;
4273		}
4274	      return opt_result::failure_at (stmt_info->stmt,
4275					     "not vectorized:"
4276					     " data ref analysis failed: %G",
4277					     stmt_info->stmt);
4278	    }
4279        }
4280
4281      /* See if this was detected as SIMD lane access.  */
4282      if (dr->aux == (void *)-1
4283	  || dr->aux == (void *)-2
4284	  || dr->aux == (void *)-3
4285	  || dr->aux == (void *)-4)
4286	{
4287	  if (nested_in_vect_loop_p (loop, stmt_info))
4288	    return opt_result::failure_at (stmt_info->stmt,
4289					   "not vectorized:"
4290					   " data ref analysis failed: %G",
4291					   stmt_info->stmt);
4292	  STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info)
4293	    = -(uintptr_t) dr->aux;
4294	}
4295
4296      tree base = get_base_address (DR_REF (dr));
4297      if (base && VAR_P (base) && DECL_NONALIASED (base))
4298	{
4299          if (dump_enabled_p ())
4300	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4301			     "not vectorized: base object not addressable "
4302			     "for stmt: %G", stmt_info->stmt);
4303          if (is_a <bb_vec_info> (vinfo))
4304	    {
4305	      /* In BB vectorization the ref can still participate
4306	         in dependence analysis, we just can't vectorize it.  */
4307	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4308	      continue;
4309	    }
4310	  return opt_result::failure_at (stmt_info->stmt,
4311					 "not vectorized: base object not"
4312					 " addressable for stmt: %G",
4313					 stmt_info->stmt);
4314	}
4315
4316      if (is_a <loop_vec_info> (vinfo)
4317	  && DR_STEP (dr)
4318	  && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4319	{
4320	  if (nested_in_vect_loop_p (loop, stmt_info))
4321	    return opt_result::failure_at (stmt_info->stmt,
4322					   "not vectorized: "
4323					   "not suitable for strided load %G",
4324					   stmt_info->stmt);
4325	  STMT_VINFO_STRIDED_P (stmt_info) = true;
4326	}
4327
4328      /* Update DR field in stmt_vec_info struct.  */
4329
      /* If the dataref is in an inner-loop of the loop that is considered
	 for vectorization, we also want to analyze the access relative to
4332	 the outer-loop (DR contains information only relative to the
4333	 inner-most enclosing loop).  We do that by building a reference to the
4334	 first location accessed by the inner-loop, and analyze it relative to
4335	 the outer-loop.  */
4336      if (loop && nested_in_vect_loop_p (loop, stmt_info))
4337	{
4338	  /* Build a reference to the first location accessed by the
4339	     inner loop: *(BASE + INIT + OFFSET).  By construction,
4340	     this address must be invariant in the inner loop, so we
4341	     can consider it as being used in the outer loop.  */
4342	  tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4343	  tree offset = unshare_expr (DR_OFFSET (dr));
4344	  tree init = unshare_expr (DR_INIT (dr));
4345	  tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4346					  init, offset);
4347	  tree init_addr = fold_build_pointer_plus (base, init_offset);
4348	  tree init_ref = build_fold_indirect_ref (init_addr);
4349
4350	  if (dump_enabled_p ())
4351	    dump_printf_loc (MSG_NOTE, vect_location,
4352			     "analyze in outer loop: %T\n", init_ref);
4353
4354	  opt_result res
4355	    = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4356				    init_ref, loop, stmt_info->stmt);
4357	  if (!res)
4358	    /* dr_analyze_innermost already explained the failure.  */
4359	    return res;
4360
4361          if (dump_enabled_p ())
4362	    dump_printf_loc (MSG_NOTE, vect_location,
4363			     "\touter base_address: %T\n"
4364			     "\touter offset from base address: %T\n"
4365			     "\touter constant offset from base address: %T\n"
4366			     "\touter step: %T\n"
4367			     "\touter base alignment: %d\n\n"
4368			     "\touter base misalignment: %d\n"
4369			     "\touter offset alignment: %d\n"
4370			     "\touter step alignment: %d\n",
4371			     STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4372			     STMT_VINFO_DR_OFFSET (stmt_info),
4373			     STMT_VINFO_DR_INIT (stmt_info),
4374			     STMT_VINFO_DR_STEP (stmt_info),
4375			     STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4376			     STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4377			     STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4378			     STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4379	}
4380
4381      /* Set vectype for STMT.  */
4382      scalar_type = TREE_TYPE (DR_REF (dr));
4383      tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
4384      if (!vectype)
4385        {
4386          if (dump_enabled_p ())
4387            {
4388              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4389                               "not vectorized: no vectype for stmt: %G",
4390			       stmt_info->stmt);
4391              dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4392              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4393                                 scalar_type);
4394              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4395            }
4396
4397          if (is_a <bb_vec_info> (vinfo))
4398	    {
4399	      /* No vector type is fine, the ref can still participate
4400	         in dependence analysis, we just can't vectorize it.  */
4401	      STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4402	      continue;
4403	    }
4404	  if (fatal)
4405	    *fatal = false;
4406	  return opt_result::failure_at (stmt_info->stmt,
4407					 "not vectorized:"
4408					 " no vectype for stmt: %G"
4409					 " scalar_type: %T\n",
4410					 stmt_info->stmt, scalar_type);
4411        }
4412      else
4413	{
4414	  if (dump_enabled_p ())
4415	    dump_printf_loc (MSG_NOTE, vect_location,
4416			     "got vectype for stmt: %G%T\n",
4417			     stmt_info->stmt, vectype);
4418	}
4419
4420      /* Adjust the minimal vectorization factor according to the
4421	 vector type.  */
4422      vf = TYPE_VECTOR_SUBPARTS (vectype);
4423      *min_vf = upper_bound (*min_vf, vf);
4424
4425      /* Leave the BB vectorizer to pick the vector type later, based on
4426	 the final dataref group size and SLP node size.  */
4427      if (is_a <loop_vec_info> (vinfo))
4428	STMT_VINFO_VECTYPE (stmt_info) = vectype;
4429
4430      if (gatherscatter != SG_NONE)
4431	{
4432	  gather_scatter_info gs_info;
4433	  if (!vect_check_gather_scatter (stmt_info,
4434					  as_a <loop_vec_info> (vinfo),
4435					  &gs_info)
4436	      || !get_vectype_for_scalar_type (vinfo,
4437					       TREE_TYPE (gs_info.offset)))
4438	    {
4439	      if (fatal)
4440		*fatal = false;
4441	      return opt_result::failure_at
4442			(stmt_info->stmt,
4443			 (gatherscatter == GATHER)
4444			 ? "not vectorized: not suitable for gather load %G"
4445			 : "not vectorized: not suitable for scatter store %G",
4446			 stmt_info->stmt);
4447	    }
4448	  STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4449	}
4450    }
4451
4452  /* We used to stop processing and prune the list here.  Verify we no
4453     longer need to.  */
4454  gcc_assert (i == datarefs.length ());
4455
4456  return opt_result::success ();
4457}
4458
4459
4460/* Function vect_get_new_vect_var.
4461
   Returns a name for a new variable.  The current naming scheme uses a
   prefix of "vect", "stmp", "mask" or "vectp" (depending on the value of
   VAR_KIND) for vectorizer generated variables, appending NAME to the
   prefix if it is provided.  */
4466
4467tree
4468vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4469{
4470  const char *prefix;
4471  tree new_vect_var;
4472
4473  switch (var_kind)
4474  {
4475  case vect_simple_var:
4476    prefix = "vect";
4477    break;
4478  case vect_scalar_var:
4479    prefix = "stmp";
4480    break;
4481  case vect_mask_var:
4482    prefix = "mask";
4483    break;
4484  case vect_pointer_var:
4485    prefix = "vectp";
4486    break;
4487  default:
4488    gcc_unreachable ();
4489  }
4490
4491  if (name)
4492    {
4493      char* tmp = concat (prefix, "_", name, NULL);
4494      new_vect_var = create_tmp_reg (type, tmp);
4495      free (tmp);
4496    }
4497  else
4498    new_vect_var = create_tmp_reg (type, prefix);
4499
4500  return new_vect_var;
4501}
4502
4503/* Like vect_get_new_vect_var but return an SSA name.  */
4504
4505tree
4506vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4507{
4508  const char *prefix;
4509  tree new_vect_var;
4510
4511  switch (var_kind)
4512  {
4513  case vect_simple_var:
4514    prefix = "vect";
4515    break;
4516  case vect_scalar_var:
4517    prefix = "stmp";
4518    break;
4519  case vect_pointer_var:
4520    prefix = "vectp";
4521    break;
4522  default:
4523    gcc_unreachable ();
4524  }
4525
4526  if (name)
4527    {
4528      char* tmp = concat (prefix, "_", name, NULL);
4529      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4530      free (tmp);
4531    }
4532  else
4533    new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4534
4535  return new_vect_var;
4536}
4537
/* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO.  */
4539
4540static void
4541vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4542{
4543  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4544  int misalign = DR_MISALIGNMENT (dr_info);
4545  if (misalign == DR_MISALIGNMENT_UNKNOWN)
4546    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4547  else
4548    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4549			    known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4550			    misalign);
4551}
4552
4553/* Function vect_create_addr_base_for_vector_ref.
4554
4555   Create an expression that computes the address of the first memory location
4556   that will be accessed for a data reference.
4557
4558   Input:
4559   STMT_INFO: The statement containing the data reference.
4560   NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
   OFFSET: Optional.  If supplied, it is added to the initial address.
4562   LOOP:    Specify relative to which loop-nest should the address be computed.
4563            For example, when the dataref is in an inner-loop nested in an
4564	    outer-loop that is now being vectorized, LOOP can be either the
4565	    outer-loop, or the inner-loop.  The first memory location accessed
4566	    by the following dataref ('in' points to short):
4567
4568		for (i=0; i<N; i++)
4569		   for (j=0; j<M; j++)
4570		     s += in[i+j]
4571
4572	    is as follows:
4573	    if LOOP=i_loop:	&in		(relative to i_loop)
4574	    if LOOP=j_loop: 	&in+i*2B	(relative to j_loop)
4575   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
	    initial address.  Unlike OFFSET, which is the number of elements to
4577	    be added, BYTE_OFFSET is measured in bytes.
4578
4579   Output:
4580   1. Return an SSA_NAME whose value is the address of the memory location of
4581      the first vector of the data reference.
   2. If NEW_STMT_LIST is not NULL_TREE after return, then the caller must
      insert these statement(s), which define the returned SSA_NAME.
4584
4585   FORNOW: We are only handling array accesses with step 1.  */
4586
4587tree
4588vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4589				      gimple_seq *new_stmt_list,
4590				      tree offset,
4591				      tree byte_offset)
4592{
4593  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4594  struct data_reference *dr = dr_info->dr;
4595  const char *base_name;
4596  tree addr_base;
4597  tree dest;
4598  gimple_seq seq = NULL;
4599  tree vect_ptr_type;
4600  tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4601  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4602  innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4603
4604  tree data_ref_base = unshare_expr (drb->base_address);
4605  tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true));
4606  tree init = unshare_expr (drb->init);
4607
4608  if (loop_vinfo)
4609    base_name = get_name (data_ref_base);
4610  else
4611    {
4612      base_offset = ssize_int (0);
4613      init = ssize_int (0);
4614      base_name = get_name (DR_REF (dr));
4615    }
4616
4617  /* Create base_offset */
4618  base_offset = size_binop (PLUS_EXPR,
4619			    fold_convert (sizetype, base_offset),
4620			    fold_convert (sizetype, init));
4621
4622  if (offset)
4623    {
4624      offset = fold_build2 (MULT_EXPR, sizetype,
4625			    fold_convert (sizetype, offset), step);
4626      base_offset = fold_build2 (PLUS_EXPR, sizetype,
4627				 base_offset, offset);
4628    }
4629  if (byte_offset)
4630    {
4631      byte_offset = fold_convert (sizetype, byte_offset);
4632      base_offset = fold_build2 (PLUS_EXPR, sizetype,
4633				 base_offset, byte_offset);
4634    }
4635
4636  /* base + base_offset */
4637  if (loop_vinfo)
4638    addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4639  else
4640    {
4641      addr_base = build1 (ADDR_EXPR,
4642			  build_pointer_type (TREE_TYPE (DR_REF (dr))),
4643			  unshare_expr (DR_REF (dr)));
4644    }
4645
4646  vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4647  dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4648  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4649  gimple_seq_add_seq (new_stmt_list, seq);
4650
4651  if (DR_PTR_INFO (dr)
4652      && TREE_CODE (addr_base) == SSA_NAME
4653      /* We should only duplicate pointer info to newly created SSA names.  */
4654      && SSA_NAME_VAR (addr_base) == dest)
4655    {
4656      vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4657      if (offset || byte_offset)
4658	mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4659    }
4660
4661  if (dump_enabled_p ())
4662    dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4663
4664  return addr_base;
4665}
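
/* Illustrative sketch (not part of GCC; compiled out): a plain-C model of
   the address assembled above, using hypothetical scalar parameters.  The
   generated expression is essentially
     base_address + (base_offset + init + OFFSET * step + BYTE_OFFSET)
   with OFFSET counted in elements and BYTE_OFFSET in bytes.  */
#if 0
#include <stddef.h>

static char *
addr_base_model (char *base_address, size_t base_offset, size_t init,
		 size_t elem_size, size_t offset_in_elems, size_t byte_offset)
{
  /* OFFSET is scaled by the element size (STEP above);
     BYTE_OFFSET is already in bytes.  */
  size_t total = base_offset + init
		 + offset_in_elems * elem_size + byte_offset;
  return base_address + total;
}
#endif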
4666
4667
4668/* Function vect_create_data_ref_ptr.
4669
4670   Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4671   location accessed in the loop by STMT_INFO, along with the def-use update
4672   chain to appropriately advance the pointer through the loop iterations.
4673   Also set aliasing information for the pointer.  This pointer is used by
4674   the callers to this function to create a memory reference expression for
4675   vector load/store access.
4676
4677   Input:
4678   1. STMT_INFO: a stmt that references memory. Expected to be of the form
4679         GIMPLE_ASSIGN <name, data-ref> or
4680	 GIMPLE_ASSIGN <data-ref, name>.
4681   2. AGGR_TYPE: the type of the reference, which should be either a vector
4682        or an array.
4683   3. AT_LOOP: the loop where the vector memref is to be created.
4684   4. OFFSET (optional): an offset to be added to the initial address accessed
4685	by the data-ref in STMT_INFO.
4686   5. BSI: location where the new stmts are to be placed if there is no loop.
4687   6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4688        pointing to the initial address.
4689   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4690	to the initial address accessed by the data-ref in STMT_INFO.  This is
4691	similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4692	in bytes.
4693   8. IV_STEP (optional, defaults to NULL): the amount that should be added
4694	to the IV during each iteration of the loop.  NULL says to move
4695	by one copy of AGGR_TYPE up or down, depending on the step of the
4696	data reference.
4697
4698   Output:
4699   1. Declare a new ptr to vector_type, and have it point to the base of the
4700      data reference (initial address accessed by the data reference).
4701      For example, for vector of type V8HI, the following code is generated:
4702
4703      v8hi *ap;
4704      ap = (v8hi *)initial_address;
4705
4706      if OFFSET is not supplied:
4707         initial_address = &a[init];
4708      if OFFSET is supplied:
4709         initial_address = &a[init + OFFSET];
4710      if BYTE_OFFSET is supplied:
4711	 initial_address = &a[init] + BYTE_OFFSET;
4712
4713      Return the initial_address in INITIAL_ADDRESS.
4714
4715   2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4716      update the pointer in each iteration of the loop.
4717
4718      Return the increment stmt that updates the pointer in PTR_INCR.
4719
4720   3. Return the pointer.  */
4721
4722tree
4723vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4724			  class loop *at_loop, tree offset,
4725			  tree *initial_address, gimple_stmt_iterator *gsi,
4726			  gimple **ptr_incr, bool only_init,
4727			  tree byte_offset, tree iv_step)
4728{
4729  const char *base_name;
4730  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4731  class loop *loop = NULL;
4732  bool nested_in_vect_loop = false;
4733  class loop *containing_loop = NULL;
4734  tree aggr_ptr_type;
4735  tree aggr_ptr;
4736  tree new_temp;
4737  gimple_seq new_stmt_list = NULL;
4738  edge pe = NULL;
4739  basic_block new_bb;
4740  tree aggr_ptr_init;
4741  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4742  struct data_reference *dr = dr_info->dr;
4743  tree aptr;
4744  gimple_stmt_iterator incr_gsi;
4745  bool insert_after;
4746  tree indx_before_incr, indx_after_incr;
4747  gimple *incr;
4748  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4749
4750  gcc_assert (iv_step != NULL_TREE
4751	      || TREE_CODE (aggr_type) == ARRAY_TYPE
4752	      || TREE_CODE (aggr_type) == VECTOR_TYPE);
4753
4754  if (loop_vinfo)
4755    {
4756      loop = LOOP_VINFO_LOOP (loop_vinfo);
4757      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4758      containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4759      pe = loop_preheader_edge (loop);
4760    }
4761  else
4762    {
4763      gcc_assert (bb_vinfo);
4764      only_init = true;
4765      *ptr_incr = NULL;
4766    }
4767
4768  /* Create an expression for the first address accessed by this load
4769     in LOOP.  */
4770  base_name = get_name (DR_BASE_ADDRESS (dr));
4771
4772  if (dump_enabled_p ())
4773    {
4774      tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4775      dump_printf_loc (MSG_NOTE, vect_location,
4776                       "create %s-pointer variable to type: %T",
4777		       get_tree_code_name (TREE_CODE (aggr_type)),
4778		       aggr_type);
4779      if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4780        dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4781      else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4782        dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4783      else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4784        dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4785      else
4786        dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4787      dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4788    }
4789
4790  /* (1) Create the new aggregate-pointer variable.
4791     Vector and array types inherit the alias set of their component
4792     type by default so we need to use a ref-all pointer if the data
4793     reference does not conflict with the created aggregated data
4794     reference because it is not addressable.  */
4795  bool need_ref_all = false;
4796  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4797			      get_alias_set (DR_REF (dr))))
4798    need_ref_all = true;
4799  /* Likewise for any of the data references in the stmt group.  */
4800  else if (DR_GROUP_SIZE (stmt_info) > 1)
4801    {
4802      stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4803      do
4804	{
4805	  struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4806	  if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4807				      get_alias_set (DR_REF (sdr))))
4808	    {
4809	      need_ref_all = true;
4810	      break;
4811	    }
4812	  sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4813	}
4814      while (sinfo);
4815    }
4816  aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4817					       need_ref_all);
4818  aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4819
4820
4821  /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4822     vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4823     def-use update cycles for the pointer: one relative to the outer-loop
4824     (LOOP), which is what steps (3) and (4) below do.  The other is relative
4825     to the inner-loop (which is the inner-most loop containing the dataref),
4826     and this is done by step (5) below.
4827
4828     When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4829     inner-most loop, and so steps (3),(4) work the same, and step (5) is
4830     redundant.  Steps (3),(4) create the following:
4831
4832	vp0 = &base_addr;
4833	LOOP:	vp1 = phi(vp0,vp2)
4834		...
4835		...
4836		vp2 = vp1 + step
4837		goto LOOP
4838
4839     If there is an inner-loop nested in loop, then step (5) will also be
4840     applied, and an additional update in the inner-loop will be created:
4841
4842	vp0 = &base_addr;
4843	LOOP:   vp1 = phi(vp0,vp2)
4844		...
4845        inner:     vp3 = phi(vp1,vp4)
4846	           vp4 = vp3 + inner_step
4847	           if () goto inner
4848		...
4849		vp2 = vp1 + step
4850		if () goto LOOP   */
4851
4852  /* (2) Calculate the initial address of the aggregate-pointer, and set
4853     the aggregate-pointer to point to it before the loop.  */
4854
4855  /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader.  */
4856
4857  new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4858						   offset, byte_offset);
4859  if (new_stmt_list)
4860    {
4861      if (pe)
4862        {
4863          new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4864          gcc_assert (!new_bb);
4865        }
4866      else
4867        gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4868    }
4869
4870  *initial_address = new_temp;
4871  aggr_ptr_init = new_temp;
4872
4873  /* (3) Handle the updating of the aggregate-pointer inside the loop.
4874     This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4875     inner-loop nested in LOOP (during outer-loop vectorization).  */
4876
4877  /* No update in loop is required.  */
4878  if (only_init && (!loop_vinfo || at_loop == loop))
4879    aptr = aggr_ptr_init;
4880  else
4881    {
4882      /* Accesses to invariant addresses should be handled specially
4883	 by the caller.  */
4884      tree step = vect_dr_behavior (dr_info)->step;
4885      gcc_assert (!integer_zerop (step));
4886
4887      if (iv_step == NULL_TREE)
4888	{
4889	  /* The step of the aggregate pointer is the type size,
4890	     negated for downward accesses.  */
4891	  iv_step = TYPE_SIZE_UNIT (aggr_type);
4892	  if (tree_int_cst_sgn (step) == -1)
4893	    iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4894	}
4895
4896      standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4897
4898      create_iv (aggr_ptr_init,
4899		 fold_convert (aggr_ptr_type, iv_step),
4900		 aggr_ptr, loop, &incr_gsi, insert_after,
4901		 &indx_before_incr, &indx_after_incr);
4902      incr = gsi_stmt (incr_gsi);
4903      loop_vinfo->add_stmt (incr);
4904
4905      /* Copy the points-to information if it exists. */
4906      if (DR_PTR_INFO (dr))
4907	{
4908	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4909	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4910	}
4911      if (ptr_incr)
4912	*ptr_incr = incr;
4913
4914      aptr = indx_before_incr;
4915    }
4916
4917  if (!nested_in_vect_loop || only_init)
4918    return aptr;
4919
4920
4921  /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4922     nested in LOOP, if it exists.  */
4923
4924  gcc_assert (nested_in_vect_loop);
4925  if (!only_init)
4926    {
4927      standard_iv_increment_position (containing_loop, &incr_gsi,
4928				      &insert_after);
4929      create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4930		 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4931		 &indx_after_incr);
4932      incr = gsi_stmt (incr_gsi);
4933      loop_vinfo->add_stmt (incr);
4934
4935      /* Copy the points-to information if it exists. */
4936      if (DR_PTR_INFO (dr))
4937	{
4938	  vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4939	  vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4940	}
4941      if (ptr_incr)
4942	*ptr_incr = incr;
4943
4944      return indx_before_incr;
4945    }
4946  else
4947    gcc_unreachable ();
4948}
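
/* Illustrative sketch (not part of GCC; compiled out): the scalar shape of
   the def-use cycle created above for the inner-most-loop case, assuming a
   hypothetical 16-byte aggregate type and an upward-stepping data reference;
   IV_STEP defaults to one aggregate per iteration.  */
#if 0
#include <string.h>

static void
advance_by_one_aggregate (char *dst, const char *base_addr, int niters)
{
  const char *vp = base_addr;		/* vp0 = &base_addr  */
  for (int i = 0; i < niters; i++)
    {					/* vp1 = phi (vp0, vp2)  */
      memcpy (dst + i * 16, vp, 16);	/* vector access through vp1  */
      vp = vp + 16;			/* vp2 = vp1 + step  */
    }
}
#endif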
4949
4950
4951/* Function bump_vector_ptr
4952
4953   Increment a pointer (to a vector type) by vector-size. If requested,
4954   i.e. if PTR-INCR is given, then also connect the new increment stmt
4955   to the existing def-use update-chain of the pointer, by modifying
4956   the PTR_INCR as illustrated below:
4957
4958   The pointer def-use update-chain before this function:
4959                        DATAREF_PTR = phi (p_0, p_2)
4960                        ....
4961        PTR_INCR:       p_2 = DATAREF_PTR + step
4962
4963   The pointer def-use update-chain after this function:
4964                        DATAREF_PTR = phi (p_0, p_2)
4965                        ....
4966                        NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4967                        ....
4968        PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4969
4970   Input:
4971   DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4972                 in the loop.
4973   PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4974	      the loop.  The increment amount across iterations is expected
4975	      to be vector_size.
4976   BSI - location where the new update stmt is to be placed.
4977   STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4978   BUMP - optional. The offset by which to bump the pointer. If not given,
4979	  the offset is assumed to be vector_size.
4980
4981   Output: Return NEW_DATAREF_PTR as illustrated above.
4982
4983*/
4984
4985tree
4986bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4987		 stmt_vec_info stmt_info, tree bump)
4988{
4989  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4990  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4991  tree update = TYPE_SIZE_UNIT (vectype);
4992  gassign *incr_stmt;
4993  ssa_op_iter iter;
4994  use_operand_p use_p;
4995  tree new_dataref_ptr;
4996
4997  if (bump)
4998    update = bump;
4999
5000  if (TREE_CODE (dataref_ptr) == SSA_NAME)
5001    new_dataref_ptr = copy_ssa_name (dataref_ptr);
5002  else
5003    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
5004  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
5005				   dataref_ptr, update);
5006  vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
5007
5008  /* Copy the points-to information if it exists. */
5009  if (DR_PTR_INFO (dr))
5010    {
5011      duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
5012      mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
5013    }
5014
5015  if (!ptr_incr)
5016    return new_dataref_ptr;
5017
5018  /* Update the vector-pointer's cross-iteration increment.  */
5019  FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
5020    {
5021      tree use = USE_FROM_PTR (use_p);
5022
5023      if (use == dataref_ptr)
5024        SET_USE (use_p, new_dataref_ptr);
5025      else
5026        gcc_assert (operand_equal_p (use, update, 0));
5027    }
5028
5029  return new_dataref_ptr;
5030}
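
/* Illustrative sketch (not part of GCC; compiled out): in scalar terms,
   bumping the dataref pointer introduces an intermediate pointer inside the
   iteration and re-bases the cross-iteration increment on it.  The helper
   below assumes hypothetical 16-byte vectors and two accesses per
   iteration.  */
#if 0
#include <string.h>

static void
two_accesses_per_iteration (char *out, const char *p, const char *end,
			    long bump, long step)
{
  while (p < end)
    {
      memcpy (out, p, 16);		/* access through DATAREF_PTR  */
      const char *new_p = p + bump;	/* NEW_DATAREF_PTR = DATAREF_PTR + BUMP  */
      memcpy (out + 16, new_p, 16);	/* access through the bumped pointer  */
      p = new_p + step;			/* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
      out += 32;
    }
}
#endif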
5031
5032
5033/* Copy memory reference info such as base/clique from the SRC reference
5034   to the DEST MEM_REF.  */
5035
5036void
5037vect_copy_ref_info (tree dest, tree src)
5038{
5039  if (TREE_CODE (dest) != MEM_REF)
5040    return;
5041
5042  tree src_base = src;
5043  while (handled_component_p (src_base))
5044    src_base = TREE_OPERAND (src_base, 0);
5045  if (TREE_CODE (src_base) != MEM_REF
5046      && TREE_CODE (src_base) != TARGET_MEM_REF)
5047    return;
5048
5049  MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5050  MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5051}
5052
5053
5054/* Function vect_create_destination_var.
5055
5056   Create a new temporary of type VECTYPE.  */
5057
5058tree
5059vect_create_destination_var (tree scalar_dest, tree vectype)
5060{
5061  tree vec_dest;
5062  const char *name;
5063  char *new_name;
5064  tree type;
5065  enum vect_var_kind kind;
5066
5067  kind = vectype
5068    ? VECTOR_BOOLEAN_TYPE_P (vectype)
5069    ? vect_mask_var
5070    : vect_simple_var
5071    : vect_scalar_var;
5072  type = vectype ? vectype : TREE_TYPE (scalar_dest);
5073
5074  gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5075
5076  name = get_name (scalar_dest);
5077  if (name)
5078    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5079  else
5080    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5081  vec_dest = vect_get_new_vect_var (type, kind, new_name);
5082  free (new_name);
5083
5084  return vec_dest;
5085}
5086
5087/* Function vect_grouped_store_supported.
5088
5089   Returns TRUE if interleave high and interleave low permutations
5090   are supported, and FALSE otherwise.  */
5091
5092bool
5093vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5094{
5095  machine_mode mode = TYPE_MODE (vectype);
5096
5097  /* vect_permute_store_chain requires the group size to be equal to 3 or
5098     be a power of two.  */
5099  if (count != 3 && exact_log2 (count) == -1)
5100    {
5101      if (dump_enabled_p ())
5102	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5103			 "the size of the group of accesses"
5104			 " is not a power of 2 or not equal to 3\n");
5105      return false;
5106    }
5107
5108  /* Check that the permutation is supported.  */
5109  if (VECTOR_MODE_P (mode))
5110    {
5111      unsigned int i;
5112      if (count == 3)
5113	{
5114	  unsigned int j0 = 0, j1 = 0, j2 = 0;
5115	  unsigned int i, j;
5116
5117	  unsigned int nelt;
5118	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5119	    {
5120	      if (dump_enabled_p ())
5121		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122				 "cannot handle groups of 3 stores for"
5123				 " variable-length vectors\n");
5124	      return false;
5125	    }
5126
5127	  vec_perm_builder sel (nelt, nelt, 1);
5128	  sel.quick_grow (nelt);
5129	  vec_perm_indices indices;
5130	  for (j = 0; j < 3; j++)
5131	    {
5132	      int nelt0 = ((3 - j) * nelt) % 3;
5133	      int nelt1 = ((3 - j) * nelt + 1) % 3;
5134	      int nelt2 = ((3 - j) * nelt + 2) % 3;
5135	      for (i = 0; i < nelt; i++)
5136		{
5137		  if (3 * i + nelt0 < nelt)
5138		    sel[3 * i + nelt0] = j0++;
5139		  if (3 * i + nelt1 < nelt)
5140		    sel[3 * i + nelt1] = nelt + j1++;
5141		  if (3 * i + nelt2 < nelt)
5142		    sel[3 * i + nelt2] = 0;
5143		}
5144	      indices.new_vector (sel, 2, nelt);
5145	      if (!can_vec_perm_const_p (mode, indices))
5146		{
5147		  if (dump_enabled_p ())
5148		    dump_printf (MSG_MISSED_OPTIMIZATION,
5149				 "permutation op not supported by target.\n");
5150		  return false;
5151		}
5152
5153	      for (i = 0; i < nelt; i++)
5154		{
5155		  if (3 * i + nelt0 < nelt)
5156		    sel[3 * i + nelt0] = 3 * i + nelt0;
5157		  if (3 * i + nelt1 < nelt)
5158		    sel[3 * i + nelt1] = 3 * i + nelt1;
5159		  if (3 * i + nelt2 < nelt)
5160		    sel[3 * i + nelt2] = nelt + j2++;
5161		}
5162	      indices.new_vector (sel, 2, nelt);
5163	      if (!can_vec_perm_const_p (mode, indices))
5164		{
5165		  if (dump_enabled_p ())
5166		    dump_printf (MSG_MISSED_OPTIMIZATION,
5167				 "permutation op not supported by target.\n");
5168		  return false;
5169		}
5170	    }
5171	  return true;
5172	}
5173      else
5174	{
5175	  /* If length is not equal to 3 then only power of 2 is supported.  */
5176	  gcc_assert (pow2p_hwi (count));
5177	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5178
5179	  /* The encoding has 2 interleaved stepped patterns.  */
5180	  vec_perm_builder sel (nelt, 2, 3);
5181	  sel.quick_grow (6);
5182	  for (i = 0; i < 3; i++)
5183	    {
5184	      sel[i * 2] = i;
5185	      sel[i * 2 + 1] = i + nelt;
5186	    }
5187	  vec_perm_indices indices (sel, 2, nelt);
5188	  if (can_vec_perm_const_p (mode, indices))
5189	    {
5190	      for (i = 0; i < 6; i++)
5191		sel[i] += exact_div (nelt, 2);
5192	      indices.new_vector (sel, 2, nelt);
5193	      if (can_vec_perm_const_p (mode, indices))
5194		return true;
5195	    }
5196	}
5197    }
5198
5199  if (dump_enabled_p ())
5200    dump_printf (MSG_MISSED_OPTIMIZATION,
5201		 "permutation op not supported by target.\n");
5202  return false;
5203}
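
/* Illustrative sketch (not part of GCC; compiled out): how the 6-element
   encoding used above for a power-of-two group expands for a hypothetical
   8-element vector.  Each of the two interleaved patterns steps by 1, so
   the full interleave-high selector is {0, 8, 1, 9, 2, 10, 3, 11} and,
   after adding nelt/2 to every element, the interleave-low selector is
   {4, 12, 5, 13, 6, 14, 7, 15}.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;
  unsigned high[8], low[8];
  for (unsigned i = 0; i < nelt / 2; i++)
    {
      high[2 * i] = i;			/* element i of the first vector  */
      high[2 * i + 1] = i + nelt;	/* element i of the second vector  */
      low[2 * i] = high[2 * i] + nelt / 2;
      low[2 * i + 1] = high[2 * i + 1] + nelt / 2;
    }
  for (unsigned i = 0; i < nelt; i++)
    printf ("high %2u   low %2u\n", high[i], low[i]);
  return 0;
}
#endif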
5204
5205
5206/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5207   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5208
5209bool
5210vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5211			    bool masked_p)
5212{
5213  if (masked_p)
5214    return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5215					 vec_mask_store_lanes_optab,
5216					 vectype, count);
5217  else
5218    return vect_lanes_optab_supported_p ("vec_store_lanes",
5219					 vec_store_lanes_optab,
5220					 vectype, count);
5221}
5222
5223
5224/* Function vect_permute_store_chain.
5225
5226   Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5227   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5228   the data correctly for the stores.  Return the final references for stores
5229   in RESULT_CHAIN.
5230
5231   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5232   The input is 4 vectors each containing 8 elements.  We assign a number to
5233   each element, the input sequence is:
5234
5235   1st vec:   0  1  2  3  4  5  6  7
5236   2nd vec:   8  9 10 11 12 13 14 15
5237   3rd vec:  16 17 18 19 20 21 22 23
5238   4th vec:  24 25 26 27 28 29 30 31
5239
5240   The output sequence should be:
5241
5242   1st vec:  0  8 16 24  1  9 17 25
5243   2nd vec:  2 10 18 26  3 11 19 27
5244   3rd vec:  4 12 20 28  5 13 21 29
5245   4th vec:  6 14 22 30  7 15 23 31
5246
5247   i.e., we interleave the contents of the four vectors in their order.
5248
5249   We use interleave_high/low instructions to create such output.  The input of
5250   each interleave_high/low operation is two vectors:
5251   1st vec    2nd vec
5252   0 1 2 3    4 5 6 7
5253   the even elements of the result vector are obtained left-to-right from the
5254   high/low elements of the first vector.  The odd elements of the result are
5255   obtained left-to-right from the high/low elements of the second vector.
5256   The output of interleave_high will be:   0 4 1 5
5257   and of interleave_low:                   2 6 3 7
5258
5259
5260   The permutation is done in log LENGTH stages.  In each stage interleave_high
5261   and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5262   where the first argument is taken from the first half of DR_CHAIN and the
5263   second argument from its second half.
5264   In our example,
5265
5266   I1: interleave_high (1st vec, 3rd vec)
5267   I2: interleave_low (1st vec, 3rd vec)
5268   I3: interleave_high (2nd vec, 4th vec)
5269   I4: interleave_low (2nd vec, 4th vec)
5270
5271   The output for the first stage is:
5272
5273   I1:  0 16  1 17  2 18  3 19
5274   I2:  4 20  5 21  6 22  7 23
5275   I3:  8 24  9 25 10 26 11 27
5276   I4: 12 28 13 29 14 30 15 31
5277
5278   The output of the second stage, i.e. the final result is:
5279
5280   I1:  0  8 16 24  1  9 17 25
5281   I2:  2 10 18 26  3 11 19 27
5282   I3:  4 12 20 28  5 13 21 29
5283   I4:  6 14 22 30  7 15 23 31.  */
5284
5285void
5286vect_permute_store_chain (vec<tree> dr_chain,
5287			  unsigned int length,
5288			  stmt_vec_info stmt_info,
5289			  gimple_stmt_iterator *gsi,
5290			  vec<tree> *result_chain)
5291{
5292  tree vect1, vect2, high, low;
5293  gimple *perm_stmt;
5294  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5295  tree perm_mask_low, perm_mask_high;
5296  tree data_ref;
5297  tree perm3_mask_low, perm3_mask_high;
5298  unsigned int i, j, n, log_length = exact_log2 (length);
5299
5300  result_chain->quick_grow (length);
5301  memcpy (result_chain->address (), dr_chain.address (),
5302	  length * sizeof (tree));
5303
5304  if (length == 3)
5305    {
5306      /* vect_grouped_store_supported ensures that this is constant.  */
5307      unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5308      unsigned int j0 = 0, j1 = 0, j2 = 0;
5309
5310      vec_perm_builder sel (nelt, nelt, 1);
5311      sel.quick_grow (nelt);
5312      vec_perm_indices indices;
5313      for (j = 0; j < 3; j++)
5314        {
5315	  int nelt0 = ((3 - j) * nelt) % 3;
5316	  int nelt1 = ((3 - j) * nelt + 1) % 3;
5317	  int nelt2 = ((3 - j) * nelt + 2) % 3;
5318
5319	  for (i = 0; i < nelt; i++)
5320	    {
5321	      if (3 * i + nelt0 < nelt)
5322		sel[3 * i + nelt0] = j0++;
5323	      if (3 * i + nelt1 < nelt)
5324		sel[3 * i + nelt1] = nelt + j1++;
5325	      if (3 * i + nelt2 < nelt)
5326		sel[3 * i + nelt2] = 0;
5327	    }
5328	  indices.new_vector (sel, 2, nelt);
5329	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5330
5331	  for (i = 0; i < nelt; i++)
5332	    {
5333	      if (3 * i + nelt0 < nelt)
5334		sel[3 * i + nelt0] = 3 * i + nelt0;
5335	      if (3 * i + nelt1 < nelt)
5336		sel[3 * i + nelt1] = 3 * i + nelt1;
5337	      if (3 * i + nelt2 < nelt)
5338		sel[3 * i + nelt2] = nelt + j2++;
5339	    }
5340	  indices.new_vector (sel, 2, nelt);
5341	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5342
5343	  vect1 = dr_chain[0];
5344	  vect2 = dr_chain[1];
5345
5346	  /* Create interleaving stmt:
5347	     low = VEC_PERM_EXPR <vect1, vect2,
5348				  {j, nelt, *, j + 1, nelt + j + 1, *,
5349				   j + 2, nelt + j + 2, *, ...}>  */
5350	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5351	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5352					   vect2, perm3_mask_low);
5353	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5354
5355	  vect1 = data_ref;
5356	  vect2 = dr_chain[2];
5357	  /* Create interleaving stmt:
5358	     low = VEC_PERM_EXPR <vect1, vect2,
5359				  {0, 1, nelt + j, 3, 4, nelt + j + 1,
5360				   6, 7, nelt + j + 2, ...}>  */
5361	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5362	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5363					   vect2, perm3_mask_high);
5364	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5365	  (*result_chain)[j] = data_ref;
5366	}
5367    }
5368  else
5369    {
5370      /* If length is not equal to 3 then only power of 2 is supported.  */
5371      gcc_assert (pow2p_hwi (length));
5372
5373      /* The encoding has 2 interleaved stepped patterns.  */
5374      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5375      vec_perm_builder sel (nelt, 2, 3);
5376      sel.quick_grow (6);
5377      for (i = 0; i < 3; i++)
5378	{
5379	  sel[i * 2] = i;
5380	  sel[i * 2 + 1] = i + nelt;
5381	}
5382      vec_perm_indices indices (sel, 2, nelt);
5383      perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5384
5385      for (i = 0; i < 6; i++)
5386	sel[i] += exact_div (nelt, 2);
5387      indices.new_vector (sel, 2, nelt);
5388      perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5389
5390      for (i = 0, n = log_length; i < n; i++)
5391	{
5392	  for (j = 0; j < length/2; j++)
5393	    {
5394	      vect1 = dr_chain[j];
5395	      vect2 = dr_chain[j+length/2];
5396
5397	      /* Create interleaving stmt:
5398		 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5399						      ...}>  */
5400	      high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5401	      perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5402					       vect2, perm_mask_high);
5403	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5404	      (*result_chain)[2*j] = high;
5405
5406	      /* Create interleaving stmt:
5407		 low = VEC_PERM_EXPR <vect1, vect2,
5408				      {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5409				       ...}>  */
5410	      low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5411	      perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5412					       vect2, perm_mask_low);
5413	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5414	      (*result_chain)[2*j+1] = low;
5415	    }
5416	  memcpy (dr_chain.address (), result_chain->address (),
5417		  length * sizeof (tree));
5418	}
5419    }
5420}
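
/* Illustrative sketch (not part of GCC; compiled out): a scalar check of the
   two interleave stages documented above for LENGTH 4 and 8 elements per
   vector.  Running it prints exactly the final RESULT_CHAIN contents listed
   in the comment (0 8 16 24 1 9 17 25, and so on).  */
#if 0
#include <stdio.h>
#include <string.h>

#define NVEC 4
#define NELT 8

int
main (void)
{
  int chain[NVEC][NELT], result[NVEC][NELT];
  for (int v = 0; v < NVEC; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  for (int stage = 0; stage < 2; stage++)	/* log2 (NVEC) stages  */
    {
      for (int j = 0; j < NVEC / 2; j++)
	{
	  const int *v1 = chain[j], *v2 = chain[j + NVEC / 2];
	  for (int e = 0; e < NELT / 2; e++)
	    {
	      /* interleave_high of (v1, v2) ...  */
	      result[2 * j][2 * e] = v1[e];
	      result[2 * j][2 * e + 1] = v2[e];
	      /* ... and interleave_low of (v1, v2).  */
	      result[2 * j + 1][2 * e] = v1[e + NELT / 2];
	      result[2 * j + 1][2 * e + 1] = v2[e + NELT / 2];
	    }
	}
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < NVEC; v++)
    {
      for (int e = 0; e < NELT; e++)
	printf (" %2d", chain[v][e]);
      printf ("\n");
    }
  return 0;
}
#endif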
5421
5422/* Function vect_setup_realignment
5423
5424   This function is called when vectorizing an unaligned load using
5425   the dr_explicit_realign[_optimized] scheme.
5426   This function generates the following code at the loop prolog:
5427
5428      p = initial_addr;
5429   x  msq_init = *(floor(p));   # prolog load
5430      realignment_token = call target_builtin;
5431    loop:
5432   x  msq = phi (msq_init, ---)
5433
5434   The stmts marked with x are generated only for the case of
5435   dr_explicit_realign_optimized.
5436
5437   The code above sets up a new (vector) pointer, pointing to the first
5438   location accessed by STMT_INFO, and a "floor-aligned" load using that
5439   pointer.  It also generates code to compute the "realignment-token"
5440   (if the relevant target hook was defined), and creates a phi-node at the
5441   loop-header bb whose arguments are the result of the prolog-load (created
5442   by this function) and the result of a load that takes place in the loop
5443   (to be created by the caller to this function).
5444
5445   For the case of dr_explicit_realign_optimized:
5446   The caller to this function uses the phi-result (msq) to create the
5447   realignment code inside the loop, and sets up the missing phi argument,
5448   as follows:
5449    loop:
5450      msq = phi (msq_init, lsq)
5451      lsq = *(floor(p'));        # load in loop
5452      result = realign_load (msq, lsq, realignment_token);
5453
5454   For the case of dr_explicit_realign:
5455    loop:
5456      msq = *(floor(p)); 	# load in loop
5457      p' = p + (VS-1);
5458      lsq = *(floor(p'));	# load in loop
5459      result = realign_load (msq, lsq, realignment_token);
5460
5461   Input:
5462   STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5463	       a memory location that may be unaligned.
5464   BSI - place where new code is to be inserted.
5465   ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5466			      is used.
5467
5468   Output:
5469   REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5470                       target hook, if defined.
5471   Return value - the result of the loop-header phi node.  */
5472
5473tree
5474vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5475                        tree *realignment_token,
5476			enum dr_alignment_support alignment_support_scheme,
5477			tree init_addr,
5478			class loop **at_loop)
5479{
5480  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5481  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5482  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5483  struct data_reference *dr = dr_info->dr;
5484  class loop *loop = NULL;
5485  edge pe = NULL;
5486  tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5487  tree vec_dest;
5488  gimple *inc;
5489  tree ptr;
5490  tree data_ref;
5491  basic_block new_bb;
5492  tree msq_init = NULL_TREE;
5493  tree new_temp;
5494  gphi *phi_stmt;
5495  tree msq = NULL_TREE;
5496  gimple_seq stmts = NULL;
5497  bool compute_in_loop = false;
5498  bool nested_in_vect_loop = false;
5499  class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5500  class loop *loop_for_initial_load = NULL;
5501
5502  if (loop_vinfo)
5503    {
5504      loop = LOOP_VINFO_LOOP (loop_vinfo);
5505      nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5506    }
5507
5508  gcc_assert (alignment_support_scheme == dr_explicit_realign
5509	      || alignment_support_scheme == dr_explicit_realign_optimized);
5510
5511  /* We need to generate three things:
5512     1. the misalignment computation
5513     2. the extra vector load (for the optimized realignment scheme).
5514     3. the phi node for the two vectors from which the realignment is
5515      done (for the optimized realignment scheme).  */
5516
5517  /* 1. Determine where to generate the misalignment computation.
5518
5519     If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5520     calculation will be generated by this function, outside the loop (in the
5521     preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5522     caller, inside the loop.
5523
5524     Background: If the misalignment remains fixed throughout the iterations of
5525     the loop, then both realignment schemes are applicable, and also the
5526     misalignment computation can be done outside LOOP.  This is because we are
5527     vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5528     are a multiple of VS (the Vector Size), and therefore the misalignment in
5529     different vectorized LOOP iterations is always the same.
5530     The problem arises only if the memory access is in an inner-loop nested
5531     inside LOOP, which is now being vectorized using outer-loop vectorization.
5532     This is the only case when the misalignment of the memory access may not
5533     remain fixed throughout the iterations of the inner-loop (as explained in
5534     detail in vect_supportable_dr_alignment).  In this case, not only is the
5535     optimized realignment scheme not applicable, but also the misalignment
5536     computation (and generation of the realignment token that is passed to
5537     REALIGN_LOAD) have to be done inside the loop.
5538
5539     In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5540     or not, which in turn determines if the misalignment is computed inside
5541     the inner-loop, or outside LOOP.  */
5542
5543  if (init_addr != NULL_TREE || !loop_vinfo)
5544    {
5545      compute_in_loop = true;
5546      gcc_assert (alignment_support_scheme == dr_explicit_realign);
5547    }
5548
5549
5550  /* 2. Determine where to generate the extra vector load.
5551
5552     For the optimized realignment scheme, instead of generating two vector
5553     loads in each iteration, we generate a single extra vector load in the
5554     preheader of the loop, and in each iteration reuse the result of the
5555     vector load from the previous iteration.  In case the memory access is in
5556     an inner-loop nested inside LOOP, which is now being vectorized using
5557     outer-loop vectorization, we need to determine whether this initial vector
5558     load should be generated at the preheader of the inner-loop, or can be
5559     generated at the preheader of LOOP.  If the memory access has no evolution
5560     in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5561     to be generated inside LOOP (in the preheader of the inner-loop).  */
5562
5563  if (nested_in_vect_loop)
5564    {
5565      tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5566      bool invariant_in_outerloop =
5567            (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5568      loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5569    }
5570  else
5571    loop_for_initial_load = loop;
5572  if (at_loop)
5573    *at_loop = loop_for_initial_load;
5574
5575  if (loop_for_initial_load)
5576    pe = loop_preheader_edge (loop_for_initial_load);
5577
5578  /* 3. For the case of the optimized realignment, create the first vector
5579      load at the loop preheader.  */
5580
5581  if (alignment_support_scheme == dr_explicit_realign_optimized)
5582    {
5583      /* Create msq_init = *(floor(p1)) in the loop preheader  */
5584      gassign *new_stmt;
5585
5586      gcc_assert (!compute_in_loop);
5587      vec_dest = vect_create_destination_var (scalar_dest, vectype);
5588      ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5589				      loop_for_initial_load, NULL_TREE,
5590				      &init_addr, NULL, &inc, true);
5591      if (TREE_CODE (ptr) == SSA_NAME)
5592	new_temp = copy_ssa_name (ptr);
5593      else
5594	new_temp = make_ssa_name (TREE_TYPE (ptr));
5595      poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5596      tree type = TREE_TYPE (ptr);
5597      new_stmt = gimple_build_assign
5598		   (new_temp, BIT_AND_EXPR, ptr,
5599		    fold_build2 (MINUS_EXPR, type,
5600				 build_int_cst (type, 0),
5601				 build_int_cst (type, align)));
5602      new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5603      gcc_assert (!new_bb);
5604      data_ref
5605	= build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5606		  build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5607      vect_copy_ref_info (data_ref, DR_REF (dr));
5608      new_stmt = gimple_build_assign (vec_dest, data_ref);
5609      new_temp = make_ssa_name (vec_dest, new_stmt);
5610      gimple_assign_set_lhs (new_stmt, new_temp);
5611      if (pe)
5612        {
5613          new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5614          gcc_assert (!new_bb);
5615        }
5616      else
5617         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5618
5619      msq_init = gimple_assign_lhs (new_stmt);
5620    }
5621
5622  /* 4. Create realignment token using a target builtin, if available.
5623      It is done either inside the containing loop, or before LOOP (as
5624      determined above).  */
5625
5626  if (targetm.vectorize.builtin_mask_for_load)
5627    {
5628      gcall *new_stmt;
5629      tree builtin_decl;
5630
5631      /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5632      if (!init_addr)
5633	{
5634	  /* Generate the INIT_ADDR computation outside LOOP.  */
5635	  init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5636							    NULL_TREE);
5637          if (loop)
5638            {
5639   	      pe = loop_preheader_edge (loop);
5640	      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5641	      gcc_assert (!new_bb);
5642            }
5643          else
5644             gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5645	}
5646
5647      builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5648      new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5649      vec_dest =
5650	vect_create_destination_var (scalar_dest,
5651				     gimple_call_return_type (new_stmt));
5652      new_temp = make_ssa_name (vec_dest, new_stmt);
5653      gimple_call_set_lhs (new_stmt, new_temp);
5654
5655      if (compute_in_loop)
5656	gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5657      else
5658	{
5659	  /* Generate the misalignment computation outside LOOP.  */
5660	  pe = loop_preheader_edge (loop);
5661	  new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5662	  gcc_assert (!new_bb);
5663	}
5664
5665      *realignment_token = gimple_call_lhs (new_stmt);
5666
5667      /* The result of the CALL_EXPR to this builtin is determined from
5668         the value of the parameter and no global variables are touched
5669         which makes the builtin a "const" function.  Requiring the
5670         builtin to have the "const" attribute makes it unnecessary
5671         to call mark_call_clobbered.  */
5672      gcc_assert (TREE_READONLY (builtin_decl));
5673    }
5674
5675  if (alignment_support_scheme == dr_explicit_realign)
5676    return msq;
5677
5678  gcc_assert (!compute_in_loop);
5679  gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5680
5681
5682  /* 5. Create msq = phi <msq_init, lsq> in loop  */
5683
5684  pe = loop_preheader_edge (containing_loop);
5685  vec_dest = vect_create_destination_var (scalar_dest, vectype);
5686  msq = make_ssa_name (vec_dest);
5687  phi_stmt = create_phi_node (msq, containing_loop->header);
5688  add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5689
5690  return msq;
5691}
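
/* Illustrative sketch (not part of GCC; compiled out): a plain-C model of
   the dr_explicit_realign_optimized scheme set up above, assuming a
   hypothetical 16-byte vector size.  The realignment operation itself is
   target-specific; it is modeled here with two memcpy calls driven by the
   misalignment, which plays the role of the realignment token.  */
#if 0
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define VS 16	/* assumed vector size in bytes  */

/* Model of realign_load: the last VS - SHIFT bytes of MSQ followed by the
   first SHIFT bytes of LSQ.  */
static void
realign_load_model (const uint8_t *msq, const uint8_t *lsq, size_t shift,
		    uint8_t *out)
{
  memcpy (out, msq + shift, VS - shift);
  memcpy (out + (VS - shift), lsq, shift);
}

static void
unaligned_vector_copy (uint8_t *dst, const uint8_t *p, int nvectors)
{
  size_t shift = (uintptr_t) p % VS;		/* misalignment of P  */
  const uint8_t *aligned = p - shift;		/* floor (p)  */
  uint8_t msq[VS], lsq[VS];
  memcpy (msq, aligned, VS);			/* msq_init: prolog load  */
  for (int i = 0; i < nvectors; i++)
    {						/* msq = phi (msq_init, lsq)  */
      memcpy (lsq, aligned + (i + 1) * VS, VS);	/* lsq = *(floor (p'))  */
      realign_load_model (msq, lsq, shift, dst + i * VS);
      memcpy (msq, lsq, VS);			/* feed lsq back into the phi  */
    }
}
#endif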
5692
5693
5694/* Function vect_grouped_load_supported.
5695
5696   COUNT is the size of the load group (the number of statements plus the
5697   number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5698   only one statement, with a gap of COUNT - 1.
5699
5700   Returns true if a suitable permute exists.  */
5701
5702bool
5703vect_grouped_load_supported (tree vectype, bool single_element_p,
5704			     unsigned HOST_WIDE_INT count)
5705{
5706  machine_mode mode = TYPE_MODE (vectype);
5707
5708  /* If this is single-element interleaving with an element distance
5709     that leaves unused vector loads around, punt - we at least create
5710     very sub-optimal code in that case (and blow up memory,
5711     see PR65518).  */
5712  if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5713    {
5714      if (dump_enabled_p ())
5715	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5716			 "single-element interleaving not supported "
5717			 "for non-adjacent vector loads\n");
5718      return false;
5719    }
5720
5721  /* vect_permute_load_chain requires the group size to be equal to 3 or
5722     be a power of two.  */
5723  if (count != 3 && exact_log2 (count) == -1)
5724    {
5725      if (dump_enabled_p ())
5726	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5727			 "the size of the group of accesses"
5728			 " is not a power of 2 or not equal to 3\n");
5729      return false;
5730    }
5731
5732  /* Check that the permutation is supported.  */
5733  if (VECTOR_MODE_P (mode))
5734    {
5735      unsigned int i, j;
5736      if (count == 3)
5737	{
5738	  unsigned int nelt;
5739	  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5740	    {
5741	      if (dump_enabled_p ())
5742		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5743				 "cannot handle groups of 3 loads for"
5744				 " variable-length vectors\n");
5745	      return false;
5746	    }
5747
5748	  vec_perm_builder sel (nelt, nelt, 1);
5749	  sel.quick_grow (nelt);
5750	  vec_perm_indices indices;
5751	  unsigned int k;
5752	  for (k = 0; k < 3; k++)
5753	    {
5754	      for (i = 0; i < nelt; i++)
5755		if (3 * i + k < 2 * nelt)
5756		  sel[i] = 3 * i + k;
5757		else
5758		  sel[i] = 0;
5759	      indices.new_vector (sel, 2, nelt);
5760	      if (!can_vec_perm_const_p (mode, indices))
5761		{
5762		  if (dump_enabled_p ())
5763		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5764				     "shuffle of 3 loads is not supported by"
5765				     " target\n");
5766		  return false;
5767		}
5768	      for (i = 0, j = 0; i < nelt; i++)
5769		if (3 * i + k < 2 * nelt)
5770		  sel[i] = i;
5771		else
5772		  sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5773	      indices.new_vector (sel, 2, nelt);
5774	      if (!can_vec_perm_const_p (mode, indices))
5775		{
5776		  if (dump_enabled_p ())
5777		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5778				     "shuffle of 3 loads is not supported by"
5779				     " target\n");
5780		  return false;
5781		}
5782	    }
5783	  return true;
5784	}
5785      else
5786	{
5787	  /* If length is not equal to 3 then only power of 2 is supported.  */
5788	  gcc_assert (pow2p_hwi (count));
5789	  poly_uint64 nelt = GET_MODE_NUNITS (mode);
5790
5791	  /* The encoding has a single stepped pattern.  */
5792	  vec_perm_builder sel (nelt, 1, 3);
5793	  sel.quick_grow (3);
5794	  for (i = 0; i < 3; i++)
5795	    sel[i] = i * 2;
5796	  vec_perm_indices indices (sel, 2, nelt);
5797	  if (can_vec_perm_const_p (mode, indices))
5798	    {
5799	      for (i = 0; i < 3; i++)
5800		sel[i] = i * 2 + 1;
5801	      indices.new_vector (sel, 2, nelt);
5802	      if (can_vec_perm_const_p (mode, indices))
5803		return true;
5804	    }
5805        }
5806    }
5807
5808  if (dump_enabled_p ())
5809    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5810		     "extract even/odd not supported by target\n");
5811  return false;
5812}
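
/* Illustrative sketch (not part of GCC; compiled out): the 3-element stepped
   encodings used above for a power-of-two group expand, for a hypothetical
   8-element vector, to the extract-even selector {0, 2, 4, ..., 14} and the
   extract-odd selector {1, 3, 5, ..., 15} over the 16 elements of the two
   concatenated input vectors.  */
#if 0
#include <stdio.h>

int
main (void)
{
  const unsigned nelt = 8;
  for (unsigned i = 0; i < nelt; i++)
    printf ("even %2u   odd %2u\n", i * 2, i * 2 + 1);
  return 0;
}
#endif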
5813
5814/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5815   type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5816
5817bool
5818vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5819			   bool masked_p)
5820{
5821  if (masked_p)
5822    return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5823					 vec_mask_load_lanes_optab,
5824					 vectype, count);
5825  else
5826    return vect_lanes_optab_supported_p ("vec_load_lanes",
5827					 vec_load_lanes_optab,
5828					 vectype, count);
5829}
5830
5831/* Function vect_permute_load_chain.
5832
5833   Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5834   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5835   the input data correctly.  Return the final references for loads in
5836   RESULT_CHAIN.
5837
5838   E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5839   The input is 4 vectors each containing 8 elements. We assign a number to each
5840   element, the input sequence is:
5841
5842   1st vec:   0  1  2  3  4  5  6  7
5843   2nd vec:   8  9 10 11 12 13 14 15
5844   3rd vec:  16 17 18 19 20 21 22 23
5845   4th vec:  24 25 26 27 28 29 30 31
5846
5847   The output sequence should be:
5848
5849   1st vec:  0 4  8 12 16 20 24 28
5850   2nd vec:  1 5  9 13 17 21 25 29
5851   3rd vec:  2 6 10 14 18 22 26 30
5852   4th vec:  3 7 11 15 19 23 27 31
5853
5854   i.e., the first output vector should contain the first elements of each
5855   interleaving group, etc.
5856
5857   We use extract_even/odd instructions to create such output.  The input of
5858   each extract_even/odd operation is two vectors
5859   1st vec    2nd vec
5860   0 1 2 3    4 5 6 7
5861
5862   and the output is the vector of extracted even/odd elements.  The output of
5863   extract_even will be:   0 2 4 6
5864   and of extract_odd:     1 3 5 7
5865
5866
5867   The permutation is done in log LENGTH stages.  In each stage extract_even
5868   and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5869   their order.  In our example,
5870
5871   E1: extract_even (1st vec, 2nd vec)
5872   E2: extract_odd (1st vec, 2nd vec)
5873   E3: extract_even (3rd vec, 4th vec)
5874   E4: extract_odd (3rd vec, 4th vec)
5875
5876   The output for the first stage will be:
5877
5878   E1:  0  2  4  6  8 10 12 14
5879   E2:  1  3  5  7  9 11 13 15
5880   E3: 16 18 20 22 24 26 28 30
5881   E4: 17 19 21 23 25 27 29 31
5882
5883   In order to proceed and create the correct sequence for the next stage (or
5884   for the correct output, if the second stage is the last one, as in our
5885   example), we first put the output of extract_even operation and then the
5886   output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5887   The input for the second stage is:
5888
5889   1st vec (E1):  0  2  4  6  8 10 12 14
5890   2nd vec (E3): 16 18 20 22 24 26 28 30
5891   3rd vec (E2):  1  3  5  7  9 11 13 15
5892   4th vec (E4): 17 19 21 23 25 27 29 31
5893
5894   The output of the second stage:
5895
5896   E1: 0 4  8 12 16 20 24 28
5897   E2: 2 6 10 14 18 22 26 30
5898   E3: 1 5  9 13 17 21 25 29
5899   E4: 3 7 11 15 19 23 27 31
5900
5901   And RESULT_CHAIN after reordering:
5902
5903   1st vec (E1):  0 4  8 12 16 20 24 28
5904   2nd vec (E3):  1 5  9 13 17 21 25 29
5905   3rd vec (E2):  2 6 10 14 18 22 26 30
5906   4th vec (E4):  3 7 11 15 19 23 27 31.  */
5907
5908static void
5909vect_permute_load_chain (vec<tree> dr_chain,
5910			 unsigned int length,
5911			 stmt_vec_info stmt_info,
5912			 gimple_stmt_iterator *gsi,
5913			 vec<tree> *result_chain)
5914{
5915  tree data_ref, first_vect, second_vect;
5916  tree perm_mask_even, perm_mask_odd;
5917  tree perm3_mask_low, perm3_mask_high;
5918  gimple *perm_stmt;
5919  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5920  unsigned int i, j, log_length = exact_log2 (length);
5921
5922  result_chain->quick_grow (length);
5923  memcpy (result_chain->address (), dr_chain.address (),
5924	  length * sizeof (tree));
5925
5926  if (length == 3)
5927    {
5928      /* vect_grouped_load_supported ensures that this is constant.  */
5929      unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5930      unsigned int k;
5931
5932      vec_perm_builder sel (nelt, nelt, 1);
5933      sel.quick_grow (nelt);
5934      vec_perm_indices indices;
5935      for (k = 0; k < 3; k++)
5936	{
5937	  for (i = 0; i < nelt; i++)
5938	    if (3 * i + k < 2 * nelt)
5939	      sel[i] = 3 * i + k;
5940	    else
5941	      sel[i] = 0;
5942	  indices.new_vector (sel, 2, nelt);
5943	  perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5944
5945	  for (i = 0, j = 0; i < nelt; i++)
5946	    if (3 * i + k < 2 * nelt)
5947	      sel[i] = i;
5948	    else
5949	      sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5950	  indices.new_vector (sel, 2, nelt);
5951	  perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5952
5953	  first_vect = dr_chain[0];
5954	  second_vect = dr_chain[1];
5955
5956	  /* Create interleaving stmt (low part of):
5957	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5958							     ...}>  */
5959	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5960	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5961					   second_vect, perm3_mask_low);
5962	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5963
5964	  /* Create interleaving stmt (high part of):
5965	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5966							      ...}>  */
5967	  first_vect = data_ref;
5968	  second_vect = dr_chain[2];
5969	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5970	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5971					   second_vect, perm3_mask_high);
5972	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5973	  (*result_chain)[k] = data_ref;
5974	}
5975    }
5976  else
5977    {
5978      /* If length is not equal to 3 then only power of 2 is supported.  */
5979      gcc_assert (pow2p_hwi (length));
5980
5981      /* The encoding has a single stepped pattern.  */
5982      poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5983      vec_perm_builder sel (nelt, 1, 3);
5984      sel.quick_grow (3);
5985      for (i = 0; i < 3; ++i)
5986	sel[i] = i * 2;
5987      vec_perm_indices indices (sel, 2, nelt);
5988      perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5989
5990      for (i = 0; i < 3; ++i)
5991	sel[i] = i * 2 + 1;
5992      indices.new_vector (sel, 2, nelt);
5993      perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5994
5995      for (i = 0; i < log_length; i++)
5996	{
5997	  for (j = 0; j < length; j += 2)
5998	    {
5999	      first_vect = dr_chain[j];
6000	      second_vect = dr_chain[j+1];
6001
6002	      /* data_ref = permute_even (first_data_ref, second_data_ref);  */
6003	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
6004	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6005					       first_vect, second_vect,
6006					       perm_mask_even);
6007	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6008	      (*result_chain)[j/2] = data_ref;
6009
6010	      /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
6011	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
6012	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6013					       first_vect, second_vect,
6014					       perm_mask_odd);
6015	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6016	      (*result_chain)[j/2+length/2] = data_ref;
6017	    }
6018	  memcpy (dr_chain.address (), result_chain->address (),
6019		  length * sizeof (tree));
6020	}
6021    }
6022}
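
/* Illustrative sketch (not part of GCC; compiled out): a scalar check of the
   two extract-even/odd stages documented above for LENGTH 4 and 8 elements
   per vector, including the RESULT_CHAIN placement used in the loop above.
   Running it prints the documented final order (0 4 8 ... 28, 1 5 9 ... 29,
   2 6 10 ... 30, 3 7 11 ... 31).  */
#if 0
#include <stdio.h>
#include <string.h>

#define NVEC 4
#define NELT 8

int
main (void)
{
  int chain[NVEC][NELT], result[NVEC][NELT];
  for (int v = 0; v < NVEC; v++)
    for (int e = 0; e < NELT; e++)
      chain[v][e] = v * NELT + e;

  for (int stage = 0; stage < 2; stage++)	/* log2 (NVEC) stages  */
    {
      for (int j = 0; j < NVEC; j += 2)
	{
	  const int *v1 = chain[j], *v2 = chain[j + 1];
	  for (int e = 0; e < NELT / 2; e++)
	    {
	      /* extract_even of (v1, v2) goes to result[j/2] ...  */
	      result[j / 2][e] = v1[2 * e];
	      result[j / 2][e + NELT / 2] = v2[2 * e];
	      /* ... extract_odd goes to result[j/2 + NVEC/2].  */
	      result[j / 2 + NVEC / 2][e] = v1[2 * e + 1];
	      result[j / 2 + NVEC / 2][e + NELT / 2] = v2[2 * e + 1];
	    }
	}
      memcpy (chain, result, sizeof (chain));
    }

  for (int v = 0; v < NVEC; v++)
    {
      for (int e = 0; e < NELT; e++)
	printf (" %2d", chain[v][e]);
      printf ("\n");
    }
  return 0;
}
#endif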
6023
6024/* Function vect_shift_permute_load_chain.
6025
6026   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
6027   sequence of stmts to reorder the input data accordingly.
6028   Return the final references for loads in RESULT_CHAIN.
6029   Return true if successful, false otherwise.
6030
6031   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
6032   The input is 3 vectors each containing 8 elements.  We assign a
6033   number to each element, the input sequence is:
6034
6035   1st vec:   0  1  2  3  4  5  6  7
6036   2nd vec:   8  9 10 11 12 13 14 15
6037   3rd vec:  16 17 18 19 20 21 22 23
6038
6039   The output sequence should be:
6040
6041   1st vec:  0 3 6  9 12 15 18 21
6042   2nd vec:  1 4 7 10 13 16 19 22
6043   3rd vec:  2 5 8 11 14 17 20 23
6044
6045   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
6046
6047   First we shuffle all 3 vectors to get correct elements order:
6048
6049   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
6050   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
6051   3rd vec:  (16 19 22) (17 20 23) (18 21)
6052
6053   Next we unite and shift vector 3 times:
6054
6055   1st step:
6056     shift right by 6 the concatenation of:
6057     "1st vec" and  "2nd vec"
6058       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6059     "2nd vec" and  "3rd vec"
6060       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6061     "3rd vec" and  "1st vec"
6062       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
6063			     | New vectors                   |
6064
6065     So that now new vectors are:
6066
6067     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
6068     2nd vec:  (10 13) (16 19 22) (17 20 23)
6069     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
6070
6071   2nd step:
6072     shift right by 5 the concatenation of:
6073     "1st vec" and  "3rd vec"
6074       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
6075     "2nd vec" and  "1st vec"
6076       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
6077     "3rd vec" and  "2nd vec"
6078       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
6079			  | New vectors                   |
6080
6081     So that now new vectors are:
6082
6083     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
6084     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6085     3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6086
6087   3rd step:
6088     shift right by 5 the concatenation of:
6089     "1st vec" and  "1st vec"
6090       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6091     shift right by 3 the concatenation of:
6092     "2nd vec" and  "2nd vec"
6093               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6094			  | New vectors                   |
6095
6096     So that now all vectors are READY:
6097     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6098     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6099     3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6100
6101   This algorithm is faster than the one in vect_permute_load_chain if:
6102     1.  "shift of a concatenation" is faster than general permutation.
6103	 This is usually so.
6104     2.  The TARGET machine can't execute vector instructions in parallel.
6105	 This is because each step of the algorithm depends on the previous one.
6106	 The algorithm in vect_permute_load_chain is much more parallel.
6107
6108   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6109*/
6110
6111static bool
6112vect_shift_permute_load_chain (vec<tree> dr_chain,
6113			       unsigned int length,
6114			       stmt_vec_info stmt_info,
6115			       gimple_stmt_iterator *gsi,
6116			       vec<tree> *result_chain)
6117{
6118  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6119  tree perm2_mask1, perm2_mask2, perm3_mask;
6120  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6121  gimple *perm_stmt;
6122
6123  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6124  unsigned int i;
6125  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6126
6127  unsigned HOST_WIDE_INT nelt, vf;
6128  if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6129      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6130    /* Not supported for variable-length vectors.  */
6131    return false;
6132
6133  vec_perm_builder sel (nelt, nelt, 1);
6134  sel.quick_grow (nelt);
6135
6136  result_chain->quick_grow (length);
6137  memcpy (result_chain->address (), dr_chain.address (),
6138	  length * sizeof (tree));
6139
6140  if (pow2p_hwi (length) && vf > 4)
6141    {
6142      unsigned int j, log_length = exact_log2 (length);
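      /* Build the first interleave mask: gather the even-indexed elements
	 of a vector into its low half and the odd-indexed elements into
	 its high half ({0 2 4 6 1 3 5 7} for vector length 8).  */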
6143      for (i = 0; i < nelt / 2; ++i)
6144	sel[i] = i * 2;
6145      for (i = 0; i < nelt / 2; ++i)
6146	sel[nelt / 2 + i] = i * 2 + 1;
6147      vec_perm_indices indices (sel, 2, nelt);
6148      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6149	{
6150	  if (dump_enabled_p ())
6151	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6152			     "shuffle of 2 fields structure is not "
6153			     "supported by target\n");
6154	  return false;
6155	}
6156      perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6157
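      /* And the complementary mask: odd-indexed elements in the low half,
	 even-indexed elements in the high half ({1 3 5 7 0 2 4 6} for
	 vector length 8).  */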
6158      for (i = 0; i < nelt / 2; ++i)
6159	sel[i] = i * 2 + 1;
6160      for (i = 0; i < nelt / 2; ++i)
6161	sel[nelt / 2 + i] = i * 2;
6162      indices.new_vector (sel, 2, nelt);
6163      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6164	{
6165	  if (dump_enabled_p ())
6166	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6167			     "shuffle of 2 fields structure is not "
6168			     "supported by target\n");
6169	  return false;
6170	}
6171      perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6172
6173      /* Generating permutation constant to shift all elements.
6174	 For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6175      for (i = 0; i < nelt; i++)
6176	sel[i] = nelt / 2 + i;
6177      indices.new_vector (sel, 2, nelt);
6178      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6179	{
6180	  if (dump_enabled_p ())
6181	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6182			     "shift permutation is not supported by target\n");
6183	  return false;
6184	}
6185      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6186
6187      /* Generating permutation constant to select elements from 2 vectors.
6188	 For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6189      for (i = 0; i < nelt / 2; i++)
6190	sel[i] = i;
6191      for (i = nelt / 2; i < nelt; i++)
6192	sel[i] = nelt + i;
6193      indices.new_vector (sel, 2, nelt);
6194      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6195	{
6196	  if (dump_enabled_p ())
6197	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6198			     "select is not supported by target\n");
6199	  return false;
6200	}
6201      select_mask = vect_gen_perm_mask_checked (vectype, indices);
6202
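      /* Each iteration of the outer loop below performs one combining step
	 on the whole chain: for every pair of input vectors, the
	 even-indexed elements go to the low half of RESULT_CHAIN (via
	 PERM2_MASK1/PERM2_MASK2 followed by SELECT_MASK) and the
	 odd-indexed elements go to the high half (via SHIFT1_MASK).  */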
6203      for (i = 0; i < log_length; i++)
6204	{
6205	  for (j = 0; j < length; j += 2)
6206	    {
6207	      first_vect = dr_chain[j];
6208	      second_vect = dr_chain[j + 1];
6209
6210	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6211	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6212					       first_vect, first_vect,
6213					       perm2_mask1);
6214	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6215	      vect[0] = data_ref;
6216
6217	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6218	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6219					       second_vect, second_vect,
6220					       perm2_mask2);
6221	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6222	      vect[1] = data_ref;
6223
6224	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6225	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6226					       vect[0], vect[1], shift1_mask);
6227	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6228	      (*result_chain)[j/2 + length/2] = data_ref;
6229
6230	      data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6231	      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6232					       vect[0], vect[1], select_mask);
6233	      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6234	      (*result_chain)[j/2] = data_ref;
6235	    }
6236	  memcpy (dr_chain.address (), result_chain->address (),
6237		  length * sizeof (tree));
6238	}
6239      return true;
6240    }
6241  if (length == 3 && vf > 2)
6242    {
6243      unsigned int k = 0, l = 0;
6244
6245      /* Generating permutation constant to get all elements in the right order.
6246	 For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6247      for (i = 0; i < nelt; i++)
6248	{
6249	  if (3 * k + (l % 3) >= nelt)
6250	    {
6251	      k = 0;
6252	      l += (3 - (nelt % 3));
6253	    }
6254	  sel[i] = 3 * k + (l % 3);
6255	  k++;
6256	}
6257      vec_perm_indices indices (sel, 2, nelt);
6258      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6259	{
6260	  if (dump_enabled_p ())
6261	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262			     "shuffle of 3 fields structure is not "
6263			     "supported by target\n");
6264	  return false;
6265	}
6266      perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6267
6268      /* Generating permutation constant to shift all elements.
6269	 For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6270      for (i = 0; i < nelt; i++)
6271	sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6272      indices.new_vector (sel, 2, nelt);
6273      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6274	{
6275	  if (dump_enabled_p ())
6276	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277			     "shift permutation is not supported by target\n");
6278	  return false;
6279	}
6280      shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6281
6282      /* Generating permutation constant to shift all elements.
6283	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6284      for (i = 0; i < nelt; i++)
6285	sel[i] = 2 * (nelt / 3) + 1 + i;
6286      indices.new_vector (sel, 2, nelt);
6287      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6288	{
6289	  if (dump_enabled_p ())
6290	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6291			     "shift permutation is not supported by target\n");
6292	  return false;
6293	}
6294      shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6295
6296      /* Generating permutation constant to shift all elements.
6297	 For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6298      for (i = 0; i < nelt; i++)
6299	sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6300      indices.new_vector (sel, 2, nelt);
6301      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6302	{
6303	  if (dump_enabled_p ())
6304	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6305			     "shift permutation is not supported by target\n");
6306	  return false;
6307	}
6308      shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6309
6310      /* Generating permutation constant to shift all elements.
6311	 For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6312      for (i = 0; i < nelt; i++)
6313	sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6314      indices.new_vector (sel, 2, nelt);
6315      if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6316	{
6317	  if (dump_enabled_p ())
6318	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6319			     "shift permutation is not supported by target\n");
6320	  return false;
6321	}
6322      shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6323
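      /* First phase: shuffle each input vector in place with PERM3_MASK so
	 that its three interleaved fields become contiguous groups, as in
	 the "1st vec"/"2nd vec"/"3rd vec" rows of the comment above.  */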
6324      for (k = 0; k < 3; k++)
6325	{
6326	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6327	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6328					   dr_chain[k], dr_chain[k],
6329					   perm3_mask);
6330	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6331	  vect[k] = data_ref;
6332	}
6333
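      /* Second phase: for each pair of shuffled vectors, take a window of
	 their concatenation with SHIFT1_MASK (the 1st step of the comment
	 above).  */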
6334      for (k = 0; k < 3; k++)
6335	{
6336	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6337	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6338					   vect[k % 3], vect[(k + 1) % 3],
6339					   shift1_mask);
6340	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6341	  vect_shift[k] = data_ref;
6342	}
6343
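      /* Third phase: combine pairs of the shifted vectors with SHIFT2_MASK
	 (the 2nd step above).  */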
6344      for (k = 0; k < 3; k++)
6345	{
6346	  data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6347	  perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6348					   vect_shift[(4 - k) % 3],
6349					   vect_shift[(3 - k) % 3],
6350					   shift2_mask);
6351	  vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6352	  vect[k] = data_ref;
6353	}
6354
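      /* vect[2] is already in its final element order; the remaining two
	 vectors each need one more shift, done with SHIFT3_MASK and
	 SHIFT4_MASK (the 3rd step above).  */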
6355      (*result_chain)[3 - (nelt % 3)] = vect[2];
6356
6357      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6358      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6359				       vect[0], shift3_mask);
6360      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6361      (*result_chain)[nelt % 3] = data_ref;
6362
6363      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6364      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6365				       vect[1], shift4_mask);
6366      vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6367      (*result_chain)[0] = data_ref;
6368      return true;
6369    }
6370  return false;
6371}
6372
6373/* Function vect_transform_grouped_load.
6374
6375   Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6376   to perform their permutation and assign the resulting vectorized statements to
6377   the scalar statements.
6378*/
6379
6380void
6381vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6382			     int size, gimple_stmt_iterator *gsi)
6383{
6384  machine_mode mode;
6385  vec<tree> result_chain = vNULL;
6386
6387  /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6388     RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
6389     vectors that are ready for vector computation.  */
6390  result_chain.create (size);
6391
6392  /* If the reassociation width for the vector type is 2 or greater, the
6393     target machine can execute 2 or more vector instructions in parallel;
6394     otherwise try vect_shift_permute_load_chain for the load group.  */
6395  mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6396  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6397      || pow2p_hwi (size)
6398      || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6399					 gsi, &result_chain))
6400    vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6401  vect_record_grouped_load_vectors (stmt_info, result_chain);
6402  result_chain.release ();
6403}
6404
6405/* RESULT_CHAIN contains the output of a group of grouped loads that were
6406   generated as part of the vectorization of STMT_INFO.  Assign the statement
6407   for each vector to the associated scalar statement.  */
6408
6409void
6410vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6411				  vec<tree> result_chain)
6412{
6413  vec_info *vinfo = stmt_info->vinfo;
6414  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6415  unsigned int i, gap_count;
6416  tree tmp_data_ref;
6417
6418  /* Put a permuted data-ref in the VECTORIZED_STMT field.
6419     Since we scan the chain starting from its first node, their order
6420     corresponds to the order of data-refs in RESULT_CHAIN.  */
6421  stmt_vec_info next_stmt_info = first_stmt_info;
6422  gap_count = 1;
6423  FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6424    {
6425      if (!next_stmt_info)
6426	break;
6427
6428      /* Skip the gaps.  Loads created for the gaps will be removed later by
6429       the dead code elimination pass.  No need to check for the first stmt in
6430       the group, since it always exists.
6431       DR_GROUP_GAP is the number of steps in elements from the previous
6432       access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6433       correspond to the gaps.  */
6434      if (next_stmt_info != first_stmt_info
6435	  && gap_count < DR_GROUP_GAP (next_stmt_info))
6436	{
6437	  gap_count++;
6438	  continue;
6439	}
6440
6441      /* ???  The following needs cleanup after the removal of
6442         DR_GROUP_SAME_DR_STMT.  */
6443      if (next_stmt_info)
6444        {
6445	  stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6446	  /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6447	     copies, and we put the new vector statement in the first available
6448	     RELATED_STMT.  */
6449	  if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6450	    STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6451	  else
6452            {
6453	      stmt_vec_info prev_stmt_info
6454		= STMT_VINFO_VEC_STMT (next_stmt_info);
6455	      stmt_vec_info rel_stmt_info
6456		= STMT_VINFO_RELATED_STMT (prev_stmt_info);
6457	      while (rel_stmt_info)
6458		{
6459		  prev_stmt_info = rel_stmt_info;
6460		  rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6461		}
6462
6463	      STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6464            }
6465
6466	  next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6467	  gap_count = 1;
6468        }
6469    }
6470}
6471
6472/* Function vect_can_force_dr_alignment_p.
6473
6474   Return whether the alignment of DECL can be forced to an ALIGNMENT-bit
6475   boundary.  */
6476
6477bool
6478vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6479{
6480  if (!VAR_P (decl))
6481    return false;
6482
6483  if (decl_in_symtab_p (decl)
6484      && !symtab_node::get (decl)->can_increase_alignment_p ())
6485    return false;
6486
6487  if (TREE_STATIC (decl))
6488    return (known_le (alignment,
6489		      (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6490  else
6491    return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6492}
6493
6494
6495/* Return whether the data reference DR_INFO is supported with respect to its
6496   alignment.
6497   If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6498   if it is aligned, i.e., check if it is possible to vectorize it with a
6499   different alignment.  */
6500
6501enum dr_alignment_support
6502vect_supportable_dr_alignment (dr_vec_info *dr_info,
6503                               bool check_aligned_accesses)
6504{
6505  data_reference *dr = dr_info->dr;
6506  stmt_vec_info stmt_info = dr_info->stmt;
6507  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6508  machine_mode mode = TYPE_MODE (vectype);
6509  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6510  class loop *vect_loop = NULL;
6511  bool nested_in_vect_loop = false;
6512
6513  if (aligned_access_p (dr_info) && !check_aligned_accesses)
6514    return dr_aligned;
6515
6516  /* For now assume all conditional loads/stores support unaligned
6517     access without any special code.  */
6518  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6519    if (gimple_call_internal_p (stmt)
6520	&& (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6521	    || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6522      return dr_unaligned_supported;
6523
6524  if (loop_vinfo)
6525    {
6526      vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6527      nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6528    }
6529
6530  /* Possibly unaligned access.  */
6531
6532  /* We can choose between using the implicit realignment scheme (generating
6533     a misaligned_move stmt) and the explicit realignment scheme (generating
6534     aligned loads with a REALIGN_LOAD).  There are two variants to the
6535     explicit realignment scheme: optimized, and unoptimized.
6536     We can optimize the realignment only if the step between consecutive
6537     vector loads is equal to the vector size.  Since the vector memory
6538     accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6539     is guaranteed that the misalignment amount remains the same throughout the
6540     execution of the vectorized loop.  Therefore, we can create the
6541     "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6542     at the loop preheader.
6543
6544     However, in the case of outer-loop vectorization, when vectorizing a
6545     memory access in the inner-loop nested within the LOOP that is now being
6546     vectorized, while it is guaranteed that the misalignment of the
6547     vectorized memory access will remain the same in different outer-loop
6548     iterations, it is *not* guaranteed that it will remain the same throughout
6549     the execution of the inner-loop.  This is because the inner-loop advances
6550     with the original scalar step (and not in steps of VS).  If the inner-loop
6551     step happens to be a multiple of VS, then the misalignment remains fixed
6552     and we can use the optimized realignment scheme.  For example:
6553
6554      for (i=0; i<N; i++)
6555        for (j=0; j<M; j++)
6556          s += a[i+j];
6557
6558     When vectorizing the i-loop in the above example, the step between
6559     consecutive vector loads is 1, and so the misalignment does not remain
6560     fixed across the execution of the inner-loop, and the realignment cannot
6561     be optimized (as illustrated in the following pseudo vectorized loop):
6562
6563      for (i=0; i<N; i+=4)
6564        for (j=0; j<M; j++){
6565          vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6566                         // when j is {0,1,2,3,4,5,6,7,...} respectively.
6567                         // (assuming that we start from an aligned address).
6568          }
6569
6570     We therefore have to use the unoptimized realignment scheme:
6571
6572      for (i=0; i<N; i+=4)
6573          for (j=k; j<M; j+=4)
6574          vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6575                           // that the misalignment of the initial address is
6576                           // 0).
6577
6578     The loop can then be vectorized as follows:
6579
6580      for (k=0; k<4; k++){
6581        rt = get_realignment_token (&vp[k]);
6582        for (i=0; i<N; i+=4){
6583          v1 = vp[i+k];
6584          for (j=k; j<M; j+=4){
6585            v2 = vp[i+j+VS-1];
6586            va = REALIGN_LOAD <v1,v2,rt>;
6587            vs += va;
6588            v1 = v2;
6589          }
6590        }
6591    } */
6592
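  /* The code below chooses among the remaining dr_alignment_support
     values: dr_explicit_realign_optimized and dr_explicit_realign are only
     used for reads when the target provides vec_realign_load_optab (the
     optimized form requires the misalignment to stay fixed across the
     loop); dr_unaligned_supported is used when the target hook
     support_vector_misalignment accepts the access; otherwise
     dr_unaligned_unsupported is returned.  */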
6593  if (DR_IS_READ (dr))
6594    {
6595      bool is_packed = false;
6596      tree type = (TREE_TYPE (DR_REF (dr)));
6597
6598      if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6599	  && (!targetm.vectorize.builtin_mask_for_load
6600	      || targetm.vectorize.builtin_mask_for_load ()))
6601	{
6602	  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6603
6604	  /* If we are doing SLP then the accesses need not have the
6605	     same alignment; instead it depends on the SLP group size.  */
6606	  if (loop_vinfo
6607	      && STMT_SLP_TYPE (stmt_info)
6608	      && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6609			      * (DR_GROUP_SIZE
6610				 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6611			      TYPE_VECTOR_SUBPARTS (vectype)))
6612	    ;
6613	  else if (!loop_vinfo
6614		   || (nested_in_vect_loop
6615		       && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6616				    GET_MODE_SIZE (TYPE_MODE (vectype)))))
6617	    return dr_explicit_realign;
6618	  else
6619	    return dr_explicit_realign_optimized;
6620	}
6621      if (!known_alignment_for_access_p (dr_info))
6622	is_packed = not_size_aligned (DR_REF (dr));
6623
6624      if (targetm.vectorize.support_vector_misalignment
6625	    (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6626	/* Can't software pipeline the loads, but can at least do them.  */
6627	return dr_unaligned_supported;
6628    }
6629  else
6630    {
6631      bool is_packed = false;
6632      tree type = (TREE_TYPE (DR_REF (dr)));
6633
6634      if (!known_alignment_for_access_p (dr_info))
6635	is_packed = not_size_aligned (DR_REF (dr));
6636
6637     if (targetm.vectorize.support_vector_misalignment
6638	   (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6639       return dr_unaligned_supported;
6640    }
6641
6642  /* Unsupported.  */
6643  return dr_unaligned_unsupported;
6644}
6645