1/* Loop Vectorization 2 Copyright (C) 2003-2020 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and 4 Ira Rosen <irar@il.ibm.com> 5 6This file is part of GCC. 7 8GCC is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 3, or (at your option) any later 11version. 12 13GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License 19along with GCC; see the file COPYING3. If not see 20<http://www.gnu.org/licenses/>. */ 21 22#include "config.h" 23#include "system.h" 24#include "coretypes.h" 25#include "backend.h" 26#include "target.h" 27#include "rtl.h" 28#include "tree.h" 29#include "gimple.h" 30#include "cfghooks.h" 31#include "tree-pass.h" 32#include "ssa.h" 33#include "optabs-tree.h" 34#include "diagnostic-core.h" 35#include "fold-const.h" 36#include "stor-layout.h" 37#include "cfganal.h" 38#include "gimplify.h" 39#include "gimple-iterator.h" 40#include "gimplify-me.h" 41#include "tree-ssa-loop-ivopts.h" 42#include "tree-ssa-loop-manip.h" 43#include "tree-ssa-loop-niter.h" 44#include "tree-ssa-loop.h" 45#include "cfgloop.h" 46#include "tree-scalar-evolution.h" 47#include "tree-vectorizer.h" 48#include "gimple-fold.h" 49#include "cgraph.h" 50#include "tree-cfg.h" 51#include "tree-if-conv.h" 52#include "internal-fn.h" 53#include "tree-vector-builder.h" 54#include "vec-perm-indices.h" 55#include "tree-eh.h" 56 57/* Loop Vectorization Pass. 58 59 This pass tries to vectorize loops. 

   For example, the vectorizer transforms the following simple loop:

	short a[N]; short b[N]; short c[N]; int i;

	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   as if it was manually vectorized by rewriting the source code into:

	typedef int __attribute__((mode(V8HI))) v8hi;
	short a[N]; short b[N]; short c[N]; int i;
	v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
	v8hi va, vb, vc;

	for (i=0; i<N/8; i++){
	  vb = pb[i];
	  vc = pc[i];
	  va = vb + vc;
	  pa[i] = va;
	}

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:	b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:	a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:	b = x[i];	STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:	a = b;		STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs, are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors, for now will need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations which vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/

/* Forward declarations for routines defined later in this file.  */
static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
					       bool *, bool *);

/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).

   On success, record the statement's vector type (if any) in STMT_INFO
   and raise *VF to at least the number of units of the type chosen to
   represent the statement.  */

static opt_result
vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  /* Statements that are neither relevant nor live do not need a vector
     type and do not constrain the vectorization factor; neither do
     clobbers.  */
  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  /* NUNITS_VECTYPE determines this statement's contribution to the
     vectorization factor.  */
  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}

/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
{
  vec_info *vinfo = stmt_info->vinfo;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
  if (!res)
    return res;

  /* If the statement was replaced by a pattern, also process the pattern
     statement and its def sequence; their vector types may have been set
     already by pattern recognition, hence VECTYPE_MAYBE_SET_P below.  */
  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/

static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* PHIs are handled here directly: a relevant or live PHI
	 contributes the vector type of its result.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      /* Non-PHI statements (and their pattern stmts) are handled by
	 vect_determine_vf_for_stmt.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  /* A factor of (at most) 1 means no parallelism would be gained.  */
  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.

   On success return true and store the initial value of the IV in *INIT
   and its per-iteration step in *STEP.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
		     step_expr, init_expr);

  *init = init_expr;
  *step = step_expr;

  /* Accept only an integer constant step, a loop-invariant SSA name of
     integral type (or of float type when -fassociative-math permits
     reassociation), or a REAL_CST when reassociation is permitted.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}

/* Return true if PHI, described by STMT_INFO, is the inner PHI in
   what we are assuming is a double reduction.  For example, given
   a structure like this:

      outer1:
	x_1 = PHI <x_4(outer2), ...>;
	...

      inner:
	x_2 = PHI <x_1(outer1), ...>;
	...
	x_3 = ...;
	...

      outer2:
	x_4 = PHI <x_3(inner)>;
	...

   outer loop analysis would treat x_1 as a double reduction phi and
   this function would then return true for x_2.  */

static bool
vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  use_operand_p use_p;
  ssa_op_iter op_iter;
  /* PHI is such an inner PHI iff one of its arguments is defined by a
     statement already classified as a double reduction def.  */
  FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
    if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
      if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
	return true;
  return false;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  /* Record the base and evolution of the access function for
	     later use (e.g. when vectorizing the induction).  */
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      /* PHIs not recognized as simple inductions here may still be
	 reductions or nested cycles; queue them for the second pass.  */
      if (!access_fn
	  || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
	  || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
	  || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      && TREE_CODE (step) != INTEGER_CST))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain);
      if (reduc_stmt_info)
	{
	  /* Cross-link the PHI and the reduction statement.  */
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also to its
   inner-loop, if exists.
   Examples for scalar cycles:

   Example1: reduction:

	      loop1:
	      for (i=0; i<N; i++)
		 sum += a[i];

   Example2: induction:

	      loop2:
	      for (i=0; i<N; i++)
		 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
	scalar loop, so we can't change the order of computation when
	vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
	current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  /* FIRSTP, the pattern stmt of the chain head, becomes head of the
     pattern chain.  */
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  /* Walk the original chain, mirroring the FIRST/NEXT links onto the
     corresponding pattern stmts.  */
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}

/* Fixup scalar cycles that now have their stmts detected as patterns.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    if (STMT_VINFO_IN_PATTERN_P (first))
      {
	/* Check whether the whole chain was replaced by patterns with a
	   valid reduction index.  */
	stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
	while (next)
	  {
	    if (! STMT_VINFO_IN_PATTERN_P (next)
		|| STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
	      break;
	    next = REDUC_GROUP_NEXT_ELEMENT (next);
	  }
	/* If not all stmt in the chain are patterns or if we failed
	   to update STMT_VINFO_REDUC_IDX try to handle the chain
	   without patterns.  */
	if (! next
	    && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
	  {
	    vect_fixup_reduc_chain (first);
	    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
	      = STMT_VINFO_RELATED_STMT (first);
	  }
      }
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   Return the loop exit condition.  */


static gcond *
vect_get_loop_niters (class loop *loop, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  edge exit = single_exit (loop);
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;
  gcond *cond = get_loop_exit_condition (loop);

  /* Conservative defaults in case the niter analysis below fails.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;
  DUMP_VECT_SCOPE ("get_loop_niters");

  /* Only single-exit loops are analyzed.  */
  if (!exit)
    return cond;

  may_be_zero = NULL_TREE;
  if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
      || chrec_contains_undetermined (niter_desc.niter))
    return cond;

  niter_assumptions = niter_desc.assumptions;
  may_be_zero = niter_desc.may_be_zero;
  niter = niter_desc.niter;

  if (may_be_zero && integer_zerop (may_be_zero))
    may_be_zero = NULL_TREE;

  if (may_be_zero)
    {
      if (COMPARISON_CLASS_P (may_be_zero))
	{
	  /* Try to combine may_be_zero with assumptions, this can simplify
	     computation of niter expression.  */
	  if (niter_assumptions && !integer_nonzerop (niter_assumptions))
	    niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
					     niter_assumptions,
					     fold_build1 (TRUTH_NOT_EXPR,
							  boolean_type_node,
							  may_be_zero));
	  else
	    niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				 build_int_cst (TREE_TYPE (niter), 0),
				 rewrite_to_non_trapping_overflow (niter));

	  may_be_zero = NULL_TREE;
	}
      else if (integer_nonzerop (may_be_zero))
	{
	  /* The latch is known never to execute: one header execution,
	     zero latch iterations.  */
	  *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	  *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	  return cond;
	}
      else
	return cond;
    }

  *assumptions = niter_assumptions;
  *number_of_iterationsm1 = niter;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ??? For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niter && !chrec_contains_undetermined (niter))
    niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
			 build_int_cst (TREE_TYPE (niter), 1));
  *number_of_iterations = niter;

  return cond;
}

/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const class loop *const loop = (const class loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.
  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, init_cost (loop_in), shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    mask_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    single_scalar_iteration_cost (0),
    vec_outside_cost (0),
    vec_inside_cost (0),
    vectorizable (false),
    can_fully_mask_p (true),
    fully_masked_p (false),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Create a stmt_vec_info for every PHI and statement in the loop.  */
  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}

/* Free all levels of MASKS.  */

void
release_vec_loop_masks (vec_loop_masks *masks)
{
  rgroup_masks *rgm;
  unsigned int i;
  FOR_EACH_VEC_ELT (*masks, i, rgm)
    rgm->masks.release ();
  masks->release ();
}

/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.
*/ 890 891_loop_vec_info::~_loop_vec_info () 892{ 893 free (bbs); 894 895 release_vec_loop_masks (&masks); 896 delete ivexpr_map; 897 delete scan_map; 898 epilogue_vinfos.release (); 899 900 loop->aux = NULL; 901} 902 903/* Return an invariant or register for EXPR and emit necessary 904 computations in the LOOP_VINFO loop preheader. */ 905 906tree 907cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr) 908{ 909 if (is_gimple_reg (expr) 910 || is_gimple_min_invariant (expr)) 911 return expr; 912 913 if (! loop_vinfo->ivexpr_map) 914 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>; 915 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr); 916 if (! cached) 917 { 918 gimple_seq stmts = NULL; 919 cached = force_gimple_operand (unshare_expr (expr), 920 &stmts, true, NULL_TREE); 921 if (stmts) 922 { 923 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 924 gsi_insert_seq_on_edge_immediate (e, stmts); 925 } 926 } 927 return cached; 928} 929 930/* Return true if we can use CMP_TYPE as the comparison type to produce 931 all masks required to mask LOOP_VINFO. */ 932 933static bool 934can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type) 935{ 936 rgroup_masks *rgm; 937 unsigned int i; 938 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 939 if (rgm->mask_type != NULL_TREE 940 && !direct_internal_fn_supported_p (IFN_WHILE_ULT, 941 cmp_type, rgm->mask_type, 942 OPTIMIZE_FOR_SPEED)) 943 return false; 944 return true; 945} 946 947/* Calculate the maximum number of scalars per iteration for every 948 rgroup in LOOP_VINFO. */ 949 950static unsigned int 951vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo) 952{ 953 unsigned int res = 1; 954 unsigned int i; 955 rgroup_masks *rgm; 956 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm) 957 res = MAX (res, rgm->max_nscalars_per_iter); 958 return res; 959} 960 961/* Each statement in LOOP_VINFO can be masked where necessary. 
 Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int min_ni_width;
  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Get the maximum number of iterations that is representable
     in the counter type.  */
  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;

  /* Get a more refined estimate for the number of iterations.  */
  widest_int max_back_edges;
  if (max_loop_iterations (loop, &max_back_edges))
    max_ni = wi::smin (max_ni, max_back_edges + 1);

  /* Account for rgroup masks, in which each bit is replicated N times.  */
  max_ni *= max_nscalars_per_iter;

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width = wi::min_precision (max_ni, UNSIGNED);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  /* If the IV limit is known, work out the minimum precision needed to
     hold it scaled by the per-iteration scalar count.  */
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  /* Try candidate comparison modes from narrowest to widest.  */
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.
*/ 1040 iv_type = this_type; 1041 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type)) 1042 cmp_type = this_type; 1043 if (cmp_bits >= GET_MODE_BITSIZE (Pmode)) 1044 break; 1045 } 1046 } 1047 } 1048 1049 if (!cmp_type) 1050 return false; 1051 1052 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type; 1053 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type; 1054 return true; 1055} 1056 1057/* Calculate the cost of one scalar iteration of the loop. */ 1058static void 1059vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo) 1060{ 1061 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1062 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1063 int nbbs = loop->num_nodes, factor; 1064 int innerloop_iters, i; 1065 1066 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost"); 1067 1068 /* Gather costs for statements in the scalar loop. */ 1069 1070 /* FORNOW. */ 1071 innerloop_iters = 1; 1072 if (loop->inner) 1073 innerloop_iters = 50; /* FIXME */ 1074 1075 for (i = 0; i < nbbs; i++) 1076 { 1077 gimple_stmt_iterator si; 1078 basic_block bb = bbs[i]; 1079 1080 if (bb->loop_father == loop->inner) 1081 factor = innerloop_iters; 1082 else 1083 factor = 1; 1084 1085 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1086 { 1087 gimple *stmt = gsi_stmt (si); 1088 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt); 1089 1090 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt)) 1091 continue; 1092 1093 /* Skip stmts that are not vectorized inside the loop. 
	     */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement as a load, store, no-op conversion
	     (free) or generic scalar statement.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* NOTE(review): costs are recorded under vect_prologue here but
	     accumulated as vect_body below; the where-slot looks unused for
	     the scalar-iteration cost — confirm before relying on it.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  void *target_cost_data = init_cost (loop);
  stmt_info_for_cost *si;
  int j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
		    j, si)
    (void) add_stmt_cost (target_cost_data, si->count,
			  si->kind, si->stmt_info, si->misalign,
			  vect_body);
  unsigned dummy, body_cost = 0;
  finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
  destroy_cost_data (target_cost_data);
  LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
}


/* Function vect_analyze_loop_form_1.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.

   On success, *LOOP_COND is the exit condition, *ASSUMPTIONS the
   conditions under which the niter analysis holds, and
   *NUMBER_OF_ITERATIONS / *NUMBER_OF_ITERATIONSM1 the iteration counts.
   For nested loops, *INNER_LOOP_COND receives the inner loop's exit
   condition.  */

opt_result
vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
			  tree *assumptions, tree *number_of_iterationsm1,
			  tree *number_of_iterations, gcond **inner_loop_cond)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
	 exactly 2 (the header and latch).  Vectorizable inner-most loops
	 look like this:

			(pre-header)
			   |
			  header <--------+
			   | |            |
			   | +--> latch --+
			   |
			(exit-bb)  */

      if (loop->num_nodes != 2)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop, and the number of BBs is exactly 5.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      if (loop->num_nodes != 5)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " control flow in loop.\n");

      /* The inner loop must be entered from the outer-loop header and
	 exit to the block feeding the outer-loop latch.  */
      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      tree inner_niterm1, inner_niter, inner_assumptions;
      opt_result res
	= vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
				    &inner_assumptions, &inner_niterm1,
				    &inner_niter, NULL);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner_assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner_niter))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop))
    return opt_result::failure_at (vect_location,
				   "not vectorized: multiple exits.\n");
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the loop exit condition is at the end of the loop. i.e,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure the exit is not abnormal.  */
  edge e = single_exit (loop);
  if (e->flags & EDGE_ABNORMAL)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " abnormal loop exit edge.\n");

  *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
				     number_of_iterationsm1);
  if (!*loop_cond)
    return opt_result::failure_at
      (vect_location,
       "not vectorized: complicated exit condition.\n");

  if (integer_zerop (*assumptions)
      || !*number_of_iterations
      || chrec_contains_undetermined (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (*number_of_iterations))
    return opt_result::failure_at
      (*loop_cond,
       "not vectorized: number of iterations = 0.\n");

  return opt_result::success ();
}

/* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */

opt_loop_vec_info
vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
{
  tree assumptions, number_of_iterations, number_of_iterationsm1;
  gcond *loop_cond, *inner_loop_cond = NULL;

  opt_result res
    = vect_analyze_loop_form_1 (loop, &loop_cond,
				&assumptions, &number_of_iterationsm1,
				&number_of_iterations, &inner_loop_cond);
  if (!res)
    return opt_loop_vec_info::propagate_failure (res);

  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
  if (!integer_onep (assumptions))
    {
      /* We consider to vectorize this loop by versioning it under
	 some assumptions.  In order to do this, we need to clear
	 existing information computed by scev and niter analyzer.
	 */
      scev_reset_htab ();
      free_numbers_of_iterations_estimates (loop);
      /* Also set flag for this loop so that following scev and niter
	 analysis are done under the assumptions.  */
      loop_constraint_set (loop, LOOP_C_FINITE);
      /* Also record the assumptions for versioning.  */
      LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  /* Mark the exit conditions so later analysis treats them as loop-exit
     control rather than ordinary vectorizable statements.  */
  stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
  STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
  if (inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
    }

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return opt_loop_vec_info::success (loop_vinfo);
}



/* Scan the loop stmts and dependent on whether there are any (non-)SLP
   statements update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say, that we
     perform pure SLP on loop - cross iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      /* First check the PHI nodes of this block.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
	  if (!stmt_info)
	    continue;
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
      /* Then the ordinary statements.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  stmt_info = vect_stmt_to_vectorize (stmt_info);
	  if ((STMT_VINFO_RELEVANT_P (stmt_info)
	       || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
	      && !PURE_SLP_STMT (stmt_info))
	    /* STMT needs both SLP and loop-based vectorization.  */
	    only_slp_in_loop = false;
	}
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
	 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
	 so they must have a common multiple.  */
      vectorization_factor
	= force_common_multiple (vectorization_factor,
				 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
	x_1 = PHI <x_3(outer2), ...>;
	...

      inner:
	x_2 = ...;
	...

      outer2:
	x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.
*/ 1464 1465static opt_result 1466vect_analyze_loop_operations (loop_vec_info loop_vinfo) 1467{ 1468 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1469 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 1470 int nbbs = loop->num_nodes; 1471 int i; 1472 stmt_vec_info stmt_info; 1473 bool need_to_vectorize = false; 1474 bool ok; 1475 1476 DUMP_VECT_SCOPE ("vect_analyze_loop_operations"); 1477 1478 auto_vec<stmt_info_for_cost> cost_vec; 1479 1480 for (i = 0; i < nbbs; i++) 1481 { 1482 basic_block bb = bbs[i]; 1483 1484 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 1485 gsi_next (&si)) 1486 { 1487 gphi *phi = si.phi (); 1488 ok = true; 1489 1490 stmt_info = loop_vinfo->lookup_stmt (phi); 1491 if (dump_enabled_p ()) 1492 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi); 1493 if (virtual_operand_p (gimple_phi_result (phi))) 1494 continue; 1495 1496 /* Inner-loop loop-closed exit phi in outer-loop vectorization 1497 (i.e., a phi in the tail of the outer-loop). */ 1498 if (! is_loop_header_bb_p (bb)) 1499 { 1500 /* FORNOW: we currently don't support the case that these phis 1501 are not used in the outerloop (unless it is double reduction, 1502 i.e., this phi is vect_reduction_def), cause this case 1503 requires to actually do something here. */ 1504 if (STMT_VINFO_LIVE_P (stmt_info) 1505 && !vect_active_double_reduction_p (stmt_info)) 1506 return opt_result::failure_at (phi, 1507 "Unsupported loop-closed phi" 1508 " in outer-loop.\n"); 1509 1510 /* If PHI is used in the outer loop, we check that its operand 1511 is defined in the inner loop. 
*/ 1512 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1513 { 1514 tree phi_op; 1515 1516 if (gimple_phi_num_args (phi) != 1) 1517 return opt_result::failure_at (phi, "unsupported phi"); 1518 1519 phi_op = PHI_ARG_DEF (phi, 0); 1520 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op); 1521 if (!op_def_info) 1522 return opt_result::failure_at (phi, "unsupported phi\n"); 1523 1524 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer 1525 && (STMT_VINFO_RELEVANT (op_def_info) 1526 != vect_used_in_outer_by_reduction)) 1527 return opt_result::failure_at (phi, "unsupported phi\n"); 1528 1529 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def 1530 || (STMT_VINFO_DEF_TYPE (stmt_info) 1531 == vect_double_reduction_def)) 1532 && !vectorizable_lc_phi (stmt_info, NULL, NULL)) 1533 return opt_result::failure_at (phi, "unsupported phi\n"); 1534 } 1535 1536 continue; 1537 } 1538 1539 gcc_assert (stmt_info); 1540 1541 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope 1542 || STMT_VINFO_LIVE_P (stmt_info)) 1543 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 1544 /* A scalar-dependence cycle that we don't support. */ 1545 return opt_result::failure_at (phi, 1546 "not vectorized:" 1547 " scalar dependence cycle.\n"); 1548 1549 if (STMT_VINFO_RELEVANT_P (stmt_info)) 1550 { 1551 need_to_vectorize = true; 1552 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 1553 && ! PURE_SLP_STMT (stmt_info)) 1554 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL, 1555 &cost_vec); 1556 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1557 || (STMT_VINFO_DEF_TYPE (stmt_info) 1558 == vect_double_reduction_def) 1559 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 1560 && ! PURE_SLP_STMT (stmt_info)) 1561 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec); 1562 } 1563 1564 /* SLP PHIs are tested by vect_slp_analyze_node_operations. 
*/ 1565 if (ok 1566 && STMT_VINFO_LIVE_P (stmt_info) 1567 && !PURE_SLP_STMT (stmt_info)) 1568 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL, 1569 -1, false, &cost_vec); 1570 1571 if (!ok) 1572 return opt_result::failure_at (phi, 1573 "not vectorized: relevant phi not " 1574 "supported: %G", 1575 static_cast <gimple *> (phi)); 1576 } 1577 1578 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si); 1579 gsi_next (&si)) 1580 { 1581 gimple *stmt = gsi_stmt (si); 1582 if (!gimple_clobber_p (stmt)) 1583 { 1584 opt_result res 1585 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt), 1586 &need_to_vectorize, 1587 NULL, NULL, &cost_vec); 1588 if (!res) 1589 return res; 1590 } 1591 } 1592 } /* bbs */ 1593 1594 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec); 1595 1596 /* All operations in the loop are either irrelevant (deal with loop 1597 control, or dead), or only used outside the loop and can be moved 1598 out of the loop (e.g. invariants, inductions). The loop can be 1599 optimized away by scalar optimizations. We're better off not 1600 touching this loop. */ 1601 if (!need_to_vectorize) 1602 { 1603 if (dump_enabled_p ()) 1604 dump_printf_loc (MSG_NOTE, vect_location, 1605 "All the computation can be taken out of the loop.\n"); 1606 return opt_result::failure_at 1607 (vect_location, 1608 "not vectorized: redundant loop. no profit to vectorize.\n"); 1609 } 1610 1611 return opt_result::success (); 1612} 1613 1614/* Analyze the cost of the loop described by LOOP_VINFO. Decide if it 1615 is worthwhile to vectorize. Return 1 if definitely yes, 0 if 1616 definitely no, or -1 if it's worth retrying. */ 1617 1618static int 1619vect_analyze_loop_costing (loop_vec_info loop_vinfo) 1620{ 1621 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1622 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 1623 1624 /* Only fully-masked loops can have iteration counts less than the 1625 vectorization factor. 
     */
  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      HOST_WIDE_INT max_niter;

      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
	max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
      else
	max_niter = max_stmt_executions_int (loop);

      /* max_niter == -1 means no usable bound is known.  */
      if (max_niter != -1
	  && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: iteration count smaller than "
			     "vectorization factor.\n");
	  return 0;
	}
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate);

  /* A negative value means the vector loop can never be profitable.  */
  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitablity threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n");
      return -1;
    }

  return 1;
}

/* Walk all statements of LOOP (whose blocks are given by BBS), record
   every data reference found into DATAREFS and the number of analyzed
   statements into *N_STMTS.  Calls to simd clones without data
   references in the call itself are tolerated when the loop has a
   safelen; fail if the dataref limit for dependence analysis would be
   exceeded.  */

static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
			   vec<data_reference_p> *datarefs,
			   unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (is_gimple_debug (stmt))
	  continue;
	++(*n_stmts);
	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
	if (!res)
	  {
	    if (is_gimple_call (stmt) && loop->safelen)
	      {
		tree fndecl = gimple_call_fndecl (stmt), op;
		if (fndecl != NULL_TREE)
		  {
		    cgraph_node *node = cgraph_node::get (fndecl);
		    if (node != NULL && node->simd_clones != NULL)
		      {
			/* Look for an argument that is itself a data
			   reference; if there is one the call cannot be
			   ignored.  */
			unsigned int j, n = gimple_call_num_args (stmt);
			for (j = 0; j < n; j++)
			  {
			    op = gimple_call_arg (stmt, j);
			    if (DECL_P (op)
				|| (REFERENCE_CLASS_P (op)
				    && get_base_address (op)))
			      break;
			  }
			op = gimple_call_lhs (stmt);
			/* Ignore #pragma omp declare simd functions
			   if they don't have data references in the
			   call stmt itself.  */
			if (j == n
			    && !(op
				 && (DECL_P (op)
				     || (REFERENCE_CLASS_P (op)
					 && get_base_address (op)))))
			  continue;
		      }
		  }
	      }
	    return res;
	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs stop here and fail fatally.  */
	if (datarefs->length ()
	    > (unsigned)param_loop_max_datarefs_for_datadeps)
	  return opt_result::failure_at (stmt, "exceeded param "
					 "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}

/* Look for SLP-only access groups and turn each individual access into its own
   group.  */
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
	  unsigned int group_size = DR_GROUP_SIZE (first_element);

	  /* Check if SLP-only groups.  */
	  if (!STMT_SLP_TYPE (stmt_info)
	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
	    {
	      /* Dissolve the group: make each member its own singleton
		 group with a gap covering the rest of the original
		 group's stride (strided accesses keep a zero gap).  */
	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

	      stmt_vec_info vinfo = first_element;
	      while (vinfo)
		{
		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		  DR_GROUP_SIZE (vinfo) = 1;
		  if (STMT_VINFO_STRIDED_P (first_element))
		    DR_GROUP_GAP (vinfo) = 0;
		  else
		    DR_GROUP_GAP (vinfo) = group_size - 1;
		  vinfo = next;
		}
	    }
	}
    }
}


/* Decides whether we need to create an epilogue loop to handle
   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */

void
determine_peel_for_niter (loop_vec_info loop_vinfo)
{
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;

  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  /* For an epilogue loop without its own threshold, inherit the
     threshold of the loop it was split from.  */
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    /* The main loop handles all iterations.  */
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	   && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       > (th / const_vf) * const_vf))))
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
}


/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
{
  opt_result ok = opt_result::success ();
  int res;
  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
  poly_uint64 min_vf = 2;
  loop_vec_info orig_loop_vinfo = NULL;

  /* If we are dealing with an epilogue then orig_loop_vinfo points to the
     loop_vec_info of the first vectorized loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  else
    orig_loop_vinfo = loop_vinfo;
  gcc_assert (orig_loop_vinfo);

  /* The first group of checks is independent of the vector size.
*/ 1918 fatal = true; 1919 1920 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo) 1921 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo))) 1922 return opt_result::failure_at (vect_location, 1923 "not vectorized: simd if(0)\n"); 1924 1925 /* Find all data references in the loop (which correspond to vdefs/vuses) 1926 and analyze their evolution in the loop. */ 1927 1928 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo); 1929 1930 /* Gather the data references and count stmts in the loop. */ 1931 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ()) 1932 { 1933 opt_result res 1934 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo), 1935 &LOOP_VINFO_DATAREFS (loop_vinfo), 1936 n_stmts); 1937 if (!res) 1938 { 1939 if (dump_enabled_p ()) 1940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1941 "not vectorized: loop contains function " 1942 "calls or data references that cannot " 1943 "be analyzed\n"); 1944 return res; 1945 } 1946 loop_vinfo->shared->save_datarefs (); 1947 } 1948 else 1949 loop_vinfo->shared->check_datarefs (); 1950 1951 /* Analyze the data references and also adjust the minimal 1952 vectorization factor according to the loads and stores. */ 1953 1954 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal); 1955 if (!ok) 1956 { 1957 if (dump_enabled_p ()) 1958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1959 "bad data references.\n"); 1960 return ok; 1961 } 1962 1963 /* Classify all cross-iteration scalar data-flow cycles. 1964 Cross-iteration cycles caused by virtual phis are analyzed separately. */ 1965 vect_analyze_scalar_cycles (loop_vinfo); 1966 1967 vect_pattern_recog (loop_vinfo); 1968 1969 vect_fixup_scalar_cycles_with_patterns (loop_vinfo); 1970 1971 /* Analyze the access patterns of the data-refs in the loop (consecutive, 1972 complex, etc.). FORNOW: Only handle consecutive access pattern. 
*/ 1973 1974 ok = vect_analyze_data_ref_accesses (loop_vinfo); 1975 if (!ok) 1976 { 1977 if (dump_enabled_p ()) 1978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1979 "bad data access.\n"); 1980 return ok; 1981 } 1982 1983 /* Data-flow analysis to detect stmts that do not need to be vectorized. */ 1984 1985 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal); 1986 if (!ok) 1987 { 1988 if (dump_enabled_p ()) 1989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1990 "unexpected pattern.\n"); 1991 return ok; 1992 } 1993 1994 /* While the rest of the analysis below depends on it in some way. */ 1995 fatal = false; 1996 1997 /* Analyze data dependences between the data-refs in the loop 1998 and adjust the maximum vectorization factor according to 1999 the dependences. 2000 FORNOW: fail at the first data dependence that we encounter. */ 2001 2002 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf); 2003 if (!ok) 2004 { 2005 if (dump_enabled_p ()) 2006 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2007 "bad data dependence.\n"); 2008 return ok; 2009 } 2010 if (max_vf != MAX_VECTORIZATION_FACTOR 2011 && maybe_lt (max_vf, min_vf)) 2012 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 2013 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf; 2014 2015 ok = vect_determine_vectorization_factor (loop_vinfo); 2016 if (!ok) 2017 { 2018 if (dump_enabled_p ()) 2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2020 "can't determine vectorization factor.\n"); 2021 return ok; 2022 } 2023 if (max_vf != MAX_VECTORIZATION_FACTOR 2024 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo))) 2025 return opt_result::failure_at (vect_location, "bad data dependence.\n"); 2026 2027 /* Compute the scalar iteration cost. 
*/ 2028 vect_compute_single_scalar_iteration_cost (loop_vinfo); 2029 2030 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2031 2032 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ 2033 ok = vect_analyze_slp (loop_vinfo, *n_stmts); 2034 if (!ok) 2035 return ok; 2036 2037 /* If there are any SLP instances mark them as pure_slp. */ 2038 bool slp = vect_make_slp_decision (loop_vinfo); 2039 if (slp) 2040 { 2041 /* Find stmts that need to be both vectorized and SLPed. */ 2042 vect_detect_hybrid_slp (loop_vinfo); 2043 2044 /* Update the vectorization factor based on the SLP decision. */ 2045 vect_update_vf_for_slp (loop_vinfo); 2046 } 2047 2048 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); 2049 2050 /* We don't expect to have to roll back to anything other than an empty 2051 set of rgroups. */ 2052 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ()); 2053 2054 /* This is the point where we can re-start analysis with SLP forced off. */ 2055start_over: 2056 2057 /* Now the vectorization factor is final. */ 2058 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2059 gcc_assert (known_ne (vectorization_factor, 0U)); 2060 2061 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) 2062 { 2063 dump_printf_loc (MSG_NOTE, vect_location, 2064 "vectorization_factor = "); 2065 dump_dec (MSG_NOTE, vectorization_factor); 2066 dump_printf (MSG_NOTE, ", niters = %wd\n", 2067 LOOP_VINFO_INT_NITERS (loop_vinfo)); 2068 } 2069 2070 /* Analyze the alignment of the data-refs in the loop. 2071 Fail if a data reference is found that cannot be vectorized. */ 2072 2073 ok = vect_analyze_data_refs_alignment (loop_vinfo); 2074 if (!ok) 2075 { 2076 if (dump_enabled_p ()) 2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2078 "bad data alignment.\n"); 2079 return ok; 2080 } 2081 2082 /* Prune the list of ddrs to be tested at run-time by versioning for alias. 
2083 It is important to call pruning after vect_analyze_data_ref_accesses, 2084 since we use grouping information gathered by interleaving analysis. */ 2085 ok = vect_prune_runtime_alias_test_list (loop_vinfo); 2086 if (!ok) 2087 return ok; 2088 2089 /* Do not invoke vect_enhance_data_refs_alignment for epilogue 2090 vectorization, since we do not want to add extra peeling or 2091 add versioning for alignment. */ 2092 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 2093 /* This pass will decide on using loop versioning and/or loop peeling in 2094 order to enhance the alignment of data references in the loop. */ 2095 ok = vect_enhance_data_refs_alignment (loop_vinfo); 2096 else 2097 ok = vect_verify_datarefs_alignment (loop_vinfo); 2098 if (!ok) 2099 return ok; 2100 2101 if (slp) 2102 { 2103 /* Analyze operations in the SLP instances. Note this may 2104 remove unsupported SLP instances which makes the above 2105 SLP kind detection invalid. */ 2106 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length (); 2107 vect_slp_analyze_operations (loop_vinfo); 2108 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size) 2109 { 2110 ok = opt_result::failure_at (vect_location, 2111 "unsupported SLP instances\n"); 2112 goto again; 2113 } 2114 } 2115 2116 /* Dissolve SLP-only groups. */ 2117 vect_dissolve_slp_only_groups (loop_vinfo); 2118 2119 /* Scan all the remaining operations in the loop that are not subject 2120 to SLP and make sure they are vectorizable. */ 2121 ok = vect_analyze_loop_operations (loop_vinfo); 2122 if (!ok) 2123 { 2124 if (dump_enabled_p ()) 2125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2126 "bad operation or unsupported loop bound.\n"); 2127 return ok; 2128 } 2129 2130 /* Decide whether to use a fully-masked loop for this vectorization 2131 factor. 
*/ 2132 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 2133 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) 2134 && vect_verify_full_masking (loop_vinfo)); 2135 if (dump_enabled_p ()) 2136 { 2137 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2138 dump_printf_loc (MSG_NOTE, vect_location, 2139 "using a fully-masked loop.\n"); 2140 else 2141 dump_printf_loc (MSG_NOTE, vect_location, 2142 "not using a fully-masked loop.\n"); 2143 } 2144 2145 /* If epilog loop is required because of data accesses with gaps, 2146 one additional iteration needs to be peeled. Check if there is 2147 enough iterations for vectorization. */ 2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2149 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2150 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2151 { 2152 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2153 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo); 2154 2155 if (known_lt (wi::to_widest (scalar_niters), vf)) 2156 return opt_result::failure_at (vect_location, 2157 "loop has no enough iterations to" 2158 " support peeling for gaps.\n"); 2159 } 2160 2161 /* If we're vectorizing an epilogue loop, we either need a fully-masked 2162 loop or a loop that has a lower VF than the main loop. */ 2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) 2164 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 2165 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 2166 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo))) 2167 return opt_result::failure_at (vect_location, 2168 "Vectorization factor too high for" 2169 " epilogue loop.\n"); 2170 2171 /* Check the costings of the loop make vectorizing worthwhile. 
*/ 2172 res = vect_analyze_loop_costing (loop_vinfo); 2173 if (res < 0) 2174 { 2175 ok = opt_result::failure_at (vect_location, 2176 "Loop costings may not be worthwhile.\n"); 2177 goto again; 2178 } 2179 if (!res) 2180 return opt_result::failure_at (vect_location, 2181 "Loop costings not worthwhile.\n"); 2182 2183 determine_peel_for_niter (loop_vinfo); 2184 /* If an epilogue loop is required make sure we can create one. */ 2185 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) 2186 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2187 { 2188 if (dump_enabled_p ()) 2189 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n"); 2190 if (!vect_can_advance_ivs_p (loop_vinfo) 2191 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo), 2192 single_exit (LOOP_VINFO_LOOP 2193 (loop_vinfo)))) 2194 { 2195 ok = opt_result::failure_at (vect_location, 2196 "not vectorized: can't create required " 2197 "epilog loop\n"); 2198 goto again; 2199 } 2200 } 2201 2202 /* During peeling, we need to check if number of loop iterations is 2203 enough for both peeled prolog loop and vector loop. This check 2204 can be merged along with threshold check of loop versioning, so 2205 increase threshold for this case if necessary. 2206 2207 If we are analyzing an epilogue we still want to check what its 2208 versioning threshold would be. If we decide to vectorize the epilogues we 2209 will want to use the lowest versioning threshold of all epilogues and main 2210 loop. This will enable us to enter a vectorized epilogue even when 2211 versioning the loop. We can't simply check whether the epilogue requires 2212 versioning though since we may have skipped some versioning checks when 2213 analyzing the epilogue. For instance, checks for alias versioning will be 2214 skipped when dealing with epilogues as we assume we already checked them 2215 for the main loop. So instead we always check the 'orig_loop_vinfo'. 
*/ 2216 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) 2217 { 2218 poly_uint64 niters_th = 0; 2219 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 2220 2221 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2222 { 2223 /* Niters for peeled prolog loop. */ 2224 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 2225 { 2226 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2227 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); 2228 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1; 2229 } 2230 else 2231 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2232 } 2233 2234 /* Niters for at least one iteration of vectorized loop. */ 2235 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 2236 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2237 /* One additional iteration because of peeling for gap. */ 2238 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2239 niters_th += 1; 2240 2241 /* Use the same condition as vect_transform_loop to decide when to use 2242 the cost to determine a versioning threshold. */ 2243 if (vect_apply_runtime_profitability_check_p (loop_vinfo) 2244 && ordered_p (th, niters_th)) 2245 niters_th = ordered_max (poly_uint64 (th), niters_th); 2246 2247 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th; 2248 } 2249 2250 gcc_assert (known_eq (vectorization_factor, 2251 LOOP_VINFO_VECT_FACTOR (loop_vinfo))); 2252 2253 /* Ok to vectorize! */ 2254 return opt_result::success (); 2255 2256again: 2257 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */ 2258 gcc_assert (!ok); 2259 2260 /* Try again with SLP forced off but if we didn't do any SLP there is 2261 no point in re-trying. */ 2262 if (!slp) 2263 return ok; 2264 2265 /* If there are reduction chains re-trying will fail anyway. */ 2266 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()) 2267 return ok; 2268 2269 /* Likewise if the grouped loads or stores in the SLP cannot be handled 2270 via interleaving or lane instructions. 
*/ 2271 slp_instance instance; 2272 slp_tree node; 2273 unsigned i, j; 2274 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 2275 { 2276 stmt_vec_info vinfo; 2277 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]; 2278 if (! STMT_VINFO_GROUPED_ACCESS (vinfo)) 2279 continue; 2280 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2281 unsigned int size = DR_GROUP_SIZE (vinfo); 2282 tree vectype = STMT_VINFO_VECTYPE (vinfo); 2283 if (! vect_store_lanes_supported (vectype, size, false) 2284 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U) 2285 && ! vect_grouped_store_supported (vectype, size)) 2286 return opt_result::failure_at (vinfo->stmt, 2287 "unsupported grouped store\n"); 2288 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node) 2289 { 2290 vinfo = SLP_TREE_SCALAR_STMTS (node)[0]; 2291 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo); 2292 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo); 2293 size = DR_GROUP_SIZE (vinfo); 2294 vectype = STMT_VINFO_VECTYPE (vinfo); 2295 if (! vect_load_lanes_supported (vectype, size, false) 2296 && ! vect_grouped_load_supported (vectype, single_element_p, 2297 size)) 2298 return opt_result::failure_at (vinfo->stmt, 2299 "unsupported grouped load\n"); 2300 } 2301 } 2302 2303 if (dump_enabled_p ()) 2304 dump_printf_loc (MSG_NOTE, vect_location, 2305 "re-trying with SLP disabled\n"); 2306 2307 /* Roll back state appropriately. No SLP this time. */ 2308 slp = false; 2309 /* Restore vectorization factor as it were without SLP. */ 2310 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor; 2311 /* Free the SLP instances. */ 2312 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance) 2313 vect_free_slp_instance (instance, false); 2314 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 2315 /* Reset SLP type to loop_vect on all stmts. 
*/ 2316 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i) 2317 { 2318 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2319 for (gimple_stmt_iterator si = gsi_start_phis (bb); 2320 !gsi_end_p (si); gsi_next (&si)) 2321 { 2322 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2323 STMT_SLP_TYPE (stmt_info) = loop_vect; 2324 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 2325 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 2326 { 2327 /* vectorizable_reduction adjusts reduction stmt def-types, 2328 restore them to that of the PHI. */ 2329 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info)) 2330 = STMT_VINFO_DEF_TYPE (stmt_info); 2331 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize 2332 (STMT_VINFO_REDUC_DEF (stmt_info))) 2333 = STMT_VINFO_DEF_TYPE (stmt_info); 2334 } 2335 } 2336 for (gimple_stmt_iterator si = gsi_start_bb (bb); 2337 !gsi_end_p (si); gsi_next (&si)) 2338 { 2339 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si)); 2340 STMT_SLP_TYPE (stmt_info) = loop_vect; 2341 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 2342 { 2343 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 2344 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 2345 STMT_SLP_TYPE (stmt_info) = loop_vect; 2346 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq); 2347 !gsi_end_p (pi); gsi_next (&pi)) 2348 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi))) 2349 = loop_vect; 2350 } 2351 } 2352 } 2353 /* Free optimized alias test DDRS. */ 2354 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0); 2355 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release (); 2356 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release (); 2357 /* Reset target cost data. */ 2358 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)); 2359 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo) 2360 = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); 2361 /* Reset accumulated rgroup information. 
*/ 2362 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo)); 2363 /* Reset assorted flags. */ 2364 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; 2365 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; 2366 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; 2367 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0; 2368 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p; 2369 2370 goto start_over; 2371} 2372 2373/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears 2374 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that 2375 OLD_LOOP_VINFO is better unless something specifically indicates 2376 otherwise. 2377 2378 Note that this deliberately isn't a partial order. */ 2379 2380static bool 2381vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo, 2382 loop_vec_info old_loop_vinfo) 2383{ 2384 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo); 2385 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop); 2386 2387 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo); 2388 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo); 2389 2390 /* Always prefer a VF of loop->simdlen over any other VF. */ 2391 if (loop->simdlen) 2392 { 2393 bool new_simdlen_p = known_eq (new_vf, loop->simdlen); 2394 bool old_simdlen_p = known_eq (old_vf, loop->simdlen); 2395 if (new_simdlen_p != old_simdlen_p) 2396 return new_simdlen_p; 2397 } 2398 2399 /* Limit the VFs to what is likely to be the maximum number of iterations, 2400 to handle cases in which at least one loop_vinfo is fully-masked. */ 2401 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop); 2402 if (estimated_max_niter != -1) 2403 { 2404 if (known_le (estimated_max_niter, new_vf)) 2405 new_vf = estimated_max_niter; 2406 if (known_le (estimated_max_niter, old_vf)) 2407 old_vf = estimated_max_niter; 2408 } 2409 2410 /* Check whether the (fractional) cost per scalar iteration is lower 2411 or higher: new_inside_cost / new_vf vs. 
old_inside_cost / old_vf.  The comparison is done with the
     cross-multiplied products so no division is needed.  */
  poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
			     * poly_widest_int (old_vf));
  poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
			     * poly_widest_int (new_vf));
  if (maybe_lt (rel_old, rel_new))
    {
      /* When old_loop_vinfo uses a variable vectorization factor,
	 we know that it has a lower cost for at least one runtime VF.
	 However, we don't know how likely that VF is.

	 One option would be to compare the costs for the estimated VFs.
	 The problem is that that can put too much pressure on the cost
	 model.  E.g. if the estimated VF is also the lowest possible VF,
	 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
	 for the estimated VF, we'd then choose new_loop_vinfo even
	 though (a) new_loop_vinfo might not actually be better than
	 old_loop_vinfo for that VF and (b) it would be significantly
	 worse at larger VFs.

	 Here we go for a hacky compromise: pick new_loop_vinfo if it is
	 no more expensive than old_loop_vinfo even after doubling the
	 estimated old_loop_vinfo VF.  For all but trivial loops, this
	 ensures that we only pick new_loop_vinfo if it is significantly
	 better than old_loop_vinfo at the estimated VF.  */
      if (rel_new.is_constant ())
	return false;

      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
				      * widest_int (old_estimated_vf));
      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
				      * widest_int (new_estimated_vf));
      return estimated_rel_new * 2 <= estimated_rel_old;
    }
  if (known_lt (rel_new, rel_old))
    return true;

  /* If there's nothing to choose between the loop bodies, see whether
     there's a difference in the prologue and epilogue costs.  */
  if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
    return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;

  return false;
}

/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
			loop_vec_info old_loop_vinfo)
{
  /* The comparison itself lives in vect_better_loop_vinfo_p; ties are
     won by OLD_LOOP_VINFO by construction.  */
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Preferring vector mode %s to vector mode %s\n",
		     GET_MODE_NAME (new_loop_vinfo->vector_mode),
		     GET_MODE_NAME (old_loop_vinfo->vector_mode));
  return true;
}

/* If LOOP_VINFO is already a main loop, return it unmodified.  Otherwise
   try to reanalyze it as a main loop.  Return the loop_vinfo on success
   and null on failure.  */

static loop_vec_info
vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
{
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    return loop_vinfo;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Reanalyzing as a main loop with vector mode %s\n",
		     GET_MODE_NAME (loop_vinfo->vector_mode));

  /* Build a fresh loop_vec_info (sharing the cached datarefs etc.) and
     re-run the full analysis with the same vector mode, but this time
     without an ORIG_LOOP_INFO, i.e. as a main loop.  */
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  vec_info_shared *shared = loop_vinfo->shared;
  opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
  gcc_assert (main_loop_vinfo);

  main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;

  bool fatal = false;
  bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
  /* Clear the scratch pointer the analysis leaves in loop->aux;
     vect_analyze_loop does the same after each attempt.  */
  loop->aux = NULL;
  if (!res)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Failed to analyze main loop with vector"
			 " mode %s\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
      delete main_loop_vinfo;
      return NULL;
    }
  LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
  return main_loop_vinfo;
}

/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  Iterates over the vector modes suggested by
   the target, analyzing the loop with each, and returns the chosen
   loop_vec_info (possibly with vectorized epilogue loop_vec_infos
   attached) or a failure.  */
opt_loop_vec_info
vect_analyze_loop (class loop *loop, vec_info_shared *shared)
{
  auto_vector_modes vector_modes;

  /* Autodetect first vector size we try.  */
  unsigned int autovec_flags
    = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
						    loop->simdlen != 0);
  unsigned int mode_i = 0;

  DUMP_VECT_SCOPE ("analyze_loop_nest");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
    return opt_loop_vec_info::failure_at (vect_location,
					  "outer-loop already vectorized.\n");

  if (!find_loop_nest (loop, &shared->loop_nest))
    return opt_loop_vec_info::failure_at
	(vect_location,
	 "not vectorized: loop nest containing two or more consecutive inner"
	 " loops cannot be vectorized\n");

  unsigned n_stmts = 0;
  machine_mode autodetected_vector_mode = VOIDmode;
  opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
  machine_mode next_vector_mode = VOIDmode;
  poly_uint64 lowest_th = 0;
  unsigned vectorized_loops = 0;
  /* Whether to compare the costs of all successful attempts and keep
     the cheapest, rather than committing to the first success.  */
  bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
			     && !unlimited_cost_model (loop));

  bool vect_epilogues = false;
  opt_result res = opt_result::success ();
  unsigned HOST_WIDE_INT simdlen = loop->simdlen;
  while (1)
    {
      /* Check the CFG characteristics of the loop (nesting, entry/exit).
       */
      opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
      if (!loop_vinfo)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "bad loop form.\n");
	  gcc_checking_assert (first_loop_vinfo == NULL);
	  return loop_vinfo;
	}
      loop_vinfo->vector_mode = next_vector_mode;

      bool fatal = false;

      /* When pick_lowest_cost_p is true, we should in principle iterate
	 over all the loop_vec_infos that LOOP_VINFO could replace and
	 try to vectorize LOOP_VINFO under the same conditions.
	 E.g. when trying to replace an epilogue loop, we should vectorize
	 LOOP_VINFO as an epilogue loop with the same VF limit.  When trying
	 to replace the main loop, we should vectorize LOOP_VINFO as a main
	 loop too.

	 However, autovectorize_vector_modes is usually sorted as follows:

	 - Modes that naturally produce lower VFs usually follow modes that
	   naturally produce higher VFs.

	 - When modes naturally produce the same VF, maskable modes
	   usually follow unmaskable ones, so that the maskable mode
	   can be used to vectorize the epilogue of the unmaskable mode.

	 This order is preferred because it leads to the maximum
	 epilogue vectorization opportunities.  Targets should only use
	 a different order if they want to make wide modes available while
	 disparaging them relative to earlier, smaller modes.  The assumption
	 in that case is that the wider modes are more expensive in some
	 way that isn't reflected directly in the costs.

	 There should therefore be few interesting cases in which
	 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
	 treated as a standalone loop, and ends up being genuinely cheaper
	 than FIRST_LOOP_VINFO.  */
      if (vect_epilogues)
	LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;

      res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
      /* Remember the mode the first (autodetected) attempt settled on;
	 later attempts are compared against it to skip duplicates.  */
      if (mode_i == 0)
	autodetected_vector_mode = loop_vinfo->vector_mode;
      if (dump_enabled_p ())
	{
	  if (res)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis succeeded with vector mode %s\n",
			     GET_MODE_NAME (loop_vinfo->vector_mode));
	  else
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Analysis failed with vector mode %s\n",
			     GET_MODE_NAME (loop_vinfo->vector_mode));
	}

      loop->aux = NULL;

      /* Skip upcoming modes that would make exactly the same choices as
	 this attempt (unless the failure was fatal, in which case we stop
	 anyway).  */
      if (!fatal)
	while (mode_i < vector_modes.length ()
	       && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "***** The result for vector mode %s would"
			       " be the same\n",
			       GET_MODE_NAME (vector_modes[mode_i]));
	    mode_i += 1;
	  }

      if (res)
	{
	  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
	  /* Count of successful analyses; not read elsewhere in this
	     function.  */
	  vectorized_loops++;

	  /* Once we hit the desired simdlen for the first time,
	     discard any previous attempts.  */
	  if (simdlen
	      && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
	    {
	      delete first_loop_vinfo;
	      first_loop_vinfo = opt_loop_vec_info::success (NULL);
	      LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
	      simdlen = 0;
	    }
	  else if (pick_lowest_cost_p && first_loop_vinfo)
	    {
	      /* Keep trying to roll back vectorization attempts while the
		 loop_vec_infos they produced were worse than this one.  */
	      vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
	      while (!vinfos.is_empty ()
		     && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
		{
		  gcc_assert (vect_epilogues);
		  delete vinfos.pop ();
		}
	      /* If this attempt also beats the main loop, it must itself be
		 viable as a main loop before it can replace one.  */
	      if (vinfos.is_empty ()
		  && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
		{
		  loop_vec_info main_loop_vinfo
		    = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
		  if (main_loop_vinfo == loop_vinfo)
		    {
		      /* LOOP_VINFO was already a main loop; it becomes
			 first_loop_vinfo below.  */
		      delete first_loop_vinfo;
		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
		    }
		  else if (main_loop_vinfo
			   && vect_joust_loop_vinfos (main_loop_vinfo,
						      first_loop_vinfo))
		    {
		      delete first_loop_vinfo;
		      first_loop_vinfo = opt_loop_vec_info::success (NULL);
		      delete loop_vinfo;
		      loop_vinfo
			= opt_loop_vec_info::success (main_loop_vinfo);
		    }
		  else
		    delete main_loop_vinfo;
		}
	    }

	  if (first_loop_vinfo == NULL)
	    {
	      first_loop_vinfo = loop_vinfo;
	      lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
	    }
	  else if (vect_epilogues
		   /* For now only allow one epilogue loop.  */
		   && first_loop_vinfo->epilogue_vinfos.is_empty ())
	    {
	      first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
	      poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
	      gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
			  || maybe_ne (lowest_th, 0U));
	      /* Keep track of the known smallest versioning
		 threshold.  */
	      if (ordered_p (lowest_th, th))
		lowest_th = ordered_min (lowest_th, th);
	    }
	  else
	    delete loop_vinfo;

	  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
	     enabled, SIMDUID is not set, it is the innermost loop and we have
	     either already found the loop's SIMDLEN or there was no SIMDLEN to
	     begin with.
	     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
	  vect_epilogues = (!simdlen
			    && loop->inner == NULL
			    && param_vect_epilogues_nomask
			    && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
			    && !loop->simduid
			    /* For now only allow one epilogue loop, but allow
			       pick_lowest_cost_p to replace it.  */
			    && (first_loop_vinfo->epilogue_vinfos.is_empty ()
				|| pick_lowest_cost_p));

	  /* Commit to first_loop_vinfo if we have no reason to try
	     alternatives.  */
	  if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
	    break;
	}
      else
	{
	  delete loop_vinfo;
	  if (fatal)
	    {
	      gcc_checking_assert (first_loop_vinfo == NULL);
	      break;
	    }
	}

      /* Skip the next mode when it is merely a different-width view of the
	 mode the first attempt already chose (each is the other's related
	 mode), since analysis would be repeated verbatim.  */
      if (mode_i < vector_modes.length ()
	  && VECTOR_MODE_P (autodetected_vector_mode)
	  && (related_vector_mode (vector_modes[mode_i],
				   GET_MODE_INNER (autodetected_vector_mode))
	      == autodetected_vector_mode)
	  && (related_vector_mode (autodetected_vector_mode,
				   GET_MODE_INNER (vector_modes[mode_i]))
	      == vector_modes[mode_i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "***** Skipping vector mode %s, which would"
			     " repeat the analysis for %s\n",
			     GET_MODE_NAME (vector_modes[mode_i]),
			     GET_MODE_NAME (autodetected_vector_mode));
	  mode_i += 1;
	}

      if (mode_i == vector_modes.length ()
	  || autodetected_vector_mode == VOIDmode)
	break;

      /* Try the next biggest vector size.  */
      next_vector_mode = vector_modes[mode_i++];
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis with vector mode %s\n",
			 GET_MODE_NAME (next_vector_mode));
    }

  if (first_loop_vinfo)
    {
      loop->aux = (loop_vec_info) first_loop_vinfo;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Choosing vector mode %s\n",
			 GET_MODE_NAME (first_loop_vinfo->vector_mode));
      LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
      return first_loop_vinfo;
    }

  /* Propagate the opt_problem recorded by the last failing attempt.  */
  return opt_loop_vec_info::propagate_failure (res);
}

/* Return true if there is an in-order reduction function for CODE, storing
   it in *REDUC_FN if so.  */

static bool
fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
{
  switch (code)
    {
    case PLUS_EXPR:
      *reduc_fn = IFN_FOLD_LEFT_PLUS;
      return true;

    default:
      return false;
    }
}

/* Function reduction_fn_for_scalar_code

   Input:
   CODE - tree_code of a reduction operations.

   Output:
   REDUC_FN - the corresponding internal function to be used to reduce the
      vector of partial results into a single scalar result, or IFN_LAST
      if the operation is a supported reduction operation, but does not have
      such an internal function.

   Return FALSE if CODE currently cannot be vectorized as reduction.
*/ 2808 2809static bool 2810reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn) 2811{ 2812 switch (code) 2813 { 2814 case MAX_EXPR: 2815 *reduc_fn = IFN_REDUC_MAX; 2816 return true; 2817 2818 case MIN_EXPR: 2819 *reduc_fn = IFN_REDUC_MIN; 2820 return true; 2821 2822 case PLUS_EXPR: 2823 *reduc_fn = IFN_REDUC_PLUS; 2824 return true; 2825 2826 case BIT_AND_EXPR: 2827 *reduc_fn = IFN_REDUC_AND; 2828 return true; 2829 2830 case BIT_IOR_EXPR: 2831 *reduc_fn = IFN_REDUC_IOR; 2832 return true; 2833 2834 case BIT_XOR_EXPR: 2835 *reduc_fn = IFN_REDUC_XOR; 2836 return true; 2837 2838 case MULT_EXPR: 2839 case MINUS_EXPR: 2840 *reduc_fn = IFN_LAST; 2841 return true; 2842 2843 default: 2844 return false; 2845 } 2846} 2847 2848/* If there is a neutral value X such that SLP reduction NODE would not 2849 be affected by the introduction of additional X elements, return that X, 2850 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE 2851 is the vector type that would hold element X. REDUC_CHAIN is true if 2852 the SLP statements perform a single reduction, false if each statement 2853 performs an independent reduction. 
*/

static tree
neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
			      tree_code code, bool reduc_chain)
{
  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  stmt_vec_info stmt_vinfo = stmts[0];
  tree scalar_type = TREE_TYPE (vector_type);
  class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
  gcc_assert (loop);

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case SAD_EXPR:
    case PLUS_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
      /* Zero is the identity for all additive-style and OR/XOR
	 reductions.  */
      return build_zero_cst (scalar_type);

    case MULT_EXPR:
      /* One is the multiplicative identity.  */
      return build_one_cst (scalar_type);

    case BIT_AND_EXPR:
      /* All-ones is the identity for bitwise AND.  */
      return build_all_ones_cst (scalar_type);

    case MAX_EXPR:
    case MIN_EXPR:
      /* For MIN/MAX the initial values are neutral.  A reduction chain
	 has only a single initial value, so that value is neutral for
	 all statements.  */
      if (reduc_chain)
	return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
				      loop_preheader_edge (loop));
      return NULL_TREE;

    default:
      /* No known neutral element for this operation.  */
      return NULL_TREE;
    }
}

/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
   STMT is printed with a message MSG.  */

static void
report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
{
  dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
}

/* Return true if we need an in-order reduction for operation CODE
   on type TYPE, i.e. the reduction cannot be reassociated.  */

bool
needs_fold_left_reduction_p (tree type, tree_code code)
{
  /* CHECKME: check for !flag_finite_math_only too?
*/
  if (SCALAR_FLOAT_TYPE_P (type))
    switch (code)
      {
      case MIN_EXPR:
      case MAX_EXPR:
	/* FP min/max can be reassociated freely.  */
	return false;

      default:
	/* Other FP reductions need in-order evaluation unless the user
	   allowed reassociation.  */
	return !flag_associative_math;
      }

  if (INTEGRAL_TYPE_P (type))
    {
      /* Reassociating a trapping-overflow operation could introduce
	 traps that the scalar loop would not have executed.  */
      if (!operation_no_trapping_overflow (type, code))
	return true;
      return false;
    }

  if (SAT_FIXED_POINT_TYPE_P (type))
    /* Saturating arithmetic is not associative.  */
    return true;

  return false;
}

/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  On success PATH holds the use chain from the
   latch definition back to the PHI result.  */

static bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, enum tree_code *code,
		      vec<std::pair<ssa_op_iter, use_operand_p> > &path)
{
  /* Depth-first search from the latch argument of PHI back to the PHI
     result, recording each use on PATH and backtracking via the saved
     iterators when a dead end is reached.  */
  auto_bitmap visited;
  tree lookfor = PHI_RESULT (phi);
  ssa_op_iter curri;
  use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
  /* Position the iterator on the latch argument of PHI.  */
  while (USE_FROM_PTR (curr) != loop_arg)
    curr = op_iter_next_use (&curri);
  /* Mark the iterator exhausted so backtracking does not revisit the
     other PHI arguments — NOTE(review): relies on ssa_op_iter
     internals (i/numops); confirm against tree-ssa-operands.h.  */
  curri.i = curri.numops;
  do
    {
      path.safe_push (std::make_pair (curri, curr));
      tree use = USE_FROM_PTR (curr);
      if (use == lookfor)
	break;
      gimple *def = SSA_NAME_DEF_STMT (use);
      if (gimple_nop_p (def)
	  || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
	{
	  /* Dead end: the def is outside the loop (or has no defining
	     stmt).  Backtrack to the most recent frame with remaining
	     unvisited SSA uses.  */
pop:
	  do
	    {
	      std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
	      curri = x.first;
	      curr = x.second;
	      do
		curr = op_iter_next_use (&curri);
	      /* Skip already visited or non-SSA operands (from iterating
		 over PHI args).  */
	      while (curr != NULL_USE_OPERAND_P
		     && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
			 || ! bitmap_set_bit (visited,
					      SSA_NAME_VERSION
						(USE_FROM_PTR (curr)))));
	    }
	  while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
	  if (curr == NULL_USE_OPERAND_P)
	    break;
	}
      else
	{
	  /* Descend into the defining statement's own uses.  */
	  if (gimple_code (def) == GIMPLE_PHI)
	    curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
	  else
	    curr = op_iter_init_use (&curri, def, SSA_OP_USE);
	  /* Again skip visited and non-SSA operands.  */
	  while (curr != NULL_USE_OPERAND_P
		 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
		     || ! bitmap_set_bit (visited,
					  SSA_NAME_VERSION
					    (USE_FROM_PTR (curr)))))
	    curr = op_iter_next_use (&curri);
	  if (curr == NULL_USE_OPERAND_P)
	    goto pop;
	}
    }
  while (1);
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
      unsigned i;
      std::pair<ssa_op_iter, use_operand_p> *x;
      FOR_EACH_VEC_ELT (path, i, x)
	dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
      dump_printf (MSG_NOTE, "\n");
    }

  /* Check whether the reduction path detected is valid.  */
  bool fail = path.length () == 0;
  bool neg = false;
  int sign = -1;
  *code = ERROR_MARK;
  for (unsigned i = 1; i < path.length (); ++i)
    {
      gimple *use_stmt = USE_STMT (path[i].second);
      tree op = USE_FROM_PTR (path[i].second);
      if (! is_gimple_assign (use_stmt)
	  /* The following make sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
	  || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
	      && (gimple_num_ops (use_stmt) <= 2
		  || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
	      && (gimple_num_ops (use_stmt) <= 3
		  || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
	{
	  fail = true;
	  break;
	}
      tree_code use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == MINUS_EXPR)
	{
	  /* Treat subtraction as addition for code-matching purposes.  */
	  use_code = PLUS_EXPR;
	  /* Track whether we negate the reduction value each iteration.  */
	  if (gimple_assign_rhs2 (use_stmt) == op)
	    neg = ! neg;
	}
      if (CONVERT_EXPR_CODE_P (use_code)
	  && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
				    TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
	/* No-op conversions are transparent on the path.  */
	;
      else if (*code == ERROR_MARK)
	{
	  /* First real operation determines the reduction code and,
	     for MIN/MAX, the required signedness.  */
	  *code = use_code;
	  sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
	}
      else if (use_code != *code)
	{
	  fail = true;
	  break;
	}
      else if ((use_code == MIN_EXPR
		|| use_code == MAX_EXPR)
	       && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
	{
	  fail = true;
	  break;
	}
      /* Check there's only a single stmt the op is used on.  For the
	 not value-changing tail and the last stmt allow out-of-loop uses.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
      imm_use_iterator imm_iter;
      gimple *op_use_stmt;
      unsigned cnt = 0;
      FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
	if (!is_gimple_debug (op_use_stmt)
	    && (*code != ERROR_MARK
		|| flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
	  {
	    /* We want to allow x + x but not x < 1 ? x : 2.  */
	    if (is_gimple_assign (op_use_stmt)
		&& gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
	      {
		use_operand_p use_p;
		FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
		  cnt++;
	      }
	    else
	      cnt++;
	  }
      if (cnt != 1)
	{
	  fail = true;
	  break;
	}
    }
  /* A net negation of the value would require extra handling, so reject
     paths with an odd number of negations.  */
  return ! fail && ! neg && *code != ERROR_MARK;
}

/* Overload: return true if the reduction path from LOOP_ARG back to PHI
   exists and its main operation is exactly CODE.  */

bool
check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
		      tree loop_arg, enum tree_code code)
{
  auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
  enum tree_code code_;
  return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
	  && code_ == code);
}



/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

   a3 = ...
   loop_header:
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.
3135 3136 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double 3137 reductions: 3138 3139 a1 = phi < a0, a2 > 3140 inner loop (def of a3) 3141 a2 = phi < a3 > 3142 3143 (4) Detect condition expressions, ie: 3144 for (int i = 0; i < N; i++) 3145 if (a[i] < val) 3146 ret_val = a[i]; 3147 3148*/ 3149 3150static stmt_vec_info 3151vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, 3152 bool *double_reduc, bool *reduc_chain_p) 3153{ 3154 gphi *phi = as_a <gphi *> (phi_info->stmt); 3155 gimple *phi_use_stmt = NULL; 3156 imm_use_iterator imm_iter; 3157 use_operand_p use_p; 3158 3159 *double_reduc = false; 3160 *reduc_chain_p = false; 3161 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION; 3162 3163 tree phi_name = PHI_RESULT (phi); 3164 /* ??? If there are no uses of the PHI result the inner loop reduction 3165 won't be detected as possibly double-reduction by vectorizable_reduction 3166 because that tries to walk the PHI arg from the preheader edge which 3167 can be constant. See PR60382. 
*/ 3168 if (has_zero_uses (phi_name)) 3169 return NULL; 3170 class loop *loop = (gimple_bb (phi))->loop_father; 3171 unsigned nphi_def_loop_uses = 0; 3172 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name) 3173 { 3174 gimple *use_stmt = USE_STMT (use_p); 3175 if (is_gimple_debug (use_stmt)) 3176 continue; 3177 3178 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3179 { 3180 if (dump_enabled_p ()) 3181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3182 "intermediate value used outside loop.\n"); 3183 3184 return NULL; 3185 } 3186 3187 nphi_def_loop_uses++; 3188 phi_use_stmt = use_stmt; 3189 } 3190 3191 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop)); 3192 if (TREE_CODE (latch_def) != SSA_NAME) 3193 { 3194 if (dump_enabled_p ()) 3195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3196 "reduction: not ssa_name: %T\n", latch_def); 3197 return NULL; 3198 } 3199 3200 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def); 3201 if (!def_stmt_info 3202 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))) 3203 return NULL; 3204 3205 bool nested_in_vect_loop 3206 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop); 3207 unsigned nlatch_def_loop_uses = 0; 3208 auto_vec<gphi *, 3> lcphis; 3209 bool inner_loop_of_double_reduc = false; 3210 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def) 3211 { 3212 gimple *use_stmt = USE_STMT (use_p); 3213 if (is_gimple_debug (use_stmt)) 3214 continue; 3215 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))) 3216 nlatch_def_loop_uses++; 3217 else 3218 { 3219 /* We can have more than one loop-closed PHI. 
*/ 3220 lcphis.safe_push (as_a <gphi *> (use_stmt)); 3221 if (nested_in_vect_loop 3222 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt)) 3223 == vect_double_reduction_def)) 3224 inner_loop_of_double_reduc = true; 3225 } 3226 } 3227 3228 /* If we are vectorizing an inner reduction we are executing that 3229 in the original order only in case we are not dealing with a 3230 double reduction. */ 3231 if (nested_in_vect_loop && !inner_loop_of_double_reduc) 3232 { 3233 if (dump_enabled_p ()) 3234 report_vect_op (MSG_NOTE, def_stmt_info->stmt, 3235 "detected nested cycle: "); 3236 return def_stmt_info; 3237 } 3238 3239 /* When the inner loop of a double reduction ends up with more than 3240 one loop-closed PHI we have failed to classify alternate such 3241 PHIs as double reduction, leading to wrong code. See PR103237. */ 3242 if (inner_loop_of_double_reduc && lcphis.length () != 1) 3243 { 3244 if (dump_enabled_p ()) 3245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3246 "unhandle double reduction\n"); 3247 return NULL; 3248 } 3249 3250 /* If this isn't a nested cycle or if the nested cycle reduction value 3251 is used ouside of the inner loop we cannot handle uses of the reduction 3252 value. */ 3253 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1) 3254 { 3255 if (dump_enabled_p ()) 3256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3257 "reduction used in loop.\n"); 3258 return NULL; 3259 } 3260 3261 /* If DEF_STMT is a phi node itself, we expect it to have a single argument 3262 defined in the inner loop. 
*/ 3263 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt)) 3264 { 3265 tree op1 = PHI_ARG_DEF (def_stmt, 0); 3266 if (gimple_phi_num_args (def_stmt) != 1 3267 || TREE_CODE (op1) != SSA_NAME) 3268 { 3269 if (dump_enabled_p ()) 3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3271 "unsupported phi node definition.\n"); 3272 3273 return NULL; 3274 } 3275 3276 gimple *def1 = SSA_NAME_DEF_STMT (op1); 3277 if (gimple_bb (def1) 3278 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) 3279 && loop->inner 3280 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) 3281 && is_gimple_assign (def1) 3282 && is_a <gphi *> (phi_use_stmt) 3283 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))) 3284 { 3285 if (dump_enabled_p ()) 3286 report_vect_op (MSG_NOTE, def_stmt, 3287 "detected double reduction: "); 3288 3289 *double_reduc = true; 3290 return def_stmt_info; 3291 } 3292 3293 return NULL; 3294 } 3295 3296 /* Look for the expression computing latch_def from then loop PHI result. */ 3297 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path; 3298 enum tree_code code; 3299 if (check_reduction_path (vect_location, loop, phi, latch_def, &code, 3300 path)) 3301 { 3302 STMT_VINFO_REDUC_CODE (phi_info) = code; 3303 if (code == COND_EXPR && !nested_in_vect_loop) 3304 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION; 3305 3306 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP 3307 reduction chain for which the additional restriction is that 3308 all operations in the chain are the same. 
*/ 3309 auto_vec<stmt_vec_info, 8> reduc_chain; 3310 unsigned i; 3311 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR; 3312 for (i = path.length () - 1; i >= 1; --i) 3313 { 3314 gimple *stmt = USE_STMT (path[i].second); 3315 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt); 3316 STMT_VINFO_REDUC_IDX (stmt_info) 3317 = path[i].second->use - gimple_assign_rhs1_ptr (stmt); 3318 enum tree_code stmt_code = gimple_assign_rhs_code (stmt); 3319 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code) 3320 && (i == 1 || i == path.length () - 1)); 3321 if ((stmt_code != code && !leading_conversion) 3322 /* We can only handle the final value in epilogue 3323 generation for reduction chains. */ 3324 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt)))) 3325 is_slp_reduc = false; 3326 /* For reduction chains we support a trailing/leading 3327 conversions. We do not store those in the actual chain. */ 3328 if (leading_conversion) 3329 continue; 3330 reduc_chain.safe_push (stmt_info); 3331 } 3332 if (is_slp_reduc && reduc_chain.length () > 1) 3333 { 3334 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i) 3335 { 3336 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0]; 3337 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1]; 3338 } 3339 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0]; 3340 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL; 3341 3342 /* Save the chain for further analysis in SLP detection. 
*/ 3343 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]); 3344 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length (); 3345 3346 *reduc_chain_p = true; 3347 if (dump_enabled_p ()) 3348 dump_printf_loc (MSG_NOTE, vect_location, 3349 "reduction: detected reduction chain\n"); 3350 } 3351 else if (dump_enabled_p ()) 3352 dump_printf_loc (MSG_NOTE, vect_location, 3353 "reduction: detected reduction\n"); 3354 3355 return def_stmt_info; 3356 } 3357 3358 if (dump_enabled_p ()) 3359 dump_printf_loc (MSG_NOTE, vect_location, 3360 "reduction: unknown pattern\n"); 3361 3362 return NULL; 3363} 3364 3365/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */ 3366int 3367vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, 3368 int *peel_iters_epilogue, 3369 stmt_vector_for_cost *scalar_cost_vec, 3370 stmt_vector_for_cost *prologue_cost_vec, 3371 stmt_vector_for_cost *epilogue_cost_vec) 3372{ 3373 int retval = 0; 3374 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3375 3376 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 3377 { 3378 *peel_iters_epilogue = assumed_vf / 2; 3379 if (dump_enabled_p ()) 3380 dump_printf_loc (MSG_NOTE, vect_location, 3381 "cost model: epilogue peel iters set to vf/2 " 3382 "because loop iterations are unknown .\n"); 3383 3384 /* If peeled iterations are known but number of scalar loop 3385 iterations are unknown, count a taken branch per peeled loop. */ 3386 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken, 3387 NULL, 0, vect_prologue); 3388 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken, 3389 NULL, 0, vect_epilogue); 3390 } 3391 else 3392 { 3393 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); 3394 peel_iters_prologue = niters < peel_iters_prologue ? 3395 niters : peel_iters_prologue; 3396 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf; 3397 /* If we need to peel for gaps, but no peeling is required, we have to 3398 peel VF iterations. 
*/ 3399 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue) 3400 *peel_iters_epilogue = assumed_vf; 3401 } 3402 3403 stmt_info_for_cost *si; 3404 int j; 3405 if (peel_iters_prologue) 3406 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3407 retval += record_stmt_cost (prologue_cost_vec, 3408 si->count * peel_iters_prologue, 3409 si->kind, si->stmt_info, si->misalign, 3410 vect_prologue); 3411 if (*peel_iters_epilogue) 3412 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) 3413 retval += record_stmt_cost (epilogue_cost_vec, 3414 si->count * *peel_iters_epilogue, 3415 si->kind, si->stmt_info, si->misalign, 3416 vect_epilogue); 3417 3418 return retval; 3419} 3420 3421/* Function vect_estimate_min_profitable_iters 3422 3423 Return the number of iterations required for the vector version of the 3424 loop to be profitable relative to the cost of the scalar version of the 3425 loop. 3426 3427 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold 3428 of iterations for vectorization. -1 value means loop vectorization 3429 is not profitable. This returned value may be used for dynamic 3430 profitability check. 3431 3432 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used 3433 for static check against estimated number of iterations. 
*/ 3434 3435static void 3436vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, 3437 int *ret_min_profitable_niters, 3438 int *ret_min_profitable_estimate) 3439{ 3440 int min_profitable_iters; 3441 int min_profitable_estimate; 3442 int peel_iters_prologue; 3443 int peel_iters_epilogue; 3444 unsigned vec_inside_cost = 0; 3445 int vec_outside_cost = 0; 3446 unsigned vec_prologue_cost = 0; 3447 unsigned vec_epilogue_cost = 0; 3448 int scalar_single_iter_cost = 0; 3449 int scalar_outside_cost = 0; 3450 int assumed_vf = vect_vf_for_cost (loop_vinfo); 3451 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 3452 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3453 3454 /* Cost model disabled. */ 3455 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 3456 { 3457 if (dump_enabled_p ()) 3458 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); 3459 *ret_min_profitable_niters = 0; 3460 *ret_min_profitable_estimate = 0; 3461 return; 3462 } 3463 3464 /* Requires loop versioning tests to handle misalignment. */ 3465 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 3466 { 3467 /* FIXME: Make cost depend on complexity of individual check. */ 3468 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length (); 3469 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3470 vect_prologue); 3471 if (dump_enabled_p ()) 3472 dump_printf (MSG_NOTE, 3473 "cost model: Adding cost of checks for loop " 3474 "versioning to treat misalignment.\n"); 3475 } 3476 3477 /* Requires loop versioning with alias checks. */ 3478 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) 3479 { 3480 /* FIXME: Make cost depend on complexity of individual check. 
*/ 3481 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length (); 3482 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0, 3483 vect_prologue); 3484 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length (); 3485 if (len) 3486 /* Count LEN - 1 ANDs and LEN comparisons. */ 3487 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt, 3488 NULL, 0, vect_prologue); 3489 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length (); 3490 if (len) 3491 { 3492 /* Count LEN - 1 ANDs and LEN comparisons. */ 3493 unsigned int nstmts = len * 2 - 1; 3494 /* +1 for each bias that needs adding. */ 3495 for (unsigned int i = 0; i < len; ++i) 3496 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p) 3497 nstmts += 1; 3498 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt, 3499 NULL, 0, vect_prologue); 3500 } 3501 if (dump_enabled_p ()) 3502 dump_printf (MSG_NOTE, 3503 "cost model: Adding cost of checks for loop " 3504 "versioning aliasing.\n"); 3505 } 3506 3507 /* Requires loop versioning with niter checks. */ 3508 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo)) 3509 { 3510 /* FIXME: Make cost depend on complexity of individual check. */ 3511 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0, 3512 vect_prologue); 3513 if (dump_enabled_p ()) 3514 dump_printf (MSG_NOTE, 3515 "cost model: Adding cost of checks for loop " 3516 "versioning niters.\n"); 3517 } 3518 3519 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3520 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0, 3521 vect_prologue); 3522 3523 /* Count statements in scalar loop. Using this as scalar cost for a single 3524 iteration for now. 3525 3526 TODO: Add outer loop support. 3527 3528 TODO: Consider assigning different costs to different scalar 3529 statements. 
*/ 3530 3531 scalar_single_iter_cost 3532 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo); 3533 3534 /* Add additional cost for the peeled instructions in prologue and epilogue 3535 loop. (For fully-masked loops there will be no peeling.) 3536 3537 FORNOW: If we don't know the value of peel_iters for prologue or epilogue 3538 at compile-time - we assume it's vf/2 (the worst would be vf-1). 3539 3540 TODO: Build an expression that represents peel_iters for prologue and 3541 epilogue to be used in a run-time test. */ 3542 3543 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3544 { 3545 peel_iters_prologue = 0; 3546 peel_iters_epilogue = 0; 3547 3548 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 3549 { 3550 /* We need to peel exactly one iteration. */ 3551 peel_iters_epilogue += 1; 3552 stmt_info_for_cost *si; 3553 int j; 3554 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 3555 j, si) 3556 (void) add_stmt_cost (target_cost_data, si->count, 3557 si->kind, si->stmt_info, si->misalign, 3558 vect_epilogue); 3559 } 3560 3561 /* Calculate how many masks we need to generate. */ 3562 unsigned int num_masks = 0; 3563 rgroup_masks *rgm; 3564 unsigned int num_vectors_m1; 3565 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm) 3566 if (rgm->mask_type) 3567 num_masks += num_vectors_m1 + 1; 3568 gcc_assert (num_masks > 0); 3569 3570 /* In the worst case, we need to generate each mask in the prologue 3571 and in the loop body. One of the loop body mask instructions 3572 replaces the comparison in the scalar loop, and since we don't 3573 count the scalar comparison against the scalar body, we shouldn't 3574 count that vector instruction against the vector body either. 3575 3576 Sometimes we can use unpacks instead of generating prologue 3577 masks and sometimes the prologue mask will fold to a constant, 3578 so the actual prologue cost might be smaller. 
However, it's 3579 simpler and safer to use the worst-case cost; if this ends up 3580 being the tie-breaker between vectorizing or not, then it's 3581 probably better not to vectorize. */ 3582 (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt, 3583 NULL, 0, vect_prologue); 3584 (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt, 3585 NULL, 0, vect_body); 3586 } 3587 else if (npeel < 0) 3588 { 3589 peel_iters_prologue = assumed_vf / 2; 3590 if (dump_enabled_p ()) 3591 dump_printf (MSG_NOTE, "cost model: " 3592 "prologue peel iters set to vf/2.\n"); 3593 3594 /* If peeling for alignment is unknown, loop bound of main loop becomes 3595 unknown. */ 3596 peel_iters_epilogue = assumed_vf / 2; 3597 if (dump_enabled_p ()) 3598 dump_printf (MSG_NOTE, "cost model: " 3599 "epilogue peel iters set to vf/2 because " 3600 "peeling for alignment is unknown.\n"); 3601 3602 /* If peeled iterations are unknown, count a taken branch and a not taken 3603 branch per peeled loop. Even if scalar loop iterations are known, 3604 vector iterations are not known since peeled prologue iterations are 3605 not known. Hence guards remain the same. 
*/ 3606 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3607 NULL, 0, vect_prologue); 3608 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3609 NULL, 0, vect_prologue); 3610 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, 3611 NULL, 0, vect_epilogue); 3612 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken, 3613 NULL, 0, vect_epilogue); 3614 stmt_info_for_cost *si; 3615 int j; 3616 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si) 3617 { 3618 (void) add_stmt_cost (target_cost_data, 3619 si->count * peel_iters_prologue, 3620 si->kind, si->stmt_info, si->misalign, 3621 vect_prologue); 3622 (void) add_stmt_cost (target_cost_data, 3623 si->count * peel_iters_epilogue, 3624 si->kind, si->stmt_info, si->misalign, 3625 vect_epilogue); 3626 } 3627 } 3628 else 3629 { 3630 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 3631 stmt_info_for_cost *si; 3632 int j; 3633 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); 3634 3635 prologue_cost_vec.create (2); 3636 epilogue_cost_vec.create (2); 3637 peel_iters_prologue = npeel; 3638 3639 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue, 3640 &peel_iters_epilogue, 3641 &LOOP_VINFO_SCALAR_ITERATION_COST 3642 (loop_vinfo), 3643 &prologue_cost_vec, 3644 &epilogue_cost_vec); 3645 3646 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si) 3647 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info, 3648 si->misalign, vect_prologue); 3649 3650 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si) 3651 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info, 3652 si->misalign, vect_epilogue); 3653 3654 prologue_cost_vec.release (); 3655 epilogue_cost_vec.release (); 3656 } 3657 3658 /* FORNOW: The scalar outside cost is incremented in one of the 3659 following ways: 3660 3661 1. The vectorizer checks for alignment and aliasing and generates 3662 a condition that allows dynamic vectorization. 
A cost model 3663 check is ANDED with the versioning condition. Hence scalar code 3664 path now has the added cost of the versioning check. 3665 3666 if (cost > th & versioning_check) 3667 jmp to vector code 3668 3669 Hence run-time scalar is incremented by not-taken branch cost. 3670 3671 2. The vectorizer then checks if a prologue is required. If the 3672 cost model check was not done before during versioning, it has to 3673 be done before the prologue check. 3674 3675 if (cost <= th) 3676 prologue = scalar_iters 3677 if (prologue == 0) 3678 jmp to vector code 3679 else 3680 execute prologue 3681 if (prologue == num_iters) 3682 go to exit 3683 3684 Hence the run-time scalar cost is incremented by a taken branch, 3685 plus a not-taken branch, plus a taken branch cost. 3686 3687 3. The vectorizer then checks if an epilogue is required. If the 3688 cost model check was not done before during prologue check, it 3689 has to be done with the epilogue check. 3690 3691 if (prologue == 0) 3692 jmp to vector code 3693 else 3694 execute prologue 3695 if (prologue == num_iters) 3696 go to exit 3697 vector code: 3698 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) 3699 jmp to epilogue 3700 3701 Hence the run-time scalar cost should be incremented by 2 taken 3702 branches. 3703 3704 TODO: The back end may reorder the BBS's differently and reverse 3705 conditions/branch directions. Change the estimates below to 3706 something more reasonable. */ 3707 3708 /* If the number of iterations is known and we do not do versioning, we can 3709 decide whether to vectorize at compile time. Hence the scalar version 3710 do not carry cost model guard costs. */ 3711 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 3712 || LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3713 { 3714 /* Cost model check occurs at versioning. 
*/ 3715 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 3716 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken); 3717 else 3718 { 3719 /* Cost model check occurs at prologue generation. */ 3720 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) 3721 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken) 3722 + vect_get_stmt_cost (cond_branch_not_taken); 3723 /* Cost model check occurs at epilogue generation. */ 3724 else 3725 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken); 3726 } 3727 } 3728 3729 /* Complete the target-specific cost calculations. */ 3730 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, 3731 &vec_inside_cost, &vec_epilogue_cost); 3732 3733 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); 3734 3735 /* Stash the costs so that we can compare two loop_vec_infos. */ 3736 loop_vinfo->vec_inside_cost = vec_inside_cost; 3737 loop_vinfo->vec_outside_cost = vec_outside_cost; 3738 3739 if (dump_enabled_p ()) 3740 { 3741 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n"); 3742 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n", 3743 vec_inside_cost); 3744 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n", 3745 vec_prologue_cost); 3746 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n", 3747 vec_epilogue_cost); 3748 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n", 3749 scalar_single_iter_cost); 3750 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n", 3751 scalar_outside_cost); 3752 dump_printf (MSG_NOTE, " Vector outside cost: %d\n", 3753 vec_outside_cost); 3754 dump_printf (MSG_NOTE, " prologue iterations: %d\n", 3755 peel_iters_prologue); 3756 dump_printf (MSG_NOTE, " epilogue iterations: %d\n", 3757 peel_iters_epilogue); 3758 } 3759 3760 /* Calculate number of iterations required to make the vector version 3761 profitable, relative to the loop bodies only. 
The following condition 3762 must hold true: 3763 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC 3764 where 3765 SIC = scalar iteration cost, VIC = vector iteration cost, 3766 VOC = vector outside cost, VF = vectorization factor, 3767 NPEEL = prologue iterations + epilogue iterations, 3768 SOC = scalar outside cost for run time cost model check. */ 3769 3770 int saving_per_viter = (scalar_single_iter_cost * assumed_vf 3771 - vec_inside_cost); 3772 if (saving_per_viter <= 0) 3773 { 3774 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize) 3775 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd, 3776 "vectorization did not happen for a simd loop"); 3777 3778 if (dump_enabled_p ()) 3779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3780 "cost model: the vector iteration cost = %d " 3781 "divided by the scalar iteration cost = %d " 3782 "is greater or equal to the vectorization factor = %d" 3783 ".\n", 3784 vec_inside_cost, scalar_single_iter_cost, assumed_vf); 3785 *ret_min_profitable_niters = -1; 3786 *ret_min_profitable_estimate = -1; 3787 return; 3788 } 3789 3790 /* ??? The "if" arm is written to handle all cases; see below for what 3791 we would do for !LOOP_VINFO_FULLY_MASKED_P. */ 3792 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3793 { 3794 /* Rewriting the condition above in terms of the number of 3795 vector iterations (vniters) rather than the number of 3796 scalar iterations (niters) gives: 3797 3798 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC 3799 3800 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC 3801 3802 For integer N, X and Y when X > 0: 3803 3804 N * X > Y <==> N >= (Y /[floor] X) + 1. */ 3805 int outside_overhead = (vec_outside_cost 3806 - scalar_single_iter_cost * peel_iters_prologue 3807 - scalar_single_iter_cost * peel_iters_epilogue 3808 - scalar_outside_cost); 3809 /* We're only interested in cases that require at least one 3810 vector iteration. 
*/ 3811 int min_vec_niters = 1; 3812 if (outside_overhead > 0) 3813 min_vec_niters = outside_overhead / saving_per_viter + 1; 3814 3815 if (dump_enabled_p ()) 3816 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", 3817 min_vec_niters); 3818 3819 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3820 { 3821 /* Now that we know the minimum number of vector iterations, 3822 find the minimum niters for which the scalar cost is larger: 3823 3824 SIC * niters > VIC * vniters + VOC - SOC 3825 3826 We know that the minimum niters is no more than 3827 vniters * VF + NPEEL, but it might be (and often is) less 3828 than that if a partial vector iteration is cheaper than the 3829 equivalent scalar code. */ 3830 int threshold = (vec_inside_cost * min_vec_niters 3831 + vec_outside_cost 3832 - scalar_outside_cost); 3833 if (threshold <= 0) 3834 min_profitable_iters = 1; 3835 else 3836 min_profitable_iters = threshold / scalar_single_iter_cost + 1; 3837 } 3838 else 3839 /* Convert the number of vector iterations into a number of 3840 scalar iterations. 
*/ 3841 min_profitable_iters = (min_vec_niters * assumed_vf 3842 + peel_iters_prologue 3843 + peel_iters_epilogue); 3844 } 3845 else 3846 { 3847 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) 3848 * assumed_vf 3849 - vec_inside_cost * peel_iters_prologue 3850 - vec_inside_cost * peel_iters_epilogue); 3851 if (min_profitable_iters <= 0) 3852 min_profitable_iters = 0; 3853 else 3854 { 3855 min_profitable_iters /= saving_per_viter; 3856 3857 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters) 3858 <= (((int) vec_inside_cost * min_profitable_iters) 3859 + (((int) vec_outside_cost - scalar_outside_cost) 3860 * assumed_vf))) 3861 min_profitable_iters++; 3862 } 3863 } 3864 3865 if (dump_enabled_p ()) 3866 dump_printf (MSG_NOTE, 3867 " Calculated minimum iters for profitability: %d\n", 3868 min_profitable_iters); 3869 3870 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 3871 && min_profitable_iters < (assumed_vf + peel_iters_prologue)) 3872 /* We want the vectorized loop to execute at least once. */ 3873 min_profitable_iters = assumed_vf + peel_iters_prologue; 3874 3875 if (dump_enabled_p ()) 3876 dump_printf_loc (MSG_NOTE, vect_location, 3877 " Runtime profitability threshold = %d\n", 3878 min_profitable_iters); 3879 3880 *ret_min_profitable_niters = min_profitable_iters; 3881 3882 /* Calculate number of iterations required to make the vector version 3883 profitable, relative to the loop bodies only. 3884 3885 Non-vectorized variant is SIC * niters and it must win over vector 3886 variant on the expected loop trip count. The following condition must hold true: 3887 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */ 3888 3889 if (vec_outside_cost <= 0) 3890 min_profitable_estimate = 0; 3891 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 3892 { 3893 /* This is a repeat of the code above, but with + SOC rather 3894 than - SOC. 
*/
      int outside_overhead = (vec_outside_cost
                              - scalar_single_iter_cost * peel_iters_prologue
                              - scalar_single_iter_cost * peel_iters_epilogue
                              + scalar_outside_cost);
      /* We're only interested in cases that require at least one
         vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
        {
          /* Convert the vector iteration count into the smallest scalar
             iteration count whose scalar cost exceeds the vector cost.  */
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
                           + scalar_outside_cost);
          min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
        }
      else
        min_profitable_estimate = (min_vec_niters * assumed_vf
                                   + peel_iters_prologue
                                   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
                                 * assumed_vf
                                 - vec_inside_cost * peel_iters_prologue
                                 - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * assumed_vf)
                                    - vec_inside_cost);
    }
  /* The static estimate can never be smaller than the runtime threshold
     computed above.  */
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "  Static estimate profitability threshold = %d\n",
                     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}

/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
   vector elements (not bits) for a vector with NELT elements.  */
static void
calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
                              vec_perm_builder *sel)
{
  /* The encoding is a single stepped pattern.  Any wrap-around is handled
     by vec_perm_indices.  */
  sel->new_vector (nelt, 1, 3);
  for (unsigned int i = 0; i < 3; i++)
    sel->quick_push (i + offset);
}

/* Checks whether the target supports whole-vector shifts for vectors of mode
   MODE.
This is the case if _either_ the platform handles vec_shr_optab, _or_
   it supports vec_perm_const with masks for all necessary shift amounts.  */
static bool
have_whole_vector_shift (machine_mode mode)
{
  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
    return true;

  /* Variable-length vectors should be handled via the optab.  */
  unsigned int nelt;
  if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
    return false;

  /* Check that a permute exists for every power-of-two shift amount
     (nelt/2, nelt/4, ..., 1) that a log2-style reduction would need.  */
  vec_perm_builder sel;
  vec_perm_indices indices;
  for (unsigned int i = nelt / 2; i >= 1; i /= 2)
    {
      calc_vec_perm_mask_for_shift (i, nelt, &sel);
      indices.new_vector (sel, 2, nelt);
      if (!can_vec_perm_const_p (mode, indices, false))
        return false;
    }
  return true;
}

/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.

   REDUC_FN is the internal function implementing the reduction, or
   IFN_LAST if no such function is available.  NCOPIES is the number of
   vector statements generated per scalar statement; costs are recorded
   into COST_VEC.  */

static void
vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
                           vect_reduction_type reduction_type,
                           int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost;
  enum tree_code code;
  optab optab;
  tree vectype;
  machine_mode mode;
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.
*/
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  code = gimple_assign_rhs_code (orig_stmt_info->stmt);

  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
        /* Count one reduction-like operation per vector.  */
        inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
                                        stmt_info, 0, vect_body);
      else
        {
          /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
          unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
          inside_cost = record_stmt_cost (cost_vec, nelements,
                                          vec_to_scalar, stmt_info, 0,
                                          vect_body);
          inside_cost += record_stmt_cost (cost_vec, nelements,
                                           scalar_stmt, stmt_info, 0,
                                           vect_body);
        }
    }
  else
    {
      /* Add in cost for initial definition.
         For cond reduction we have four vectors: initial index, step,
         initial result of the data reduction, initial value of the index
         reduction.  */
      int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
                                         scalar_to_vec, stmt_info, 0,
                                         vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
        {
          if (reduction_type == COND_REDUCTION)
            {
              /* An EQ stmt and an COND_EXPR stmt.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              /* Reduction of the max index and a reduction of the found
                 values.  */
              epilogue_cost += record_stmt_cost (cost_vec, 2,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
              /* A broadcast of the max value.  */
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 scalar_to_vec, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            {
              /* One reduction operation plus the extraction of the scalar
                 result.  */
              epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
                                                 stmt_info, 0, vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
        }
      else if (reduction_type == COND_REDUCTION)
        {
          unsigned estimated_nunits = vect_nunits_for_cost (vectype);
          /* Extraction of scalar elements.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits,
                                             vec_to_scalar, stmt_info, 0,
                                             vect_epilogue);
          /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
          epilogue_cost += record_stmt_cost (cost_vec,
                                             2 * estimated_nunits - 3,
                                             scalar_stmt, stmt_info, 0,
                                             vect_epilogue);
        }
      else if (reduction_type == EXTRACT_LAST_REDUCTION
               || reduction_type == FOLD_LEFT_REDUCTION)
        /* No extra instructions need in the epilogue.  */
        ;
      else
        {
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
          int element_bitsize = tree_to_uhwi (bitsize);
          int nelements = vec_size_in_bits / element_bitsize;

          if (code == COND_EXPR)
            code = MAX_EXPR;

          optab = optab_for_tree_code (code, vectype, optab_default);

          /* We have a whole vector shift available.  */
          if (optab != unknown_optab
              && VECTOR_MODE_P (mode)
              && optab_handler (optab, mode) != CODE_FOR_nothing
              && have_whole_vector_shift (mode))
            {
              /* Final reduction via vector shifts and the reduction operator.
                 Also requires scalar extract.  */
              epilogue_cost += record_stmt_cost (cost_vec,
                                                 exact_log2 (nelements) * 2,
                                                 vector_stmt, stmt_info, 0,
                                                 vect_epilogue);
              epilogue_cost += record_stmt_cost (cost_vec, 1,
                                                 vec_to_scalar, stmt_info, 0,
                                                 vect_epilogue);
            }
          else
            /* Use extracts and reduction op for final reduction.  For N
               elements, we have N extracts and N-1 reduction ops.  */
            epilogue_cost += record_stmt_cost (cost_vec,
                                               nelements + nelements - 1,
                                               vector_stmt, stmt_info, 0,
                                               vect_epilogue);
        }
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
                 "vect_model_reduction_cost: inside_cost = %d, "
                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
                 prologue_cost, epilogue_cost);
}


/* Function vect_model_induction_cost.

   Models cost for induction operations: NCOPIES vector statements in the
   loop body plus the prologue statements that set up the initial vector
   and the step vector.  Costs are recorded into COST_VEC.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
                           stmt_vector_for_cost *cost_vec)
{
  unsigned inside_cost, prologue_cost;

  /* Pure SLP statements are costed by the SLP code instead.  */
  if (PURE_SLP_STMT (stmt_info))
    return;

  /* loop cost for vec_loop.  */
  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
                                  stmt_info, 0, vect_body);

  /* prologue cost for vec_init and vec_step.
*/
  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
                                    stmt_info, 0, vect_prologue);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vect_model_induction_cost: inside_cost = %d, "
                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
}



/* Function get_initial_def_for_reduction

   Input:
   STMT_VINFO - a stmt that performs a reduction operation in the loop.
   INIT_VAL - the initial value of the reduction variable

   Output:
   ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
        of the reduction (used for adjusting the epilog - see below).
   Return a vector variable, initialized according to the operation that
        STMT_VINFO performs.  This vector will be used as the initial value
        of the vector of partial results.

   Option1 (adjust in epilog): Initialize the vector as follows:
     add/bit or/xor:    [0,0,...,0,0]
     mult/bit and:      [1,1,...,1,1]
     min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
   and when necessary (e.g. add/mult case) let the caller know
   that it needs to adjust the result by init_val.

   Option2: Initialize the vector as follows:
     add/bit or/xor:    [init_val,0,0,...,0]
     mult/bit and:      [init_val,1,1,...,1]
     min/max/cond_expr: [init_val,init_val,...,init_val]
   and no adjustments are needed.

   For example, for the following code:

   s = init_val;
   for (i=0;i<n;i++)
     s = s + a[i];

   STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
   For a vector of 4 units, we want to return either [0,0,0,init_val],
   or [0,0,0,0] and let the caller know that it needs to adjust
   the result at the end by 'init_val'.

   FORNOW, we are using the 'adjust in epilog' scheme, because this way the
   initialization vector is simpler (same element in all entries), if
   ADJUSTMENT_DEF is not NULL, and Option2 otherwise.

   A cost model should help decide between these two schemes.  */

static tree
get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
                               enum tree_code code, tree init_val,
                               tree *adjustment_def)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree def_for_init;
  tree init_def;
  REAL_VALUE_TYPE real_init_val = dconst0;
  int int_init_val = 0;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  /* The reduction value must be a scalar (pointer, integer or float).  */
  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));

  gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
              || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);

  /* ADJUSTMENT_DEF is NULL when called from
     vect_create_epilog_for_reduction to vectorize double reduction.  */
  if (adjustment_def)
    *adjustment_def = NULL;

  switch (code)
    {
    case WIDEN_SUM_EXPR:
    case DOT_PROD_EXPR:
    case SAD_EXPR:
    case PLUS_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case MULT_EXPR:
    case BIT_AND_EXPR:
      {
        /* The neutral element is 1 for multiplication, all-ones for
           bitwise AND, and 0 for everything else.  */
        if (code == MULT_EXPR)
          {
            real_init_val = dconst1;
            int_init_val = 1;
          }

        if (code == BIT_AND_EXPR)
          int_init_val = -1;

        if (SCALAR_FLOAT_TYPE_P (scalar_type))
          def_for_init = build_real (scalar_type, real_init_val);
        else
          def_for_init = build_int_cst (scalar_type, int_init_val);

        if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
          {
            /* Option1: the first element is '0' or '1' as well.  */
            if (!operand_equal_p (def_for_init, init_val, 0))
              *adjustment_def = init_val;
            init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                     def_for_init);
          }
        else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
          {
            /* Option2 (variable length): the first element is INIT_VAL.  */
            init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                     def_for_init);
            init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
                                     vectype, init_def, init_val);
          }
        else
          {
            /* Option2: the first element is INIT_VAL.  */
            tree_vector_builder elts (vectype, 1, 2);
            elts.quick_push (init_val);
            elts.quick_push (def_for_init);
            init_def = gimple_build_vector (&stmts, &elts);
          }
      }
      break;

    case MIN_EXPR:
    case MAX_EXPR:
    case COND_EXPR:
      {
        /* These reductions have no neutral element distinct from the
           initial value; splat INIT_VAL across the whole vector.  */
        init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
        init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
      }
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit any statements generated above on the loop preheader edge so the
     initial vector is available before the loop starts.  */
  if (stmts)
    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
  return init_def;
}

/* Get at the initial defs for the reduction PHIs in SLP_NODE.
   NUMBER_OF_VECTORS is the number of vector defs to create.
   If NEUTRAL_OP is nonnull, introducing extra elements of that
   value will not change the result.
*/

static void
get_initial_defs_for_reduction (slp_tree slp_node,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                bool reduc_chain, tree neutral_op)
{
  vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
  stmt_vec_info stmt_vinfo = stmts[0];
  vec_info *vinfo = stmt_vinfo->vinfo;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type;
  unsigned int group_size = stmts.length ();
  unsigned int i;
  class loop *loop;

  vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);

  gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);

  loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
  gcc_assert (loop);
  edge pe = loop_preheader_edge (loop);

  /* A reduction chain always has a neutral element.  */
  gcc_assert (!reduc_chain || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  /* For variable-length vectors fall back to treating each group as one
     "vector's worth" of elements.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;
      stmt_vinfo = stmts[i];

      /* Get the def before the loop.  In reduction chain we have only
         one initial value.  Else we have as many as PHIs in the group.  */
      if (reduc_chain)
        op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
      else if (((vec_oprnds->length () + 1) * nunits
                - number_of_places_left_in_vector >= group_size)
               && neutral_op)
        op = neutral_op;
      else
        op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
        constant_p = false;

      if (number_of_places_left_in_vector == 0)
        {
          tree init;
          if (constant_p && !neutral_op
              ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
              : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
            /* Build the vector directly from ELTS.  */
            init = gimple_build_vector (&ctor_seq, &elts);
          else if (neutral_op)
            {
              /* Build a vector of the neutral value and shift the
                 other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
              /* Skip trailing elements that already equal the neutral
                 value; shifting them in would be a no-op.  */
              while (k > 0 && elts[k - 1] == neutral_op)
                k -= 1;
              while (k > 0)
                {
                  k -= 1;
                  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
                                       vector_type, init, elts[k]);
                }
            }
          else
            {
              /* First time round, duplicate ELTS to fill the
                 required number of vectors.  */
              duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
                                        number_of_vectors, *vec_oprnds);
              break;
            }
          vec_oprnds->quick_push (init);

          /* Start collecting elements for the next vector.  */
          number_of_places_left_in_vector = nunits;
          elts.new_vector (vector_type, nunits, 1);
          elts.quick_grow (nunits);
          constant_p = true;
        }
    }
  /* Emit the statements that build the initial vectors on the loop
     preheader edge.  */
  if (ctor_seq != NULL)
    gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
}

/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  /* If this is not the reduction PHI itself, step to the PHI recorded
     as the reduction def.  */
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_phi_num_args (phi) == 1)
        stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      /* For a nested cycle, the meta info may live on the outer
         double-reduction PHI feeding this one through the preheader.  */
      edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
      stmt_vec_info info
        = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
    }
  return stmt_info;
}

/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   STMT_INFO is the scalar reduction stmt that is being vectorized.
   SLP_NODE is an SLP node containing a group of reduction statements. The
     first one in this group is STMT_INFO.
4472 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE 4473 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi 4474 (counting from 0) 4475 4476 This function: 4477 1. Completes the reduction def-use cycles. 4478 2. "Reduces" each vector of partial results VECT_DEFS into a single result, 4479 by calling the function specified by REDUC_FN if available, or by 4480 other means (whole-vector shifts or a scalar loop). 4481 The function also creates a new phi node at the loop exit to preserve 4482 loop-closed form, as illustrated below. 4483 4484 The flow at the entry to this function: 4485 4486 loop: 4487 vec_def = phi <vec_init, null> # REDUCTION_PHI 4488 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 4489 s_loop = scalar_stmt # (scalar) STMT_INFO 4490 loop_exit: 4491 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4492 use <s_out0> 4493 use <s_out0> 4494 4495 The above is transformed by this function into: 4496 4497 loop: 4498 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI 4499 VECT_DEF = vector_stmt # vectorized form of STMT_INFO 4500 s_loop = scalar_stmt # (scalar) STMT_INFO 4501 loop_exit: 4502 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 4503 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4504 v_out2 = reduce <v_out1> 4505 s_out3 = extract_field <v_out2, 0> 4506 s_out4 = adjust_result <s_out3> 4507 use <s_out4> 4508 use <s_out4> 4509*/ 4510 4511static void 4512vect_create_epilog_for_reduction (stmt_vec_info stmt_info, 4513 slp_tree slp_node, 4514 slp_instance slp_node_instance) 4515{ 4516 stmt_vec_info reduc_info = info_for_reduction (stmt_info); 4517 gcc_assert (reduc_info->is_reduc_info); 4518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4519 /* For double reductions we need to get at the inner loop reduction 4520 stmt which has the meta info attached. Our stmt_info is that of the 4521 loop-closed PHI of the inner loop which we remember as 4522 def for the reduction PHI generation. 
*/ 4523 bool double_reduc = false; 4524 stmt_vec_info rdef_info = stmt_info; 4525 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 4526 { 4527 gcc_assert (!slp_node); 4528 double_reduc = true; 4529 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def 4530 (stmt_info->stmt, 0)); 4531 stmt_info = vect_stmt_to_vectorize (stmt_info); 4532 } 4533 gphi *reduc_def_stmt 4534 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt); 4535 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info); 4536 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info); 4537 stmt_vec_info prev_phi_info; 4538 tree vectype; 4539 machine_mode mode; 4540 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; 4541 basic_block exit_bb; 4542 tree scalar_dest; 4543 tree scalar_type; 4544 gimple *new_phi = NULL, *phi; 4545 stmt_vec_info phi_info; 4546 gimple_stmt_iterator exit_gsi; 4547 tree new_temp = NULL_TREE, new_name, new_scalar_dest; 4548 gimple *epilog_stmt = NULL; 4549 gimple *exit_phi; 4550 tree bitsize; 4551 tree def; 4552 tree orig_name, scalar_result; 4553 imm_use_iterator imm_iter, phi_imm_iter; 4554 use_operand_p use_p, phi_use_p; 4555 gimple *use_stmt; 4556 bool nested_in_vect_loop = false; 4557 auto_vec<gimple *> new_phis; 4558 int j, i; 4559 auto_vec<tree> scalar_results; 4560 unsigned int group_size = 1, k; 4561 auto_vec<gimple *> phis; 4562 bool slp_reduc = false; 4563 bool direct_slp_reduc; 4564 tree new_phi_result; 4565 tree induction_index = NULL_TREE; 4566 4567 if (slp_node) 4568 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 4569 4570 if (nested_in_vect_loop_p (loop, stmt_info)) 4571 { 4572 outer_loop = loop; 4573 loop = loop->inner; 4574 nested_in_vect_loop = true; 4575 gcc_assert (!slp_node); 4576 } 4577 gcc_assert (!nested_in_vect_loop || double_reduc); 4578 4579 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info); 4580 gcc_assert (vectype); 4581 mode = TYPE_MODE (vectype); 4582 4583 tree initial_def = NULL; 4584 
tree induc_val = NULL_TREE; 4585 tree adjustment_def = NULL; 4586 if (slp_node) 4587 ; 4588 else 4589 { 4590 /* Get at the scalar def before the loop, that defines the initial value 4591 of the reduction variable. */ 4592 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, 4593 loop_preheader_edge (loop)); 4594 /* Optimize: for induction condition reduction, if we can't use zero 4595 for induc_val, use initial_def. */ 4596 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 4597 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info); 4598 else if (double_reduc) 4599 ; 4600 else if (nested_in_vect_loop) 4601 ; 4602 else 4603 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info); 4604 } 4605 4606 unsigned vec_num; 4607 int ncopies; 4608 if (slp_node) 4609 { 4610 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length (); 4611 ncopies = 1; 4612 } 4613 else 4614 { 4615 vec_num = 1; 4616 ncopies = 0; 4617 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt)); 4618 do 4619 { 4620 ncopies++; 4621 phi_info = STMT_VINFO_RELATED_STMT (phi_info); 4622 } 4623 while (phi_info); 4624 } 4625 4626 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR) 4627 which is updated with the current index of the loop for every match of 4628 the original loop's cond_expr (VEC_STMT). This results in a vector 4629 containing the last time the condition passed for that vector lane. 4630 The first match will be a 1 to allow 0 to be used for non-matching 4631 indexes. If there are no matches at all then the vector will be all 4632 zeroes. 4633 4634 PR92772: This algorithm is broken for architectures that support 4635 masked vectors, but do not provide fold_extract_last. 
*/ 4636 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION) 4637 { 4638 auto_vec<std::pair<tree, bool>, 2> ccompares; 4639 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info); 4640 cond_info = vect_stmt_to_vectorize (cond_info); 4641 while (cond_info != reduc_info) 4642 { 4643 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR) 4644 { 4645 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt; 4646 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR); 4647 ccompares.safe_push 4648 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)), 4649 STMT_VINFO_REDUC_IDX (cond_info) == 2)); 4650 } 4651 cond_info 4652 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt, 4653 1 + STMT_VINFO_REDUC_IDX 4654 (cond_info))); 4655 cond_info = vect_stmt_to_vectorize (cond_info); 4656 } 4657 gcc_assert (ccompares.length () != 0); 4658 4659 tree indx_before_incr, indx_after_incr; 4660 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype); 4661 int scalar_precision 4662 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype))); 4663 tree cr_index_scalar_type = make_unsigned_type (scalar_precision); 4664 tree cr_index_vector_type = get_related_vectype_for_scalar_type 4665 (TYPE_MODE (vectype), cr_index_scalar_type, 4666 TYPE_VECTOR_SUBPARTS (vectype)); 4667 4668 /* First we create a simple vector induction variable which starts 4669 with the values {1,2,3,...} (SERIES_VECT) and increments by the 4670 vector size (STEP). */ 4671 4672 /* Create a {1,2,3,...} vector. */ 4673 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1); 4674 4675 /* Create a vector of the step value. */ 4676 tree step = build_int_cst (cr_index_scalar_type, nunits_out); 4677 tree vec_step = build_vector_from_val (cr_index_vector_type, step); 4678 4679 /* Create an induction variable. 
*/ 4680 gimple_stmt_iterator incr_gsi; 4681 bool insert_after; 4682 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4683 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi, 4684 insert_after, &indx_before_incr, &indx_after_incr); 4685 4686 /* Next create a new phi node vector (NEW_PHI_TREE) which starts 4687 filled with zeros (VEC_ZERO). */ 4688 4689 /* Create a vector of 0s. */ 4690 tree zero = build_zero_cst (cr_index_scalar_type); 4691 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero); 4692 4693 /* Create a vector phi node. */ 4694 tree new_phi_tree = make_ssa_name (cr_index_vector_type); 4695 new_phi = create_phi_node (new_phi_tree, loop->header); 4696 loop_vinfo->add_stmt (new_phi); 4697 add_phi_arg (as_a <gphi *> (new_phi), vec_zero, 4698 loop_preheader_edge (loop), UNKNOWN_LOCATION); 4699 4700 /* Now take the condition from the loops original cond_exprs 4701 and produce a new cond_exprs (INDEX_COND_EXPR) which for 4702 every match uses values from the induction variable 4703 (INDEX_BEFORE_INCR) otherwise uses values from the phi node 4704 (NEW_PHI_TREE). 4705 Finally, we update the phi (NEW_PHI_TREE) to take the value of 4706 the new cond_expr (INDEX_COND_EXPR). */ 4707 gimple_seq stmts = NULL; 4708 for (int i = ccompares.length () - 1; i != -1; --i) 4709 { 4710 tree ccompare = ccompares[i].first; 4711 if (ccompares[i].second) 4712 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, 4713 cr_index_vector_type, 4714 ccompare, 4715 indx_before_incr, new_phi_tree); 4716 else 4717 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR, 4718 cr_index_vector_type, 4719 ccompare, 4720 new_phi_tree, indx_before_incr); 4721 } 4722 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT); 4723 stmt_vec_info index_vec_info 4724 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree)); 4725 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type; 4726 4727 /* Update the phi with the vec cond. 
*/ 4728 induction_index = new_phi_tree; 4729 add_phi_arg (as_a <gphi *> (new_phi), induction_index, 4730 loop_latch_edge (loop), UNKNOWN_LOCATION); 4731 } 4732 4733 /* 2. Create epilog code. 4734 The reduction epilog code operates across the elements of the vector 4735 of partial results computed by the vectorized loop. 4736 The reduction epilog code consists of: 4737 4738 step 1: compute the scalar result in a vector (v_out2) 4739 step 2: extract the scalar result (s_out3) from the vector (v_out2) 4740 step 3: adjust the scalar result (s_out3) if needed. 4741 4742 Step 1 can be accomplished using one the following three schemes: 4743 (scheme 1) using reduc_fn, if available. 4744 (scheme 2) using whole-vector shifts, if available. 4745 (scheme 3) using a scalar loop. In this case steps 1+2 above are 4746 combined. 4747 4748 The overall epilog code looks like this: 4749 4750 s_out0 = phi <s_loop> # original EXIT_PHI 4751 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 4752 v_out2 = reduce <v_out1> # step 1 4753 s_out3 = extract_field <v_out2, 0> # step 2 4754 s_out4 = adjust_result <s_out3> # step 3 4755 4756 (step 3 is optional, and steps 1 and 2 may be combined). 4757 Lastly, the uses of s_out0 are replaced by s_out4. */ 4758 4759 4760 /* 2.1 Create new loop-exit-phis to preserve loop-closed form: 4761 v_out1 = phi <VECT_DEF> 4762 Store them in NEW_PHIS. */ 4763 if (double_reduc) 4764 loop = outer_loop; 4765 exit_bb = single_exit (loop)->dest; 4766 prev_phi_info = NULL; 4767 new_phis.create (slp_node ? 
vec_num : ncopies); 4768 for (unsigned i = 0; i < vec_num; i++) 4769 { 4770 if (slp_node) 4771 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt); 4772 else 4773 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt); 4774 for (j = 0; j < ncopies; j++) 4775 { 4776 tree new_def = copy_ssa_name (def); 4777 phi = create_phi_node (new_def, exit_bb); 4778 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi); 4779 if (j == 0) 4780 new_phis.quick_push (phi); 4781 else 4782 { 4783 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def); 4784 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info; 4785 } 4786 4787 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); 4788 prev_phi_info = phi_info; 4789 } 4790 } 4791 4792 exit_gsi = gsi_after_labels (exit_bb); 4793 4794 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 4795 (i.e. when reduc_fn is not available) and in the final adjustment 4796 code (if needed). Also get the original scalar reduction variable as 4797 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it 4798 represents a reduction pattern), the tree-code and scalar-def are 4799 taken from the original stmt that the pattern-stmt (STMT) replaces. 4800 Otherwise (it is a regular reduction) - the tree-code and scalar-def 4801 are taken from STMT. 
*/ 4802 4803 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info); 4804 if (orig_stmt_info != stmt_info) 4805 { 4806 /* Reduction pattern */ 4807 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 4808 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info); 4809 } 4810 4811 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt); 4812 scalar_type = TREE_TYPE (scalar_dest); 4813 scalar_results.create (group_size); 4814 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); 4815 bitsize = TYPE_SIZE (scalar_type); 4816 4817 /* SLP reduction without reduction chain, e.g., 4818 # a1 = phi <a2, a0> 4819 # b1 = phi <b2, b0> 4820 a2 = operation (a1) 4821 b2 = operation (b1) */ 4822 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)); 4823 4824 /* True if we should implement SLP_REDUC using native reduction operations 4825 instead of scalar operations. */ 4826 direct_slp_reduc = (reduc_fn != IFN_LAST 4827 && slp_reduc 4828 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ()); 4829 4830 /* In case of reduction chain, e.g., 4831 # a1 = phi <a3, a0> 4832 a2 = operation (a1) 4833 a3 = operation (a2), 4834 4835 we may end up with more than one vector result. Here we reduce them to 4836 one vector. 
*/ 4837 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc) 4838 { 4839 gimple_seq stmts = NULL; 4840 tree first_vect = PHI_RESULT (new_phis[0]); 4841 first_vect = gimple_convert (&stmts, vectype, first_vect); 4842 for (k = 1; k < new_phis.length (); k++) 4843 { 4844 gimple *next_phi = new_phis[k]; 4845 tree second_vect = PHI_RESULT (next_phi); 4846 second_vect = gimple_convert (&stmts, vectype, second_vect); 4847 first_vect = gimple_build (&stmts, code, vectype, 4848 first_vect, second_vect); 4849 } 4850 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4851 4852 new_phi_result = first_vect; 4853 new_phis.truncate (0); 4854 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); 4855 } 4856 /* Likewise if we couldn't use a single defuse cycle. */ 4857 else if (ncopies > 1) 4858 { 4859 gcc_assert (new_phis.length () == 1); 4860 gimple_seq stmts = NULL; 4861 tree first_vect = PHI_RESULT (new_phis[0]); 4862 first_vect = gimple_convert (&stmts, vectype, first_vect); 4863 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]); 4864 for (int k = 1; k < ncopies; ++k) 4865 { 4866 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info); 4867 tree second_vect = PHI_RESULT (next_phi_info->stmt); 4868 second_vect = gimple_convert (&stmts, vectype, second_vect); 4869 first_vect = gimple_build (&stmts, code, vectype, 4870 first_vect, second_vect); 4871 } 4872 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4873 new_phi_result = first_vect; 4874 new_phis.truncate (0); 4875 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect)); 4876 } 4877 else 4878 new_phi_result = PHI_RESULT (new_phis[0]); 4879 4880 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION 4881 && reduc_fn != IFN_LAST) 4882 { 4883 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing 4884 various data values where the condition matched and another vector 4885 (INDUCTION_INDEX) containing all the indexes of those matches. 
We 4886 need to extract the last matching index (which will be the index with 4887 highest value) and use this to index into the data vector. 4888 For the case where there were no matches, the data vector will contain 4889 all default values and the index vector will be all zeros. */ 4890 4891 /* Get various versions of the type of the vector of indexes. */ 4892 tree index_vec_type = TREE_TYPE (induction_index); 4893 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type)); 4894 tree index_scalar_type = TREE_TYPE (index_vec_type); 4895 tree index_vec_cmp_type = truth_type_for (index_vec_type); 4896 4897 /* Get an unsigned integer version of the type of the data vector. */ 4898 int scalar_precision 4899 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 4900 tree scalar_type_unsigned = make_unsigned_type (scalar_precision); 4901 tree vectype_unsigned = build_vector_type 4902 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype)); 4903 4904 /* First we need to create a vector (ZERO_VEC) of zeros and another 4905 vector (MAX_INDEX_VEC) filled with the last matching index, which we 4906 can create using a MAX reduction and then expanding. 4907 In the case where the loop never made any matches, the max index will 4908 be zero. */ 4909 4910 /* Vector of {0, 0, 0,...}. */ 4911 tree zero_vec = build_zero_cst (vectype); 4912 4913 gimple_seq stmts = NULL; 4914 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); 4915 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4916 4917 /* Find maximum value from the vector of found indexes. */ 4918 tree max_index = make_ssa_name (index_scalar_type); 4919 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 4920 1, induction_index); 4921 gimple_call_set_lhs (max_index_stmt, max_index); 4922 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT); 4923 4924 /* Vector of {max_index, max_index, max_index,...}. 
*/ 4925 tree max_index_vec = make_ssa_name (index_vec_type); 4926 tree max_index_vec_rhs = build_vector_from_val (index_vec_type, 4927 max_index); 4928 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec, 4929 max_index_vec_rhs); 4930 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT); 4931 4932 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes 4933 with the vector (INDUCTION_INDEX) of found indexes, choosing values 4934 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC) 4935 otherwise. Only one value should match, resulting in a vector 4936 (VEC_COND) with one data value and the rest zeros. 4937 In the case where the loop never made any matches, every index will 4938 match, resulting in a vector with all data values (which will all be 4939 the default value). */ 4940 4941 /* Compare the max index vector to the vector of found indexes to find 4942 the position of the max value. */ 4943 tree vec_compare = make_ssa_name (index_vec_cmp_type); 4944 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR, 4945 induction_index, 4946 max_index_vec); 4947 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT); 4948 4949 /* Use the compare to choose either values from the data vector or 4950 zero. */ 4951 tree vec_cond = make_ssa_name (vectype); 4952 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR, 4953 vec_compare, new_phi_result, 4954 zero_vec); 4955 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT); 4956 4957 /* Finally we need to extract the data value from the vector (VEC_COND) 4958 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR 4959 reduction, but because this doesn't exist, we can use a MAX reduction 4960 instead. The data value might be signed or a float so we need to cast 4961 it first. 4962 In the case where the loop never made any matches, the data values are 4963 all identical, and so will reduce down correctly. 
*/ 4964 4965 /* Make the matched data values unsigned. */ 4966 tree vec_cond_cast = make_ssa_name (vectype_unsigned); 4967 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned, 4968 vec_cond); 4969 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast, 4970 VIEW_CONVERT_EXPR, 4971 vec_cond_cast_rhs); 4972 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT); 4973 4974 /* Reduce down to a scalar value. */ 4975 tree data_reduc = make_ssa_name (scalar_type_unsigned); 4976 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX, 4977 1, vec_cond_cast); 4978 gimple_call_set_lhs (data_reduc_stmt, data_reduc); 4979 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT); 4980 4981 /* Convert the reduced value back to the result type and set as the 4982 result. */ 4983 stmts = NULL; 4984 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type, 4985 data_reduc); 4986 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 4987 scalar_results.safe_push (new_temp); 4988 } 4989 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION 4990 && reduc_fn == IFN_LAST) 4991 { 4992 /* Condition reduction without supported IFN_REDUC_MAX. Generate 4993 idx = 0; 4994 idx_val = induction_index[0]; 4995 val = data_reduc[0]; 4996 for (idx = 0, val = init, i = 0; i < nelts; ++i) 4997 if (induction_index[i] > idx_val) 4998 val = data_reduc[i], idx_val = induction_index[i]; 4999 return val; */ 5000 5001 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result)); 5002 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index)); 5003 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype)); 5004 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index)); 5005 /* Enforced by vectorizable_reduction, which ensures we have target 5006 support before allowing a conditional reduction on variable-length 5007 vectors. 
*/ 5008 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant (); 5009 tree idx_val = NULL_TREE, val = NULL_TREE; 5010 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size) 5011 { 5012 tree old_idx_val = idx_val; 5013 tree old_val = val; 5014 idx_val = make_ssa_name (idx_eltype); 5015 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF, 5016 build3 (BIT_FIELD_REF, idx_eltype, 5017 induction_index, 5018 bitsize_int (el_size), 5019 bitsize_int (off))); 5020 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5021 val = make_ssa_name (data_eltype); 5022 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF, 5023 build3 (BIT_FIELD_REF, 5024 data_eltype, 5025 new_phi_result, 5026 bitsize_int (el_size), 5027 bitsize_int (off))); 5028 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5029 if (off != 0) 5030 { 5031 tree new_idx_val = idx_val; 5032 if (off != v_size - el_size) 5033 { 5034 new_idx_val = make_ssa_name (idx_eltype); 5035 epilog_stmt = gimple_build_assign (new_idx_val, 5036 MAX_EXPR, idx_val, 5037 old_idx_val); 5038 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5039 } 5040 tree new_val = make_ssa_name (data_eltype); 5041 epilog_stmt = gimple_build_assign (new_val, 5042 COND_EXPR, 5043 build2 (GT_EXPR, 5044 boolean_type_node, 5045 idx_val, 5046 old_idx_val), 5047 val, old_val); 5048 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5049 idx_val = new_idx_val; 5050 val = new_val; 5051 } 5052 } 5053 /* Convert the reduced value back to the result type and set as the 5054 result. */ 5055 gimple_seq stmts = NULL; 5056 val = gimple_convert (&stmts, scalar_type, val); 5057 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5058 scalar_results.safe_push (val); 5059 } 5060 5061 /* 2.3 Create the reduction code, using one of the three schemes described 5062 above. In SLP we simply need to extract all the elements from the 5063 vector (without reducing them), so we use scalar shifts. 
*/ 5064 else if (reduc_fn != IFN_LAST && !slp_reduc) 5065 { 5066 tree tmp; 5067 tree vec_elem_type; 5068 5069 /* Case 1: Create: 5070 v_out2 = reduc_expr <v_out1> */ 5071 5072 if (dump_enabled_p ()) 5073 dump_printf_loc (MSG_NOTE, vect_location, 5074 "Reduce using direct vector reduction.\n"); 5075 5076 gimple_seq stmts = NULL; 5077 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result); 5078 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result)); 5079 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn), 5080 vec_elem_type, new_phi_result); 5081 new_temp = gimple_convert (&stmts, scalar_type, new_temp); 5082 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5083 5084 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 5085 && induc_val) 5086 { 5087 /* Earlier we set the initial value to be a vector if induc_val 5088 values. Check the result and if it is induc_val then replace 5089 with the original initial value, unless induc_val is 5090 the same as initial_def already. */ 5091 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5092 induc_val); 5093 5094 tmp = make_ssa_name (new_scalar_dest); 5095 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5096 initial_def, new_temp); 5097 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5098 new_temp = tmp; 5099 } 5100 5101 scalar_results.safe_push (new_temp); 5102 } 5103 else if (direct_slp_reduc) 5104 { 5105 /* Here we create one vector for each of the REDUC_GROUP_SIZE results, 5106 with the elements for other SLP statements replaced with the 5107 neutral value. We can then do a normal reduction on each vector. */ 5108 5109 /* Enforced by vectorizable_reduction. 
*/ 5110 gcc_assert (new_phis.length () == 1); 5111 gcc_assert (pow2p_hwi (group_size)); 5112 5113 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis; 5114 vec<stmt_vec_info> orig_phis 5115 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node); 5116 gimple_seq seq = NULL; 5117 5118 /* Build a vector {0, 1, 2, ...}, with the same number of elements 5119 and the same element size as VECTYPE. */ 5120 tree index = build_index_vector (vectype, 0, 1); 5121 tree index_type = TREE_TYPE (index); 5122 tree index_elt_type = TREE_TYPE (index_type); 5123 tree mask_type = truth_type_for (index_type); 5124 5125 /* Create a vector that, for each element, identifies which of 5126 the REDUC_GROUP_SIZE results should use it. */ 5127 tree index_mask = build_int_cst (index_elt_type, group_size - 1); 5128 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index, 5129 build_vector_from_val (index_type, index_mask)); 5130 5131 /* Get a neutral vector value. This is simply a splat of the neutral 5132 scalar value if we have one, otherwise the initial scalar value 5133 is itself a neutral value. */ 5134 tree vector_identity = NULL_TREE; 5135 tree neutral_op = NULL_TREE; 5136 if (slp_node) 5137 { 5138 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info); 5139 neutral_op 5140 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, 5141 vectype, code, first != NULL); 5142 } 5143 if (neutral_op) 5144 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5145 neutral_op); 5146 for (unsigned int i = 0; i < group_size; ++i) 5147 { 5148 /* If there's no univeral neutral value, we can use the 5149 initial scalar value from the original PHI. This is used 5150 for MIN and MAX reduction, for example. 
*/ 5151 if (!neutral_op) 5152 { 5153 tree scalar_value 5154 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt, 5155 loop_preheader_edge (loop)); 5156 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype), 5157 scalar_value); 5158 vector_identity = gimple_build_vector_from_val (&seq, vectype, 5159 scalar_value); 5160 } 5161 5162 /* Calculate the equivalent of: 5163 5164 sel[j] = (index[j] == i); 5165 5166 which selects the elements of NEW_PHI_RESULT that should 5167 be included in the result. */ 5168 tree compare_val = build_int_cst (index_elt_type, i); 5169 compare_val = build_vector_from_val (index_type, compare_val); 5170 tree sel = gimple_build (&seq, EQ_EXPR, mask_type, 5171 index, compare_val); 5172 5173 /* Calculate the equivalent of: 5174 5175 vec = seq ? new_phi_result : vector_identity; 5176 5177 VEC is now suitable for a full vector reduction. */ 5178 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype, 5179 sel, new_phi_result, vector_identity); 5180 5181 /* Do the reduction and convert it to the appropriate type. */ 5182 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn), 5183 TREE_TYPE (vectype), vec); 5184 scalar = gimple_convert (&seq, scalar_type, scalar); 5185 scalar_results.safe_push (scalar); 5186 } 5187 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT); 5188 } 5189 else 5190 { 5191 bool reduce_with_shift; 5192 tree vec_temp; 5193 5194 gcc_assert (slp_reduc || new_phis.length () == 1); 5195 5196 /* See if the target wants to do the final (shift) reduction 5197 in a vector mode of smaller size and first reduce upper/lower 5198 halves against each other. 
*/ 5199 enum machine_mode mode1 = mode; 5200 tree stype = TREE_TYPE (vectype); 5201 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5202 unsigned nunits1 = nunits; 5203 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode 5204 && new_phis.length () == 1) 5205 { 5206 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); 5207 /* For SLP reductions we have to make sure lanes match up, but 5208 since we're doing individual element final reduction reducing 5209 vector width here is even more important. 5210 ??? We can also separate lanes with permutes, for the common 5211 case of power-of-two group-size odd/even extracts would work. */ 5212 if (slp_reduc && nunits != nunits1) 5213 { 5214 nunits1 = least_common_multiple (nunits1, group_size); 5215 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits); 5216 } 5217 } 5218 if (!slp_reduc 5219 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) 5220 nunits1 = GET_MODE_NUNITS (mode1).to_constant (); 5221 5222 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), 5223 stype, nunits1); 5224 reduce_with_shift = have_whole_vector_shift (mode1); 5225 if (!VECTOR_MODE_P (mode1)) 5226 reduce_with_shift = false; 5227 else 5228 { 5229 optab optab = optab_for_tree_code (code, vectype1, optab_default); 5230 if (optab_handler (optab, mode1) == CODE_FOR_nothing) 5231 reduce_with_shift = false; 5232 } 5233 5234 /* First reduce the vector to the desired vector size we should 5235 do shift reduction on by combining upper and lower halves. */ 5236 new_temp = new_phi_result; 5237 while (nunits > nunits1) 5238 { 5239 nunits /= 2; 5240 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype), 5241 stype, nunits); 5242 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1)); 5243 5244 /* The target has to make sure we support lowpart/highpart 5245 extraction, either via direct vector extract or through 5246 an integer mode punning. 
*/ 5247 tree dst1, dst2; 5248 if (convert_optab_handler (vec_extract_optab, 5249 TYPE_MODE (TREE_TYPE (new_temp)), 5250 TYPE_MODE (vectype1)) 5251 != CODE_FOR_nothing) 5252 { 5253 /* Extract sub-vectors directly once vec_extract becomes 5254 a conversion optab. */ 5255 dst1 = make_ssa_name (vectype1); 5256 epilog_stmt 5257 = gimple_build_assign (dst1, BIT_FIELD_REF, 5258 build3 (BIT_FIELD_REF, vectype1, 5259 new_temp, TYPE_SIZE (vectype1), 5260 bitsize_int (0))); 5261 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5262 dst2 = make_ssa_name (vectype1); 5263 epilog_stmt 5264 = gimple_build_assign (dst2, BIT_FIELD_REF, 5265 build3 (BIT_FIELD_REF, vectype1, 5266 new_temp, TYPE_SIZE (vectype1), 5267 bitsize_int (bitsize))); 5268 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5269 } 5270 else 5271 { 5272 /* Extract via punning to appropriately sized integer mode 5273 vector. */ 5274 tree eltype = build_nonstandard_integer_type (bitsize, 1); 5275 tree etype = build_vector_type (eltype, 2); 5276 gcc_assert (convert_optab_handler (vec_extract_optab, 5277 TYPE_MODE (etype), 5278 TYPE_MODE (eltype)) 5279 != CODE_FOR_nothing); 5280 tree tem = make_ssa_name (etype); 5281 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, 5282 build1 (VIEW_CONVERT_EXPR, 5283 etype, new_temp)); 5284 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5285 new_temp = tem; 5286 tem = make_ssa_name (eltype); 5287 epilog_stmt 5288 = gimple_build_assign (tem, BIT_FIELD_REF, 5289 build3 (BIT_FIELD_REF, eltype, 5290 new_temp, TYPE_SIZE (eltype), 5291 bitsize_int (0))); 5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5293 dst1 = make_ssa_name (vectype1); 5294 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, 5295 build1 (VIEW_CONVERT_EXPR, 5296 vectype1, tem)); 5297 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5298 tem = make_ssa_name (eltype); 5299 epilog_stmt 5300 = gimple_build_assign (tem, BIT_FIELD_REF, 5301 build3 
(BIT_FIELD_REF, eltype, 5302 new_temp, TYPE_SIZE (eltype), 5303 bitsize_int (bitsize))); 5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5305 dst2 = make_ssa_name (vectype1); 5306 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, 5307 build1 (VIEW_CONVERT_EXPR, 5308 vectype1, tem)); 5309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5310 } 5311 5312 new_temp = make_ssa_name (vectype1); 5313 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); 5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5315 new_phis[0] = epilog_stmt; 5316 } 5317 5318 if (reduce_with_shift && !slp_reduc) 5319 { 5320 int element_bitsize = tree_to_uhwi (bitsize); 5321 /* Enforced by vectorizable_reduction, which disallows SLP reductions 5322 for variable-length vectors and also requires direct target support 5323 for loop reductions. */ 5324 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5325 int nelements = vec_size_in_bits / element_bitsize; 5326 vec_perm_builder sel; 5327 vec_perm_indices indices; 5328 5329 int elt_offset; 5330 5331 tree zero_vec = build_zero_cst (vectype1); 5332 /* Case 2: Create: 5333 for (offset = nelements/2; offset >= 1; offset/=2) 5334 { 5335 Create: va' = vec_shift <va, offset> 5336 Create: va = vop <va, va'> 5337 } */ 5338 5339 tree rhs; 5340 5341 if (dump_enabled_p ()) 5342 dump_printf_loc (MSG_NOTE, vect_location, 5343 "Reduce using vector shifts\n"); 5344 5345 gimple_seq stmts = NULL; 5346 new_temp = gimple_convert (&stmts, vectype1, new_temp); 5347 for (elt_offset = nelements / 2; 5348 elt_offset >= 1; 5349 elt_offset /= 2) 5350 { 5351 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); 5352 indices.new_vector (sel, 2, nelements); 5353 tree mask = vect_gen_perm_mask_any (vectype1, indices); 5354 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1, 5355 new_temp, zero_vec, mask); 5356 new_temp = gimple_build (&stmts, code, 5357 vectype1, new_name, new_temp); 5358 } 5359 
gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5360 5361 /* 2.4 Extract the final scalar result. Create: 5362 s_out3 = extract_field <v_out2, bitpos> */ 5363 5364 if (dump_enabled_p ()) 5365 dump_printf_loc (MSG_NOTE, vect_location, 5366 "extract scalar result\n"); 5367 5368 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, 5369 bitsize, bitsize_zero_node); 5370 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); 5371 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); 5372 gimple_assign_set_lhs (epilog_stmt, new_temp); 5373 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5374 scalar_results.safe_push (new_temp); 5375 } 5376 else 5377 { 5378 /* Case 3: Create: 5379 s = extract_field <v_out2, 0> 5380 for (offset = element_size; 5381 offset < vector_size; 5382 offset += element_size;) 5383 { 5384 Create: s' = extract_field <v_out2, offset> 5385 Create: s = op <s, s'> // For non SLP cases 5386 } */ 5387 5388 if (dump_enabled_p ()) 5389 dump_printf_loc (MSG_NOTE, vect_location, 5390 "Reduce using scalar code.\n"); 5391 5392 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); 5393 int element_bitsize = tree_to_uhwi (bitsize); 5394 tree compute_type = TREE_TYPE (vectype); 5395 gimple_seq stmts = NULL; 5396 FOR_EACH_VEC_ELT (new_phis, i, new_phi) 5397 { 5398 int bit_offset; 5399 if (gimple_code (new_phi) == GIMPLE_PHI) 5400 vec_temp = PHI_RESULT (new_phi); 5401 else 5402 vec_temp = gimple_assign_lhs (new_phi); 5403 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type, 5404 vec_temp, bitsize, bitsize_zero_node); 5405 5406 /* In SLP we don't need to apply reduction operation, so we just 5407 collect s' values in SCALAR_RESULTS. 
*/ 5408 if (slp_reduc) 5409 scalar_results.safe_push (new_temp); 5410 5411 for (bit_offset = element_bitsize; 5412 bit_offset < vec_size_in_bits; 5413 bit_offset += element_bitsize) 5414 { 5415 tree bitpos = bitsize_int (bit_offset); 5416 new_name = gimple_build (&stmts, BIT_FIELD_REF, 5417 compute_type, vec_temp, 5418 bitsize, bitpos); 5419 if (slp_reduc) 5420 { 5421 /* In SLP we don't need to apply reduction operation, so 5422 we just collect s' values in SCALAR_RESULTS. */ 5423 new_temp = new_name; 5424 scalar_results.safe_push (new_name); 5425 } 5426 else 5427 new_temp = gimple_build (&stmts, code, compute_type, 5428 new_name, new_temp); 5429 } 5430 } 5431 5432 /* The only case where we need to reduce scalar results in SLP, is 5433 unrolling. If the size of SCALAR_RESULTS is greater than 5434 REDUC_GROUP_SIZE, we reduce them combining elements modulo 5435 REDUC_GROUP_SIZE. */ 5436 if (slp_reduc) 5437 { 5438 tree res, first_res, new_res; 5439 5440 /* Reduce multiple scalar results in case of SLP unrolling. */ 5441 for (j = group_size; scalar_results.iterate (j, &res); 5442 j++) 5443 { 5444 first_res = scalar_results[j % group_size]; 5445 new_res = gimple_build (&stmts, code, compute_type, 5446 first_res, res); 5447 scalar_results[j % group_size] = new_res; 5448 } 5449 for (k = 0; k < group_size; k++) 5450 scalar_results[k] = gimple_convert (&stmts, scalar_type, 5451 scalar_results[k]); 5452 } 5453 else 5454 { 5455 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */ 5456 new_temp = gimple_convert (&stmts, scalar_type, new_temp); 5457 scalar_results.safe_push (new_temp); 5458 } 5459 5460 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5461 } 5462 5463 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION) 5464 && induc_val) 5465 { 5466 /* Earlier we set the initial value to be a vector if induc_val 5467 values. 
Check the result and if it is induc_val then replace 5468 with the original initial value, unless induc_val is 5469 the same as initial_def already. */ 5470 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, 5471 induc_val); 5472 5473 tree tmp = make_ssa_name (new_scalar_dest); 5474 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare, 5475 initial_def, new_temp); 5476 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); 5477 scalar_results[0] = tmp; 5478 } 5479 } 5480 5481 /* 2.5 Adjust the final result by the initial value of the reduction 5482 variable. (When such adjustment is not needed, then 5483 'adjustment_def' is zero). For example, if code is PLUS we create: 5484 new_temp = loop_exit_def + adjustment_def */ 5485 5486 if (adjustment_def) 5487 { 5488 gcc_assert (!slp_reduc); 5489 gimple_seq stmts = NULL; 5490 if (nested_in_vect_loop) 5491 { 5492 new_phi = new_phis[0]; 5493 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def))); 5494 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def); 5495 new_temp = gimple_build (&stmts, code, vectype, 5496 PHI_RESULT (new_phi), adjustment_def); 5497 } 5498 else 5499 { 5500 new_temp = scalar_results[0]; 5501 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); 5502 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def); 5503 new_temp = gimple_build (&stmts, code, scalar_type, 5504 new_temp, adjustment_def); 5505 } 5506 5507 epilog_stmt = gimple_seq_last_stmt (stmts); 5508 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 5509 if (nested_in_vect_loop) 5510 { 5511 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt); 5512 STMT_VINFO_RELATED_STMT (epilog_stmt_info) 5513 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi)); 5514 5515 if (!double_reduc) 5516 scalar_results.quick_push (new_temp); 5517 else 5518 scalar_results[0] = new_temp; 5519 } 5520 else 5521 scalar_results[0] = new_temp; 5522 5523 new_phis[0] = epilog_stmt; 
5524 } 5525 5526 if (double_reduc) 5527 loop = loop->inner; 5528 5529 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit 5530 phis with new adjusted scalar results, i.e., replace use <s_out0> 5531 with use <s_out4>. 5532 5533 Transform: 5534 loop_exit: 5535 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5536 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5537 v_out2 = reduce <v_out1> 5538 s_out3 = extract_field <v_out2, 0> 5539 s_out4 = adjust_result <s_out3> 5540 use <s_out0> 5541 use <s_out0> 5542 5543 into: 5544 5545 loop_exit: 5546 s_out0 = phi <s_loop> # (scalar) EXIT_PHI 5547 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI 5548 v_out2 = reduce <v_out1> 5549 s_out3 = extract_field <v_out2, 0> 5550 s_out4 = adjust_result <s_out3> 5551 use <s_out4> 5552 use <s_out4> */ 5553 5554 5555 /* In SLP reduction chain we reduce vector results into one vector if 5556 necessary, hence we set here REDUC_GROUP_SIZE to 1. SCALAR_DEST is the 5557 LHS of the last stmt in the reduction chain, since we are looking for 5558 the loop exit phi node. */ 5559 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 5560 { 5561 stmt_vec_info dest_stmt_info 5562 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]); 5563 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt); 5564 group_size = 1; 5565 } 5566 5567 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in 5568 case that REDUC_GROUP_SIZE is greater than vectorization factor). 5569 Therefore, we need to match SCALAR_RESULTS with corresponding statements. 5570 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results 5571 correspond to the first vector stmt, etc. 5572 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). 
*/ 5573 if (group_size > new_phis.length ()) 5574 gcc_assert (!(group_size % new_phis.length ())); 5575 5576 for (k = 0; k < group_size; k++) 5577 { 5578 if (slp_reduc) 5579 { 5580 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k]; 5581 5582 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info); 5583 /* SLP statements can't participate in patterns. */ 5584 gcc_assert (!orig_stmt_info); 5585 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt); 5586 } 5587 5588 if (nested_in_vect_loop) 5589 { 5590 if (double_reduc) 5591 loop = outer_loop; 5592 else 5593 gcc_unreachable (); 5594 } 5595 5596 phis.create (3); 5597 /* Find the loop-closed-use at the loop exit of the original scalar 5598 result. (The reduction result is expected to have two immediate uses, 5599 one at the latch block, and one at the loop exit). For double 5600 reductions we are looking for exit phis of the outer loop. */ 5601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) 5602 { 5603 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) 5604 { 5605 if (!is_gimple_debug (USE_STMT (use_p))) 5606 phis.safe_push (USE_STMT (use_p)); 5607 } 5608 else 5609 { 5610 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI) 5611 { 5612 tree phi_res = PHI_RESULT (USE_STMT (use_p)); 5613 5614 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res) 5615 { 5616 if (!flow_bb_inside_loop_p (loop, 5617 gimple_bb (USE_STMT (phi_use_p))) 5618 && !is_gimple_debug (USE_STMT (phi_use_p))) 5619 phis.safe_push (USE_STMT (phi_use_p)); 5620 } 5621 } 5622 } 5623 } 5624 5625 FOR_EACH_VEC_ELT (phis, i, exit_phi) 5626 { 5627 /* Replace the uses: */ 5628 orig_name = PHI_RESULT (exit_phi); 5629 scalar_result = scalar_results[k]; 5630 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) 5631 { 5632 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 5633 SET_USE (use_p, scalar_result); 5634 update_stmt (use_stmt); 5635 } 5636 } 5637 5638 phis.release (); 5639 } 5640} 5641 5642/* Return a 
vector of type VECTYPE that is equal to the vector select 5643 operation "MASK ? VEC : IDENTITY". Insert the select statements 5644 before GSI. */ 5645 5646static tree 5647merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype, 5648 tree vec, tree identity) 5649{ 5650 tree cond = make_temp_ssa_name (vectype, NULL, "cond"); 5651 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR, 5652 mask, vec, identity); 5653 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5654 return cond; 5655} 5656 5657/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right 5658 order, starting with LHS. Insert the extraction statements before GSI and 5659 associate the new scalar SSA names with variable SCALAR_DEST. 5660 Return the SSA name for the result. */ 5661 5662static tree 5663vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, 5664 tree_code code, tree lhs, tree vector_rhs) 5665{ 5666 tree vectype = TREE_TYPE (vector_rhs); 5667 tree scalar_type = TREE_TYPE (vectype); 5668 tree bitsize = TYPE_SIZE (scalar_type); 5669 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); 5670 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize); 5671 5672 for (unsigned HOST_WIDE_INT bit_offset = 0; 5673 bit_offset < vec_size_in_bits; 5674 bit_offset += element_bitsize) 5675 { 5676 tree bitpos = bitsize_int (bit_offset); 5677 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs, 5678 bitsize, bitpos); 5679 5680 gassign *stmt = gimple_build_assign (scalar_dest, rhs); 5681 rhs = make_ssa_name (scalar_dest, stmt); 5682 gimple_assign_set_lhs (stmt, rhs); 5683 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 5684 5685 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs); 5686 tree new_name = make_ssa_name (scalar_dest, stmt); 5687 gimple_assign_set_lhs (stmt, new_name); 5688 gsi_insert_before (gsi, stmt, GSI_SAME_STMT); 5689 lhs = new_name; 5690 } 5691 return lhs; 5692} 5693 5694/* Get a masked internal 
function equivalent to REDUC_FN. VECTYPE_IN is the 5695 type of the vector input. */ 5696 5697static internal_fn 5698get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) 5699{ 5700 internal_fn mask_reduc_fn; 5701 5702 switch (reduc_fn) 5703 { 5704 case IFN_FOLD_LEFT_PLUS: 5705 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; 5706 break; 5707 5708 default: 5709 return IFN_LAST; 5710 } 5711 5712 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, 5713 OPTIMIZE_FOR_SPEED)) 5714 return mask_reduc_fn; 5715 return IFN_LAST; 5716} 5717 5718/* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the 5719 statement that sets the live-out value. REDUC_DEF_STMT is the phi 5720 statement. CODE is the operation performed by STMT_INFO and OPS are 5721 its scalar operands. REDUC_INDEX is the index of the operand in 5722 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that 5723 implements in-order reduction, or IFN_LAST if we should open-code it. 5724 VECTYPE_IN is the type of the vector input. MASKS specifies the masks 5725 that should be used to control the operation in a fully-masked loop. 
*/

static bool
vectorize_fold_left_reduction (stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       stmt_vec_info *vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       tree_code code, internal_fn reduc_fn,
			       tree ops[3], tree vectype_in,
			       int reduc_index, vec_loop_masks *masks)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  stmt_vec_info new_stmt_info = NULL;
  /* IFN_LAST if the target has no masked equivalent of REDUC_FN for
     this vector type.  */
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  /* In-order reductions are only handled for single-copy binary
     operations in the innermost loop.  */
  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);
  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);

  if (slp_node)
    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			  TYPE_VECTOR_SUBPARTS (vectype_in)));

  /* The operand of the reduction statement that is not the accumulator.  */
  tree op0 = ops[1 - reduc_index];

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0;
  if (slp_node)
    {
      /* Collect the vectorized defs of the non-reduction operand from
	 the SLP node; the defs of the reduction operand are released
	 unused.  */
      auto_vec<vec<tree> > vec_defs (2);
      vect_get_slp_defs (slp_node, &vec_defs);
      vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
      vec_defs[0].release ();
      vec_defs[1].release ();
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
    }
  else
    {
      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
      vec_oprnds0.create (1);
      vec_oprnds0.quick_push (loop_vec_def0);
      scalar_dest_def_info = stmt_info;
    }

  tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  /* In a fully-masked loop, inactive lanes are replaced by the zero
     identity so that they do not change the reduction result.  */
  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    vector_identity = build_zero_cst (vectype_out);

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  int i;
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      /* Only needed when there is no masked reduction ifn: apply the
	 mask by selecting the identity for inactive lanes.  */
      if (mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && mask_reduc_fn != IFN_LAST)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);
	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next.  For the final
	     statement the output cannot be a temporary - we reuse the
	     original scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  /* No in-order reduction ifn: open-code the reduction as a
	     chain of element extractions and scalar CODE operations.  */
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
					     reduc_var, def0);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
						    new_stmt);
	}
      else
	new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
						     new_stmt, gsi);

      if (slp_node)
	SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
    }

  if (!slp_node)
    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;

  return true;
}

/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */

static bool
is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
{
  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
  /* NOTE(review): lhs_max appears unused here — kept for fidelity.  */
  widest_int ni, max_loop_value, lhs_max;
  wi::overflow_type overflow = wi::OVF_NONE;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  /* With undefined overflow the induction cannot legally wrap.  */
  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (!
max_stmt_executions (loop, &ni))
    return false;

  /* NI bounds the number of iterations; check STEP * NI for overflow
     in the widest integer type.  */
  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
			    &overflow);
  if (overflow)
    return false;

  /* Likewise for BASE + STEP * NI, the largest value the induction
     can reach.  */
  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
			    TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  /* The induction does not wrap iff its largest value still fits in
     the precision of the PHI result type.  */
  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
	  <= TYPE_PRECISION (lhs_type));
}

/* Check if masking can be supported by inserting a conditional expression.
   CODE is the code for the operation.  COND_FN is the conditional internal
   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
static bool
use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
			 tree vectype_in)
{
  /* If the target supports the conditional internal function directly,
     prefer that; no COND_EXPR workaround is needed.  */
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype_in,
					 OPTIMIZE_FOR_SPEED))
    return false;

  /* Only these codes have a known way of neutralizing inactive lanes
     via a VEC_COND_EXPR (see build_vect_cond_expr).  */
  switch (code)
    {
    case DOT_PROD_EXPR:
    case SAD_EXPR:
      return true;

    default:
      return false;
    }
}

/* Insert a conditional expression to enable masked vectorization.  CODE is the
   code for the operation.  VOP is the array of operands.  MASK is the loop
   mask.  GSI is a statement iterator used to place the new conditional
   expression.
*/ 5937static void 5938build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask, 5939 gimple_stmt_iterator *gsi) 5940{ 5941 switch (code) 5942 { 5943 case DOT_PROD_EXPR: 5944 { 5945 tree vectype = TREE_TYPE (vop[1]); 5946 tree zero = build_zero_cst (vectype); 5947 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); 5948 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, 5949 mask, vop[1], zero); 5950 gsi_insert_before (gsi, select, GSI_SAME_STMT); 5951 vop[1] = masked_op1; 5952 break; 5953 } 5954 5955 case SAD_EXPR: 5956 { 5957 tree vectype = TREE_TYPE (vop[1]); 5958 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1"); 5959 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR, 5960 mask, vop[1], vop[0]); 5961 gsi_insert_before (gsi, select, GSI_SAME_STMT); 5962 vop[1] = masked_op1; 5963 break; 5964 } 5965 5966 default: 5967 gcc_unreachable (); 5968 } 5969} 5970 5971/* Function vectorizable_reduction. 5972 5973 Check if STMT_INFO performs a reduction operation that can be vectorized. 5974 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized 5975 stmt to replace it, put it in VEC_STMT, and insert it at GSI. 5976 Return true if STMT_INFO is vectorizable in this way. 5977 5978 This function also handles reduction idioms (patterns) that have been 5979 recognized in advance during vect_pattern_recog. In this case, STMT_INFO 5980 may be of this form: 5981 X = pattern_expr (arg0, arg1, ..., X) 5982 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original 5983 sequence that had been detected and replaced by the pattern-stmt 5984 (STMT_INFO). 5985 5986 This function also handles reduction of condition expressions, for example: 5987 for (int i = 0; i < N; i++) 5988 if (a[i] < value) 5989 last = a[i]; 5990 This is handled by vectorising the loop and creating an additional vector 5991 containing the loop indexes for which "a[i] < value" was true. 
In the 5992 function epilogue this is reduced to a single max value and then used to 5993 index into the vector of results. 5994 5995 In some cases of reduction patterns, the type of the reduction variable X is 5996 different than the type of the other arguments of STMT_INFO. 5997 In such cases, the vectype that is used when transforming STMT_INFO into 5998 a vector stmt is different than the vectype that is used to determine the 5999 vectorization factor, because it consists of a different number of elements 6000 than the actual number of elements that are being operated upon in parallel. 6001 6002 For example, consider an accumulation of shorts into an int accumulator. 6003 On some targets it's possible to vectorize this pattern operating on 8 6004 shorts at a time (hence, the vectype for purposes of determining the 6005 vectorization factor should be V8HI); on the other hand, the vectype that 6006 is used to create the vector form is actually V4SI (the type of the result). 6007 6008 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that 6009 indicates what is the actual level of parallelism (V8HI in the example), so 6010 that the right vectorization factor would be derived. This vectype 6011 corresponds to the type of arguments to the reduction stmt, and should *NOT* 6012 be used to create the vectorized stmt. The right vectype for the vectorized 6013 stmt is obtained from the type of the result X: 6014 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) 6015 6016 This means that, contrary to "regular" reductions (or "regular" stmts in 6017 general), the following equation: 6018 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X)) 6019 does *NOT* necessarily hold for reduction patterns. 
*/ 6020 6021bool 6022vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node, 6023 slp_instance slp_node_instance, 6024 stmt_vector_for_cost *cost_vec) 6025{ 6026 tree scalar_dest; 6027 tree vectype_in = NULL_TREE; 6028 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6029 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 6030 enum vect_def_type cond_reduc_dt = vect_unknown_def_type; 6031 stmt_vec_info cond_stmt_vinfo = NULL; 6032 tree scalar_type; 6033 int i; 6034 int ncopies; 6035 bool single_defuse_cycle = false; 6036 bool nested_cycle = false; 6037 bool double_reduc = false; 6038 int vec_num; 6039 tree tem; 6040 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE; 6041 tree cond_reduc_val = NULL_TREE; 6042 6043 /* Make sure it was already recognized as a reduction computation. */ 6044 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def 6045 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def 6046 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) 6047 return false; 6048 6049 /* The stmt we store reduction analysis meta on. */ 6050 stmt_vec_info reduc_info = info_for_reduction (stmt_info); 6051 reduc_info->is_reduc_info = true; 6052 6053 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle) 6054 { 6055 if (is_a <gphi *> (stmt_info->stmt)) 6056 /* Analysis for double-reduction is done on the outer 6057 loop PHI, nested cycles have no further restrictions. 
*/ 6058 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type; 6059 else 6060 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6061 return true; 6062 } 6063 6064 stmt_vec_info orig_stmt_of_analysis = stmt_info; 6065 stmt_vec_info phi_info = stmt_info; 6066 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 6067 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 6068 { 6069 if (!is_a <gphi *> (stmt_info->stmt)) 6070 { 6071 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; 6072 return true; 6073 } 6074 if (slp_node) 6075 { 6076 slp_node_instance->reduc_phis = slp_node; 6077 /* ??? We're leaving slp_node to point to the PHIs, we only 6078 need it to get at the number of vector stmts which wasn't 6079 yet initialized for the instance root. */ 6080 } 6081 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) 6082 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info)); 6083 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */ 6084 { 6085 use_operand_p use_p; 6086 gimple *use_stmt; 6087 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt), 6088 &use_p, &use_stmt); 6089 gcc_assert (res); 6090 phi_info = loop_vinfo->lookup_stmt (use_stmt); 6091 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); 6092 } 6093 } 6094 6095 /* PHIs should not participate in patterns. */ 6096 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); 6097 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt); 6098 6099 /* Verify following REDUC_IDX from the latch def leads us back to the PHI 6100 and compute the reduction chain length. 
*/ 6101 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, 6102 loop_latch_edge (loop)); 6103 unsigned reduc_chain_length = 0; 6104 bool only_slp_reduc_chain = true; 6105 stmt_info = NULL; 6106 while (reduc_def != PHI_RESULT (reduc_def_phi)) 6107 { 6108 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def); 6109 stmt_vec_info vdef = vect_stmt_to_vectorize (def); 6110 if (STMT_VINFO_REDUC_IDX (vdef) == -1) 6111 { 6112 if (dump_enabled_p ()) 6113 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6114 "reduction chain broken by patterns.\n"); 6115 return false; 6116 } 6117 if (!REDUC_GROUP_FIRST_ELEMENT (vdef)) 6118 only_slp_reduc_chain = false; 6119 /* ??? For epilogue generation live members of the chain need 6120 to point back to the PHI via their original stmt for 6121 info_for_reduction to work. */ 6122 if (STMT_VINFO_LIVE_P (vdef)) 6123 STMT_VINFO_REDUC_DEF (def) = phi_info; 6124 gassign *assign = dyn_cast <gassign *> (vdef->stmt); 6125 if (!assign) 6126 { 6127 if (dump_enabled_p ()) 6128 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6129 "reduction chain includes calls.\n"); 6130 return false; 6131 } 6132 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))) 6133 { 6134 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)), 6135 TREE_TYPE (gimple_assign_rhs1 (assign)))) 6136 { 6137 if (dump_enabled_p ()) 6138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6139 "conversion in the reduction chain.\n"); 6140 return false; 6141 } 6142 } 6143 else if (!stmt_info) 6144 /* First non-conversion stmt. */ 6145 stmt_info = vdef; 6146 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef)); 6147 reduc_chain_length++; 6148 } 6149 /* PHIs should not participate in patterns. 
*/ 6150 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info)); 6151 6152 if (nested_in_vect_loop_p (loop, stmt_info)) 6153 { 6154 loop = loop->inner; 6155 nested_cycle = true; 6156 } 6157 6158 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last 6159 element. */ 6160 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6161 { 6162 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info)); 6163 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); 6164 } 6165 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6166 gcc_assert (slp_node 6167 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info); 6168 6169 /* 1. Is vectorizable reduction? */ 6170 /* Not supportable if the reduction variable is used in the loop, unless 6171 it's a reduction chain. */ 6172 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer 6173 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6174 return false; 6175 6176 /* Reductions that are not used even in an enclosing outer-loop, 6177 are expected to be "live" (used out of the loop). */ 6178 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope 6179 && !STMT_VINFO_LIVE_P (stmt_info)) 6180 return false; 6181 6182 /* 2. Has this been recognized as a reduction pattern? 6183 6184 Check if STMT represents a pattern that has been recognized 6185 in earlier analysis stages. For stmts that represent a pattern, 6186 the STMT_VINFO_RELATED_STMT field records the last stmt in 6187 the original sequence that constitutes the pattern. */ 6188 6189 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info); 6190 if (orig_stmt_info) 6191 { 6192 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); 6193 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); 6194 } 6195 6196 /* 3. Check the operands of the operation. The first operands are defined 6197 inside the loop body. The last operand is the reduction variable, 6198 which is defined by the loop-header-phi. 
*/ 6199 6200 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); 6201 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out; 6202 gassign *stmt = as_a <gassign *> (stmt_info->stmt); 6203 enum tree_code code = gimple_assign_rhs_code (stmt); 6204 bool lane_reduc_code_p 6205 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR); 6206 int op_type = TREE_CODE_LENGTH (code); 6207 6208 scalar_dest = gimple_assign_lhs (stmt); 6209 scalar_type = TREE_TYPE (scalar_dest); 6210 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) 6211 && !SCALAR_FLOAT_TYPE_P (scalar_type)) 6212 return false; 6213 6214 /* Do not try to vectorize bit-precision reductions. */ 6215 if (!type_has_mode_precision_p (scalar_type)) 6216 return false; 6217 6218 /* For lane-reducing ops we're reducing the number of reduction PHIs 6219 which means the only use of that may be in the lane-reducing operation. */ 6220 if (lane_reduc_code_p 6221 && reduc_chain_length != 1 6222 && !only_slp_reduc_chain) 6223 { 6224 if (dump_enabled_p ()) 6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6226 "lane-reducing reduction with extra stmts.\n"); 6227 return false; 6228 } 6229 6230 /* All uses but the last are expected to be defined in the loop. 6231 The last use is the reduction variable. In case of nested cycle this 6232 assumption is not true: we use reduc_index to record the index of the 6233 reduction variable. */ 6234 reduc_def = PHI_RESULT (reduc_def_phi); 6235 for (i = 0; i < op_type; i++) 6236 { 6237 tree op = gimple_op (stmt, i + 1); 6238 /* The condition of COND_EXPR is checked in vectorizable_condition(). 
*/ 6239 if (i == 0 && code == COND_EXPR) 6240 continue; 6241 6242 stmt_vec_info def_stmt_info; 6243 enum vect_def_type dt; 6244 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem, 6245 &def_stmt_info)) 6246 { 6247 if (dump_enabled_p ()) 6248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6249 "use not simple.\n"); 6250 return false; 6251 } 6252 if (i == STMT_VINFO_REDUC_IDX (stmt_info)) 6253 continue; 6254 6255 /* There should be only one cycle def in the stmt, the one 6256 leading to reduc_def. */ 6257 if (VECTORIZABLE_CYCLE_DEF (dt)) 6258 return false; 6259 6260 /* To properly compute ncopies we are interested in the widest 6261 non-reduction input type in case we're looking at a widening 6262 accumulation that we later handle in vect_transform_reduction. */ 6263 if (lane_reduc_code_p 6264 && tem 6265 && (!vectype_in 6266 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in))) 6267 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))) 6268 vectype_in = tem; 6269 6270 if (code == COND_EXPR) 6271 { 6272 /* Record how the non-reduction-def value of COND_EXPR is defined. */ 6273 if (dt == vect_constant_def) 6274 { 6275 cond_reduc_dt = dt; 6276 cond_reduc_val = op; 6277 } 6278 if (dt == vect_induction_def 6279 && def_stmt_info 6280 && is_nonwrapping_integer_induction (def_stmt_info, loop)) 6281 { 6282 cond_reduc_dt = dt; 6283 cond_stmt_vinfo = def_stmt_info; 6284 } 6285 } 6286 } 6287 if (!vectype_in) 6288 vectype_in = STMT_VINFO_VECTYPE (phi_info); 6289 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in; 6290 6291 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info); 6292 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type; 6293 /* If we have a condition reduction, see if we can simplify it further. */ 6294 if (v_reduc_type == COND_REDUCTION) 6295 { 6296 if (slp_node) 6297 return false; 6298 6299 /* When the condition uses the reduction value in the condition, fail. 
*/ 6300 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0) 6301 { 6302 if (dump_enabled_p ()) 6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6304 "condition depends on previous iteration\n"); 6305 return false; 6306 } 6307 6308 if (reduc_chain_length == 1 6309 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, 6310 vectype_in, OPTIMIZE_FOR_SPEED)) 6311 { 6312 if (dump_enabled_p ()) 6313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6314 "optimizing condition reduction with" 6315 " FOLD_EXTRACT_LAST.\n"); 6316 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION; 6317 } 6318 else if (cond_reduc_dt == vect_induction_def) 6319 { 6320 tree base 6321 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo); 6322 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo); 6323 6324 gcc_assert (TREE_CODE (base) == INTEGER_CST 6325 && TREE_CODE (step) == INTEGER_CST); 6326 cond_reduc_val = NULL_TREE; 6327 enum tree_code cond_reduc_op_code = ERROR_MARK; 6328 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo)); 6329 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base))) 6330 ; 6331 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR 6332 above base; punt if base is the minimum value of the type for 6333 MAX_EXPR or maximum value of the type for MIN_EXPR for now. 
*/ 6334 else if (tree_int_cst_sgn (step) == -1) 6335 { 6336 cond_reduc_op_code = MIN_EXPR; 6337 if (tree_int_cst_sgn (base) == -1) 6338 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6339 else if (tree_int_cst_lt (base, 6340 TYPE_MAX_VALUE (TREE_TYPE (base)))) 6341 cond_reduc_val 6342 = int_const_binop (PLUS_EXPR, base, integer_one_node); 6343 } 6344 else 6345 { 6346 cond_reduc_op_code = MAX_EXPR; 6347 if (tree_int_cst_sgn (base) == 1) 6348 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0); 6349 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)), 6350 base)) 6351 cond_reduc_val 6352 = int_const_binop (MINUS_EXPR, base, integer_one_node); 6353 } 6354 if (cond_reduc_val) 6355 { 6356 if (dump_enabled_p ()) 6357 dump_printf_loc (MSG_NOTE, vect_location, 6358 "condition expression based on " 6359 "integer induction.\n"); 6360 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code; 6361 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) 6362 = cond_reduc_val; 6363 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION; 6364 } 6365 } 6366 else if (cond_reduc_dt == vect_constant_def) 6367 { 6368 enum vect_def_type cond_initial_dt; 6369 tree cond_initial_val 6370 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop)); 6371 6372 gcc_assert (cond_reduc_val != NULL_TREE); 6373 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt); 6374 if (cond_initial_dt == vect_constant_def 6375 && types_compatible_p (TREE_TYPE (cond_initial_val), 6376 TREE_TYPE (cond_reduc_val))) 6377 { 6378 tree e = fold_binary (LE_EXPR, boolean_type_node, 6379 cond_initial_val, cond_reduc_val); 6380 if (e && (integer_onep (e) || integer_zerop (e))) 6381 { 6382 if (dump_enabled_p ()) 6383 dump_printf_loc (MSG_NOTE, vect_location, 6384 "condition expression based on " 6385 "compile time constant.\n"); 6386 /* Record reduction code at analysis stage. */ 6387 STMT_VINFO_REDUC_CODE (reduc_info) 6388 = integer_onep (e) ? 
MAX_EXPR : MIN_EXPR; 6389 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION; 6390 } 6391 } 6392 } 6393 } 6394 6395 if (STMT_VINFO_LIVE_P (phi_info)) 6396 return false; 6397 6398 if (slp_node) 6399 ncopies = 1; 6400 else 6401 ncopies = vect_get_num_copies (loop_vinfo, vectype_in); 6402 6403 gcc_assert (ncopies >= 1); 6404 6405 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); 6406 6407 if (nested_cycle) 6408 { 6409 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) 6410 == vect_double_reduction_def); 6411 double_reduc = true; 6412 } 6413 6414 /* 4.2. Check support for the epilog operation. 6415 6416 If STMT represents a reduction pattern, then the type of the 6417 reduction variable may be different than the type of the rest 6418 of the arguments. For example, consider the case of accumulation 6419 of shorts into an int accumulator; The original code: 6420 S1: int_a = (int) short_a; 6421 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; 6422 6423 was replaced with: 6424 STMT: int_acc = widen_sum <short_a, int_acc> 6425 6426 This means that: 6427 1. The tree-code that is used to create the vector operation in the 6428 epilog code (that reduces the partial results) is not the 6429 tree-code of STMT, but is rather the tree-code of the original 6430 stmt from the pattern that STMT is replacing. I.e, in the example 6431 above we want to use 'widen_sum' in the loop, but 'plus' in the 6432 epilog. 6433 2. The type (mode) we use to check available target support 6434 for the vector operation to be created in the *epilog*, is 6435 determined by the type of the reduction variable (in the example 6436 above we'd check this: optab_handler (plus_optab, vect_int_mode])). 6437 However the type (mode) we use to check available target support 6438 for the vector operation to be created *inside the loop*, is 6439 determined by the type of the other arguments to STMT (in the 6440 example we'd check this: optab_handler (widen_sum_optab, 6441 vect_short_mode)). 
6442 6443 This is contrary to "regular" reductions, in which the types of all 6444 the arguments are the same as the type of the reduction variable. 6445 For "regular" reductions we can therefore use the same vector type 6446 (and also the same tree-code) when generating the epilog code and 6447 when generating the code inside the loop. */ 6448 6449 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info); 6450 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code; 6451 6452 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info); 6453 if (reduction_type == TREE_CODE_REDUCTION) 6454 { 6455 /* Check whether it's ok to change the order of the computation. 6456 Generally, when vectorizing a reduction we change the order of the 6457 computation. This may change the behavior of the program in some 6458 cases, so we need to check that this is ok. One exception is when 6459 vectorizing an outer-loop: the inner-loop is executed sequentially, 6460 and therefore vectorizing reductions in the inner-loop during 6461 outer-loop vectorization is safe. */ 6462 if (needs_fold_left_reduction_p (scalar_type, orig_code)) 6463 { 6464 /* When vectorizing a reduction chain w/o SLP the reduction PHI 6465 is not directy used in stmt. 
*/ 6466 if (!only_slp_reduc_chain 6467 && reduc_chain_length != 1) 6468 { 6469 if (dump_enabled_p ()) 6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6471 "in-order reduction chain without SLP.\n"); 6472 return false; 6473 } 6474 STMT_VINFO_REDUC_TYPE (reduc_info) 6475 = reduction_type = FOLD_LEFT_REDUCTION; 6476 } 6477 else if (!commutative_tree_code (orig_code) 6478 || !associative_tree_code (orig_code)) 6479 { 6480 if (dump_enabled_p ()) 6481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6482 "reduction: not commutative/associative"); 6483 return false; 6484 } 6485 } 6486 6487 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION) 6488 && ncopies > 1) 6489 { 6490 if (dump_enabled_p ()) 6491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6492 "multiple types in double reduction or condition " 6493 "reduction or fold-left reduction.\n"); 6494 return false; 6495 } 6496 6497 internal_fn reduc_fn = IFN_LAST; 6498 if (reduction_type == TREE_CODE_REDUCTION 6499 || reduction_type == FOLD_LEFT_REDUCTION 6500 || reduction_type == INTEGER_INDUC_COND_REDUCTION 6501 || reduction_type == CONST_COND_REDUCTION) 6502 { 6503 if (reduction_type == FOLD_LEFT_REDUCTION 6504 ? 
fold_left_reduction_fn (orig_code, &reduc_fn) 6505 : reduction_fn_for_scalar_code (orig_code, &reduc_fn)) 6506 { 6507 if (reduc_fn != IFN_LAST 6508 && !direct_internal_fn_supported_p (reduc_fn, vectype_out, 6509 OPTIMIZE_FOR_SPEED)) 6510 { 6511 if (dump_enabled_p ()) 6512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6513 "reduc op not supported by target.\n"); 6514 6515 reduc_fn = IFN_LAST; 6516 } 6517 } 6518 else 6519 { 6520 if (!nested_cycle || double_reduc) 6521 { 6522 if (dump_enabled_p ()) 6523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6524 "no reduc code for scalar code.\n"); 6525 6526 return false; 6527 } 6528 } 6529 } 6530 else if (reduction_type == COND_REDUCTION) 6531 { 6532 int scalar_precision 6533 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type)); 6534 cr_index_scalar_type = make_unsigned_type (scalar_precision); 6535 cr_index_vector_type = build_vector_type (cr_index_scalar_type, 6536 nunits_out); 6537 6538 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type, 6539 OPTIMIZE_FOR_SPEED)) 6540 reduc_fn = IFN_REDUC_MAX; 6541 } 6542 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn; 6543 6544 if (reduction_type != EXTRACT_LAST_REDUCTION 6545 && (!nested_cycle || double_reduc) 6546 && reduc_fn == IFN_LAST 6547 && !nunits_out.is_constant ()) 6548 { 6549 if (dump_enabled_p ()) 6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6551 "missing target support for reduction on" 6552 " variable-length vectors.\n"); 6553 return false; 6554 } 6555 6556 /* For SLP reductions, see if there is a neutral value we can use. 
*/ 6557 tree neutral_op = NULL_TREE; 6558 if (slp_node) 6559 neutral_op = neutral_op_for_slp_reduction 6560 (slp_node_instance->reduc_phis, vectype_out, orig_code, 6561 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL); 6562 6563 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION) 6564 { 6565 /* We can't support in-order reductions of code such as this: 6566 6567 for (int i = 0; i < n1; ++i) 6568 for (int j = 0; j < n2; ++j) 6569 l += a[j]; 6570 6571 since GCC effectively transforms the loop when vectorizing: 6572 6573 for (int i = 0; i < n1 / VF; ++i) 6574 for (int j = 0; j < n2; ++j) 6575 for (int k = 0; k < VF; ++k) 6576 l += a[j]; 6577 6578 which is a reassociation of the original operation. */ 6579 if (dump_enabled_p ()) 6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6581 "in-order double reduction not supported.\n"); 6582 6583 return false; 6584 } 6585 6586 if (reduction_type == FOLD_LEFT_REDUCTION 6587 && slp_node 6588 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 6589 { 6590 /* We cannot use in-order reductions in this case because there is 6591 an implicit reassociation of the operations involved. */ 6592 if (dump_enabled_p ()) 6593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6594 "in-order unchained SLP reductions not supported.\n"); 6595 return false; 6596 } 6597 6598 /* For double reductions, and for SLP reductions with a neutral value, 6599 we construct a variable-length initial vector by loading a vector 6600 full of the neutral value and then shift-and-inserting the start 6601 values into the low-numbered elements. 
*/ 6602 if ((double_reduc || neutral_op) 6603 && !nunits_out.is_constant () 6604 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT, 6605 vectype_out, OPTIMIZE_FOR_SPEED)) 6606 { 6607 if (dump_enabled_p ()) 6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6609 "reduction on variable-length vectors requires" 6610 " target support for a vector-shift-and-insert" 6611 " operation.\n"); 6612 return false; 6613 } 6614 6615 /* Check extra constraints for variable-length unchained SLP reductions. */ 6616 if (STMT_SLP_TYPE (stmt_info) 6617 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) 6618 && !nunits_out.is_constant ()) 6619 { 6620 /* We checked above that we could build the initial vector when 6621 there's a neutral element value. Check here for the case in 6622 which each SLP statement has its own initial value and in which 6623 that value needs to be repeated for every instance of the 6624 statement within the initial vector. */ 6625 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); 6626 if (!neutral_op 6627 && !can_duplicate_and_interleave_p (loop_vinfo, group_size, 6628 TREE_TYPE (vectype_out))) 6629 { 6630 if (dump_enabled_p ()) 6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6632 "unsupported form of SLP reduction for" 6633 " variable-length vectors: cannot build" 6634 " initial vector.\n"); 6635 return false; 6636 } 6637 /* The epilogue code relies on the number of elements being a multiple 6638 of the group size. The duplicate-and-interleave approach to setting 6639 up the initial vector does too. */ 6640 if (!multiple_p (nunits_out, group_size)) 6641 { 6642 if (dump_enabled_p ()) 6643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6644 "unsupported form of SLP reduction for" 6645 " variable-length vectors: the vector size" 6646 " is not a multiple of the number of results.\n"); 6647 return false; 6648 } 6649 } 6650 6651 if (reduction_type == COND_REDUCTION) 6652 { 6653 widest_int ni; 6654 6655 if (! 
max_loop_iterations (loop, &ni)) 6656 { 6657 if (dump_enabled_p ()) 6658 dump_printf_loc (MSG_NOTE, vect_location, 6659 "loop count not known, cannot create cond " 6660 "reduction.\n"); 6661 return false; 6662 } 6663 /* Convert backedges to iterations. */ 6664 ni += 1; 6665 6666 /* The additional index will be the same type as the condition. Check 6667 that the loop can fit into this less one (because we'll use up the 6668 zero slot for when there are no matches). */ 6669 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type); 6670 if (wi::geu_p (ni, wi::to_widest (max_index))) 6671 { 6672 if (dump_enabled_p ()) 6673 dump_printf_loc (MSG_NOTE, vect_location, 6674 "loop size is greater than data size.\n"); 6675 return false; 6676 } 6677 } 6678 6679 /* In case the vectorization factor (VF) is bigger than the number 6680 of elements that we can fit in a vectype (nunits), we have to generate 6681 more than one vector stmt - i.e - we need to "unroll" the 6682 vector stmt by a factor VF/nunits. For more details see documentation 6683 in vectorizable_operation. */ 6684 6685 /* If the reduction is used in an outer loop we need to generate 6686 VF intermediate results, like so (e.g. for ncopies=2): 6687 r0 = phi (init, r0) 6688 r1 = phi (init, r1) 6689 r0 = x0 + r0; 6690 r1 = x1 + r1; 6691 (i.e. we generate VF results in 2 registers). 6692 In this case we have a separate def-use cycle for each copy, and therefore 6693 for each copy we get the vector def for the reduction variable from the 6694 respective phi node created for this copy. 6695 6696 Otherwise (the reduction is unused in the loop nest), we can combine 6697 together intermediate results, like so (e.g. for ncopies=2): 6698 r = phi (init, r) 6699 r = x0 + r; 6700 r = x1 + r; 6701 (i.e. we generate VF/2 results in a single register). 6702 In this case for each copy we get the vector def for the reduction variable 6703 from the vectorized reduction operation generated in the previous iteration. 
6704 6705 This only works when we see both the reduction PHI and its only consumer 6706 in vectorizable_reduction and there are no intermediate stmts 6707 participating. */ 6708 if (ncopies > 1 6709 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live) 6710 && reduc_chain_length == 1) 6711 single_defuse_cycle = true; 6712 6713 if (single_defuse_cycle || lane_reduc_code_p) 6714 { 6715 gcc_assert (code != COND_EXPR); 6716 6717 /* 4. Supportable by target? */ 6718 bool ok = true; 6719 6720 /* 4.1. check support for the operation in the loop */ 6721 optab optab = optab_for_tree_code (code, vectype_in, optab_vector); 6722 if (!optab) 6723 { 6724 if (dump_enabled_p ()) 6725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6726 "no optab.\n"); 6727 ok = false; 6728 } 6729 6730 machine_mode vec_mode = TYPE_MODE (vectype_in); 6731 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing) 6732 { 6733 if (dump_enabled_p ()) 6734 dump_printf (MSG_NOTE, "op not supported by target.\n"); 6735 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD) 6736 || !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6737 ok = false; 6738 else 6739 if (dump_enabled_p ()) 6740 dump_printf (MSG_NOTE, "proceeding using word mode.\n"); 6741 } 6742 6743 /* Worthwhile without SIMD support? */ 6744 if (ok 6745 && !VECTOR_MODE_P (TYPE_MODE (vectype_in)) 6746 && !vect_worthwhile_without_simd_p (loop_vinfo, code)) 6747 { 6748 if (dump_enabled_p ()) 6749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6750 "not worthwhile without SIMD support.\n"); 6751 ok = false; 6752 } 6753 6754 /* lane-reducing operations have to go through vect_transform_reduction. 6755 For the other cases try without the single cycle optimization. 
*/ 6756 if (!ok) 6757 { 6758 if (lane_reduc_code_p) 6759 return false; 6760 else 6761 single_defuse_cycle = false; 6762 } 6763 } 6764 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle; 6765 6766 /* If the reduction stmt is one of the patterns that have lane 6767 reduction embedded we cannot handle the case of ! single_defuse_cycle. */ 6768 if ((ncopies > 1 && ! single_defuse_cycle) 6769 && lane_reduc_code_p) 6770 { 6771 if (dump_enabled_p ()) 6772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6773 "multi def-use cycle not possible for lane-reducing " 6774 "reduction operation\n"); 6775 return false; 6776 } 6777 6778 if (slp_node) 6779 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 6780 else 6781 vec_num = 1; 6782 6783 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies, 6784 cost_vec); 6785 /* Cost the reduction op inside the loop if transformed via 6786 vect_transform_reduction. Otherwise this is costed by the 6787 separate vectorizable_* routines. */ 6788 if (single_defuse_cycle 6789 || code == DOT_PROD_EXPR 6790 || code == WIDEN_SUM_EXPR 6791 || code == SAD_EXPR) 6792 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body); 6793 6794 if (dump_enabled_p () 6795 && reduction_type == FOLD_LEFT_REDUCTION) 6796 dump_printf_loc (MSG_NOTE, vect_location, 6797 "using an in-order (fold-left) reduction.\n"); 6798 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type; 6799 /* All but single defuse-cycle optimized, lane-reducing and fold-left 6800 reductions go through their own vectorizable_* routines. 
*/ 6801 if (!single_defuse_cycle 6802 && code != DOT_PROD_EXPR 6803 && code != WIDEN_SUM_EXPR 6804 && code != SAD_EXPR 6805 && reduction_type != FOLD_LEFT_REDUCTION) 6806 { 6807 stmt_vec_info tem 6808 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info)); 6809 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem)) 6810 { 6811 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem)); 6812 tem = REDUC_GROUP_FIRST_ELEMENT (tem); 6813 } 6814 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def; 6815 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def; 6816 } 6817 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 6818 { 6819 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 6820 internal_fn cond_fn = get_conditional_internal_fn (code); 6821 6822 if (reduction_type != FOLD_LEFT_REDUCTION 6823 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in) 6824 && (cond_fn == IFN_LAST 6825 || !direct_internal_fn_supported_p (cond_fn, vectype_in, 6826 OPTIMIZE_FOR_SPEED))) 6827 { 6828 if (dump_enabled_p ()) 6829 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6830 "can't use a fully-masked loop because no" 6831 " conditional operation is available.\n"); 6832 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 6833 } 6834 else if (reduction_type == FOLD_LEFT_REDUCTION 6835 && reduc_fn == IFN_LAST 6836 && !expand_vec_cond_expr_p (vectype_in, 6837 truth_type_for (vectype_in), 6838 SSA_NAME)) 6839 { 6840 if (dump_enabled_p ()) 6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6842 "can't use a fully-masked loop because no" 6843 " conditional operation is available.\n"); 6844 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 6845 } 6846 else 6847 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num, 6848 vectype_in, NULL); 6849 } 6850 return true; 6851} 6852 6853/* Transform the definition stmt STMT_INFO of a reduction PHI backedge 6854 value. 
   Emits the vector statements for the reduction operation itself; the
   reduction PHI nodes are created in vect_transform_cycle_phi and the
   epilogue reduction is generated separately.  Returns true on success
   (the fold-left path returns the result of
   vectorize_fold_left_reduction).  */

bool
vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  stmt_vec_info *vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int j;
  int vec_num;

  /* The meta information for the reduction is kept on the reduction-info
     stmt, not on STMT_INFO itself.  */
  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      /* For a double reduction the actual cycle is in the inner loop.  */
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  gassign *stmt = as_a <gassign *> (stmt_info->stmt);
  enum tree_code code = gimple_assign_rhs_code (stmt);
  /* Number of operands of the scalar operation.  */
  int op_type = TREE_CODE_LENGTH (code);

  /* Flatten RHS.  */
  tree ops[3];
  switch (get_gimple_rhs_class (code))
    {
    case GIMPLE_TERNARY_RHS:
      ops[2] = gimple_assign_rhs3 (stmt);
      /* Fall thru.  */
    case GIMPLE_BINARY_RHS:
      ops[0] = gimple_assign_rhs1 (stmt);
      ops[1] = gimple_assign_rhs2 (stmt);
      break;
    default:
      gcc_unreachable ();
    }

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  /* Which of OPS is the reduction accumulator.  */
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);

  if (slp_node)
    {
      /* With SLP the unrolling is encoded in the number of vector stmts
	 of the node, so a single copy per vector stmt is generated.  */
      ncopies = 1;
      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
      vec_num = 1;
    }

  internal_fn cond_fn = get_conditional_internal_fn (code);
  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  stmt_vec_info new_stmt_info = NULL;
  stmt_vec_info prev_stmt_info;
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds0;
  auto_vec<tree> vec_oprnds1;
  auto_vec<tree> vec_oprnds2;
  tree def0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* In-order reductions have their own code-generation path.  */
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      return vectorize_fold_left_reduction
	  (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
	   reduc_fn, ops, vectype_in, reduc_index, masks);
    }

  /* Only lane-reducing operations (DOT_PROD/WIDEN_SUM/SAD) may reach this
     point without a forced single def-use cycle; the analysis phase is
     responsible for enforcing that.  */
  bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
  gcc_assert (single_defuse_cycle
	      || code == DOT_PROD_EXPR
	      || code == WIDEN_SUM_EXPR
	      || code == SAD_EXPR);

  /* Create the destination vector  */
  tree scalar_dest = gimple_assign_lhs (stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  prev_stmt_info = NULL;
  if (!slp_node)
    {
      vec_oprnds0.create (1);
      vec_oprnds1.create (1);
      if (op_type == ternary_op)
	vec_oprnds2.create (1);
    }

  for (j = 0; j < ncopies; j++)
    {
      /* Handle uses.  */
      if (j == 0)
	{
	  if (slp_node)
	    {
	      /* Get vec defs for all the operands except the reduction index,
		 ensuring the ordering of the ops in the vector is kept.  */
	      auto_vec<vec<tree>, 3> vec_defs;
	      vect_get_slp_defs (slp_node, &vec_defs);
	      vec_oprnds0.safe_splice (vec_defs[0]);
	      vec_defs[0].release ();
	      vec_oprnds1.safe_splice (vec_defs[1]);
	      vec_defs[1].release ();
	      if (op_type == ternary_op)
		{
		  vec_oprnds2.safe_splice (vec_defs[2]);
		  vec_defs[2].release ();
		}
	    }
	  else
	    {
	      vec_oprnds0.quick_push
		(vect_get_vec_def_for_operand (ops[0], stmt_info));
	      vec_oprnds1.quick_push
		(vect_get_vec_def_for_operand (ops[1], stmt_info));
	      if (op_type == ternary_op)
		vec_oprnds2.quick_push
		  (vect_get_vec_def_for_operand (ops[2], stmt_info));
	    }
	}
      else
	{
	  if (!slp_node)
	    {
	      gcc_assert (reduc_index != -1 || ! single_defuse_cycle);

	      /* For a single def-use cycle the accumulator operand of copy
		 J is the result of copy J-1; all other operands get the
		 next vector def of the same scalar operand.  */
	      if (single_defuse_cycle && reduc_index == 0)
		vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
	      else
		vec_oprnds0[0]
		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
						    vec_oprnds0[0]);
	      if (single_defuse_cycle && reduc_index == 1)
		vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
	      else
		vec_oprnds1[0]
		  = vect_get_vec_def_for_stmt_copy (loop_vinfo,
						    vec_oprnds1[0]);
	      if (op_type == ternary_op)
		{
		  if (single_defuse_cycle && reduc_index == 2)
		    vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
		  else
		    vec_oprnds2[0]
		      = vect_get_vec_def_for_stmt_copy (loop_vinfo,
							vec_oprnds2[0]);
		}
	    }
	}

      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
	{
	  tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
	  if (masked_loop_p && !mask_by_cond_expr)
	    {
	      /* Emit a masked conditional internal function call, making
		 sure that the reduction accumulator is vop[0].  */
	      if (reduc_index == 1)
		{
		  gcc_assert (commutative_tree_code (code));
		  std::swap (vop[0], vop[1]);
		}
	      tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
					      vectype_in, i * ncopies + j);
	      /* COND_FN (mask, a, b, a): inactive lanes pass the
		 accumulator through unchanged.  */
	      gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
							vop[0], vop[1],
							vop[0]);
	      new_temp = make_ssa_name (vec_dest, call);
	      gimple_call_set_lhs (call, new_temp);
	      gimple_call_set_nothrow (call, true);
	      new_stmt_info
		= vect_finish_stmt_generation (stmt_info, call, gsi);
	    }
	  else
	    {
	      if (op_type == ternary_op)
		vop[2] = vec_oprnds2[i];

	      if (masked_loop_p && mask_by_cond_expr)
		{
		  /* Apply the loop mask via a VEC_COND_EXPR on one of the
		     operands instead of a conditional call.  */
		  tree mask = vect_get_loop_mask (gsi, masks,
						  vec_num * ncopies,
						  vectype_in, i * ncopies + j);
		  build_vect_cond_expr (code, vop, mask, gsi);
		}

	      gassign *new_stmt = gimple_build_assign (vec_dest, code,
						       vop[0], vop[1], vop[2]);
	      new_temp = make_ssa_name (vec_dest, new_stmt);
	      gimple_assign_set_lhs (new_stmt, new_temp);
	      new_stmt_info
		= vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
	    }

	  if (slp_node)
	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
	}

      /* For SLP and single def-use cycles the copy chaining below does
	 not apply.  */
      if (slp_node || single_defuse_cycle)
	continue;

      if (j == 0)
	STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
      else
	STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;

      prev_stmt_info = new_stmt_info;
    }

  if (single_defuse_cycle && !slp_node)
    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;

  return true;
}

/* Transform phase of a cycle PHI.
   Creates the vector reduction PHI nodes for STMT_INFO and sets their
   loop-entry arguments; the loop-latch arguments are filled in during
   epilogue processing.  Returns true on success.  */

bool
vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
			  slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  stmt_vec_info prev_phi_info;
  int j;
  bool nested_cycle = false;
  int vec_num;

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
    /* Leave the scalar phi in place.  */
    return true;

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
  /* For a nested cycle we do not fill the above.  */
  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype_in);

  if (slp_node)
    {
      /* The size vect_schedule_slp_instance computes is off for us.  */
      vec_num = vect_get_num_vectors
	  (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
	   * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
    }

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
    ncopies = 1;

  /* Create the destination vector  */
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
					       vectype_out);

  /* Get the loop-entry arguments.  */
  tree vec_initial_def;
  auto_vec<tree> vec_initial_defs;
  if (slp_node)
    {
      vec_initial_defs.reserve (vec_num);
      gcc_assert (slp_node == slp_node_instance->reduc_phis);
      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
      tree neutral_op
	= neutral_op_for_slp_reduction (slp_node, vectype_out,
					STMT_VINFO_REDUC_CODE (reduc_info),
					first != NULL);
      get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
				      &vec_initial_defs, vec_num,
				      first != NULL, neutral_op);
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial
	 value of the reduction variable.  */
      tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
						loop_preheader_edge (loop));
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
	 and we can't use zero for induc_val, use initial_def.  Similarly
	 for REDUC_MIN and initial_def larger than the base.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	{
	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
	  if (TREE_CODE (initial_def) == INTEGER_CST
	      && !integer_zerop (induc_val)
	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
		   && tree_int_cst_lt (initial_def, induc_val))
		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
		      && tree_int_cst_lt (induc_val, initial_def))))
	    {
	      induc_val = initial_def;
	      /* Communicate we used the initial_def to epilogue
		 generation.  */
	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
	    }
	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
	}
      else if (nested_cycle)
	{
	  /* Do not use an adjustment def as that case is not supported
	     correctly if ncopies is not one.  */
	  vec_initial_def = vect_get_vec_def_for_operand (initial_def,
							  reduc_stmt_info);
	}
      else
	{
	  tree adjustment_def = NULL_TREE;
	  tree *adjustment_defp = &adjustment_def;
	  enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
	  /* Double reductions do not use an epilogue adjustment.  */
	  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
	    adjustment_defp = NULL;
	  vec_initial_def
	    = get_initial_def_for_reduction (reduc_stmt_info, code,
					     initial_def, adjustment_defp);
	  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
	}
      vec_initial_defs.create (1);
      vec_initial_defs.quick_push (vec_initial_def);
    }

  /* Generate the reduction PHIs upfront.  */
  prev_phi_info = NULL;
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      for (j = 0; j < ncopies; j++)
	{
	  /* Create the reduction-phi that defines the reduction
	     operand.  */
	  gphi *new_phi = create_phi_node (vec_dest, loop->header);
	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);

	  /* Set the loop-entry arg of the reduction-phi.  */
	  if (j != 0 && nested_cycle)
	    vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
							   vec_init_def);
	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
		       UNKNOWN_LOCATION);

	  /* The loop-latch arg is set in epilogue processing.  */

	  if (slp_node)
	    SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
	  else
	    {
	      if (j == 0)
		STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
	      else
		STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
	      prev_phi_info = new_phi_info;
	    }
	}
    }

  return true;
}

/* Vectorizes LC PHIs.  A "LC" PHI here is a single-argument PHI in a
   block with a single predecessor edge (presumably a loop-closed SSA
   PHI on a loop exit).  Without VEC_STMT only checks applicability and
   records the stmt type; with VEC_STMT emits the vectorized PHIs.  */

bool
vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
		     slp_tree slp_node)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  /* Only handle single-argument PHIs during loop vectorization.  */
  if (!loop_vinfo
      || !is_a <gphi *> (stmt_info->stmt)
      || gimple_phi_num_args (stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
      return true;
    }

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  edge e = single_pred_edge (bb);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  vec<tree> vec_oprnds = vNULL;
  vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
		     stmt_info, &vec_oprnds, NULL, slp_node);
  if (slp_node)
    {
      unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      gcc_assert (vec_oprnds.length () == vec_num);
      for (unsigned i = 0; i < vec_num; i++)
	{
	  /* Create the vectorized LC PHI node.  */
	  gphi *new_phi = create_phi_node (vec_dest, bb);
	  add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
	  SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
	}
    }
  else
    {
      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
      stmt_vec_info prev_phi_info = NULL;
      for (unsigned i = 0; i < ncopies; i++)
	{
	  /* Copies after the first use the next vector def of the
	     single PHI argument.  */
	  if (i != 0)
	    vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
	  /* Create the vectorized LC PHI node.  */
	  gphi *new_phi = create_phi_node (vec_dest, bb);
	  add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
	  stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
	  if (i == 0)
	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
	  else
	    STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
	  prev_phi_info = new_phi_info;
	}
    }
  vec_oprnds.release ();

  return true;
}


/* Function vect_min_worthwhile_factor.

   For a loop where we could vectorize the operation indicated by CODE,
   return the minimum vectorization factor that makes it worthwhile
   to use generic vectors.  Codes not listed below are never considered
   worthwhile (INT_MAX).  */
static unsigned int
vect_min_worthwhile_factor (enum tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
      return 4;

    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return 2;

    default:
      return INT_MAX;
    }
}

/* Return true if VINFO indicates we are doing loop vectorization and if
   it is worth decomposing CODE operations into scalar operations for
   that loop's vectorization factor.
   Only a constant vectorization factor qualifies; with a variable-length
   (poly) factor this returns false.  */

bool
vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
{
  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
  unsigned HOST_WIDE_INT value;
  /* True only when loop-vectorizing with a compile-time constant VF at
     least as large as the minimum worthwhile factor for CODE.  */
  return (loop_vinfo
	  && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
	  && value >= vect_min_worthwhile_factor (code));
}

/* Function vectorizable_induction

   Check if STMT_INFO performs an induction computation that can be vectorized.
   If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
   phi to replace it, put it in VEC_STMT, and add it to the same basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

bool
vectorizable_induction (stmt_vec_info stmt_info,
			gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
			stmt_vec_info *vec_stmt, slp_tree slp_node,
			stmt_vector_for_cost *cost_vec)
{
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  tree expr;
  gimple_seq stmts;
  imm_use_iterator imm_iter;
  use_operand_p use_p;
  gimple *exit_phi;
  edge latch_e;
  tree loop_arg;
  gimple_stmt_iterator si;

  /* Only PHI nodes can be inductions.  */
  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
  if (!phi)
    return false;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* Make sure it was recognized as induction computation.
*/ 7410 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) 7411 return false; 7412 7413 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7414 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7415 7416 if (slp_node) 7417 ncopies = 1; 7418 else 7419 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7420 gcc_assert (ncopies >= 1); 7421 7422 /* FORNOW. These restrictions should be relaxed. */ 7423 if (nested_in_vect_loop_p (loop, stmt_info)) 7424 { 7425 imm_use_iterator imm_iter; 7426 use_operand_p use_p; 7427 gimple *exit_phi; 7428 edge latch_e; 7429 tree loop_arg; 7430 7431 if (ncopies > 1) 7432 { 7433 if (dump_enabled_p ()) 7434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7435 "multiple types in nested loop.\n"); 7436 return false; 7437 } 7438 7439 /* FORNOW: outer loop induction with SLP not supported. */ 7440 if (STMT_SLP_TYPE (stmt_info)) 7441 return false; 7442 7443 exit_phi = NULL; 7444 latch_e = loop_latch_edge (loop->inner); 7445 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7446 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7447 { 7448 gimple *use_stmt = USE_STMT (use_p); 7449 if (is_gimple_debug (use_stmt)) 7450 continue; 7451 7452 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt))) 7453 { 7454 exit_phi = use_stmt; 7455 break; 7456 } 7457 } 7458 if (exit_phi) 7459 { 7460 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi); 7461 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo) 7462 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))) 7463 { 7464 if (dump_enabled_p ()) 7465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7466 "inner-loop induction only used outside " 7467 "of the outer vectorized loop.\n"); 7468 return false; 7469 } 7470 } 7471 7472 nested_in_vect_loop = true; 7473 iv_loop = loop->inner; 7474 } 7475 else 7476 iv_loop = loop; 7477 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father); 7478 7479 if (slp_node && !nunits.is_constant ()) 7480 { 7481 /* The current SLP code creates the initial 
value element-by-element. */ 7482 if (dump_enabled_p ()) 7483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 7484 "SLP induction not supported for variable-length" 7485 " vectors.\n"); 7486 return false; 7487 } 7488 7489 if (!vec_stmt) /* transformation not required. */ 7490 { 7491 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; 7492 DUMP_VECT_SCOPE ("vectorizable_induction"); 7493 vect_model_induction_cost (stmt_info, ncopies, cost_vec); 7494 return true; 7495 } 7496 7497 /* Transform. */ 7498 7499 /* Compute a vector variable, initialized with the first VF values of 7500 the induction variable. E.g., for an iv with IV_PHI='X' and 7501 evolution S, for a vector of 4 units, we want to compute: 7502 [X, X + S, X + 2*S, X + 3*S]. */ 7503 7504 if (dump_enabled_p ()) 7505 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); 7506 7507 latch_e = loop_latch_edge (iv_loop); 7508 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); 7509 7510 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); 7511 gcc_assert (step_expr != NULL_TREE); 7512 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype); 7513 7514 pe = loop_preheader_edge (iv_loop); 7515 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, 7516 loop_preheader_edge (iv_loop)); 7517 7518 stmts = NULL; 7519 if (!nested_in_vect_loop) 7520 { 7521 /* Convert the initial value to the IV update type. */ 7522 tree new_type = TREE_TYPE (step_expr); 7523 init_expr = gimple_convert (&stmts, new_type, init_expr); 7524 7525 /* If we are using the loop mask to "peel" for alignment then we need 7526 to adjust the start value here. 
*/ 7527 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 7528 if (skip_niters != NULL_TREE) 7529 { 7530 if (FLOAT_TYPE_P (vectype)) 7531 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type, 7532 skip_niters); 7533 else 7534 skip_niters = gimple_convert (&stmts, new_type, skip_niters); 7535 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type, 7536 skip_niters, step_expr); 7537 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type, 7538 init_expr, skip_step); 7539 } 7540 } 7541 7542 if (stmts) 7543 { 7544 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7545 gcc_assert (!new_bb); 7546 } 7547 7548 /* Find the first insertion point in the BB. */ 7549 basic_block bb = gimple_bb (phi); 7550 si = gsi_after_labels (bb); 7551 7552 /* For SLP induction we have to generate several IVs as for example 7553 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S] 7554 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform 7555 [VF*S, VF*S, VF*S, VF*S] for all. */ 7556 if (slp_node) 7557 { 7558 /* Enforced above. */ 7559 unsigned int const_nunits = nunits.to_constant (); 7560 7561 /* Generate [VF*S, VF*S, ... ]. */ 7562 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7563 { 7564 expr = build_int_cst (integer_type_node, vf); 7565 expr = fold_convert (TREE_TYPE (step_expr), expr); 7566 } 7567 else 7568 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7569 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7570 expr, step_expr); 7571 if (! CONSTANT_CLASS_P (new_name)) 7572 new_name = vect_init_vector (stmt_info, new_name, 7573 TREE_TYPE (step_expr), NULL); 7574 new_vec = build_vector_from_val (step_vectype, new_name); 7575 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); 7576 7577 /* Now generate the IVs. 
*/ 7578 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 7579 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 7580 unsigned elts = const_nunits * nvects; 7581 unsigned nivs = least_common_multiple (group_size, 7582 const_nunits) / const_nunits; 7583 gcc_assert (elts % group_size == 0); 7584 tree elt = init_expr; 7585 unsigned ivn; 7586 for (ivn = 0; ivn < nivs; ++ivn) 7587 { 7588 tree_vector_builder elts (step_vectype, const_nunits, 1); 7589 stmts = NULL; 7590 for (unsigned eltn = 0; eltn < const_nunits; ++eltn) 7591 { 7592 if (ivn*const_nunits + eltn >= group_size 7593 && (ivn * const_nunits + eltn) % group_size == 0) 7594 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt), 7595 elt, step_expr); 7596 elts.quick_push (elt); 7597 } 7598 vec_init = gimple_build_vector (&stmts, &elts); 7599 vec_init = gimple_convert (&stmts, vectype, vec_init); 7600 if (stmts) 7601 { 7602 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7603 gcc_assert (!new_bb); 7604 } 7605 7606 /* Create the induction-phi that defines the induction-operand. 
*/ 7607 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7608 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7609 stmt_vec_info induction_phi_info 7610 = loop_vinfo->add_stmt (induction_phi); 7611 induc_def = PHI_RESULT (induction_phi); 7612 7613 /* Create the iv update inside the loop */ 7614 gimple_seq stmts = NULL; 7615 vec_def = gimple_convert (&stmts, step_vectype, induc_def); 7616 vec_def = gimple_build (&stmts, 7617 PLUS_EXPR, step_vectype, vec_def, vec_step); 7618 vec_def = gimple_convert (&stmts, vectype, vec_def); 7619 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def)); 7620 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 7621 7622 /* Set the arguments of the phi node: */ 7623 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7624 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7625 UNKNOWN_LOCATION); 7626 7627 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info); 7628 } 7629 7630 /* Re-use IVs when we can. */ 7631 if (ivn < nvects) 7632 { 7633 unsigned vfp 7634 = least_common_multiple (group_size, const_nunits) / group_size; 7635 /* Generate [VF'*S, VF'*S, ... ]. */ 7636 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7637 { 7638 expr = build_int_cst (integer_type_node, vfp); 7639 expr = fold_convert (TREE_TYPE (step_expr), expr); 7640 } 7641 else 7642 expr = build_int_cst (TREE_TYPE (step_expr), vfp); 7643 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 7644 expr, step_expr); 7645 if (! 
CONSTANT_CLASS_P (new_name)) 7646 new_name = vect_init_vector (stmt_info, new_name, 7647 TREE_TYPE (step_expr), NULL); 7648 new_vec = build_vector_from_val (step_vectype, new_name); 7649 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); 7650 for (; ivn < nvects; ++ivn) 7651 { 7652 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt; 7653 tree def; 7654 if (gimple_code (iv) == GIMPLE_PHI) 7655 def = gimple_phi_result (iv); 7656 else 7657 def = gimple_assign_lhs (iv); 7658 gimple_seq stmts = NULL; 7659 def = gimple_convert (&stmts, step_vectype, def); 7660 def = gimple_build (&stmts, 7661 PLUS_EXPR, step_vectype, def, vec_step); 7662 def = gimple_convert (&stmts, vectype, def); 7663 if (gimple_code (iv) == GIMPLE_PHI) 7664 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 7665 else 7666 { 7667 gimple_stmt_iterator tgsi = gsi_for_stmt (iv); 7668 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING); 7669 } 7670 SLP_TREE_VEC_STMTS (slp_node).quick_push 7671 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def))); 7672 } 7673 } 7674 7675 return true; 7676 } 7677 7678 /* Create the vector that holds the initial_value of the induction. */ 7679 if (nested_in_vect_loop) 7680 { 7681 /* iv_loop is nested in the loop to be vectorized. init_expr had already 7682 been created during vectorization of previous stmts. We obtain it 7683 from the STMT_VINFO_VEC_STMT of the defining stmt. */ 7684 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info); 7685 /* If the initial value is not of proper type, convert it. 
*/ 7686 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init))) 7687 { 7688 new_stmt 7689 = gimple_build_assign (vect_get_new_ssa_name (vectype, 7690 vect_simple_var, 7691 "vec_iv_"), 7692 VIEW_CONVERT_EXPR, 7693 build1 (VIEW_CONVERT_EXPR, vectype, 7694 vec_init)); 7695 vec_init = gimple_assign_lhs (new_stmt); 7696 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop), 7697 new_stmt); 7698 gcc_assert (!new_bb); 7699 loop_vinfo->add_stmt (new_stmt); 7700 } 7701 } 7702 else 7703 { 7704 /* iv_loop is the loop to be vectorized. Create: 7705 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ 7706 stmts = NULL; 7707 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr); 7708 7709 unsigned HOST_WIDE_INT const_nunits; 7710 if (nunits.is_constant (&const_nunits)) 7711 { 7712 tree_vector_builder elts (step_vectype, const_nunits, 1); 7713 elts.quick_push (new_name); 7714 for (i = 1; i < const_nunits; i++) 7715 { 7716 /* Create: new_name_i = new_name + step_expr */ 7717 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name), 7718 new_name, step_expr); 7719 elts.quick_push (new_name); 7720 } 7721 /* Create a vector from [new_name_0, new_name_1, ..., 7722 new_name_nunits-1] */ 7723 vec_init = gimple_build_vector (&stmts, &elts); 7724 } 7725 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))) 7726 /* Build the initial value directly from a VEC_SERIES_EXPR. */ 7727 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype, 7728 new_name, step_expr); 7729 else 7730 { 7731 /* Build: 7732 [base, base, base, ...] 7733 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. 
*/ 7734 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))); 7735 gcc_assert (flag_associative_math); 7736 tree index = build_index_vector (step_vectype, 0, 1); 7737 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype, 7738 new_name); 7739 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype, 7740 step_expr); 7741 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index); 7742 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype, 7743 vec_init, step_vec); 7744 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype, 7745 vec_init, base_vec); 7746 } 7747 vec_init = gimple_convert (&stmts, vectype, vec_init); 7748 7749 if (stmts) 7750 { 7751 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 7752 gcc_assert (!new_bb); 7753 } 7754 } 7755 7756 7757 /* Create the vector that holds the step of the induction. */ 7758 if (nested_in_vect_loop) 7759 /* iv_loop is nested in the loop to be vectorized. Generate: 7760 vec_step = [S, S, S, S] */ 7761 new_name = step_expr; 7762 else 7763 { 7764 /* iv_loop is the loop to be vectorized. Generate: 7765 vec_step = [VF*S, VF*S, VF*S, VF*S] */ 7766 gimple_seq seq = NULL; 7767 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7768 { 7769 expr = build_int_cst (integer_type_node, vf); 7770 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 7771 } 7772 else 7773 expr = build_int_cst (TREE_TYPE (step_expr), vf); 7774 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 7775 expr, step_expr); 7776 if (seq) 7777 { 7778 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 7779 gcc_assert (!new_bb); 7780 } 7781 } 7782 7783 t = unshare_expr (new_name); 7784 gcc_assert (CONSTANT_CLASS_P (new_name) 7785 || TREE_CODE (new_name) == SSA_NAME); 7786 new_vec = build_vector_from_val (step_vectype, t); 7787 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); 7788 7789 7790 /* Create the following def-use cycle: 7791 loop prolog: 7792 vec_init = ... 
7793 vec_step = ... 7794 loop: 7795 vec_iv = PHI <vec_init, vec_loop> 7796 ... 7797 STMT 7798 ... 7799 vec_loop = vec_iv + vec_step; */ 7800 7801 /* Create the induction-phi that defines the induction-operand. */ 7802 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); 7803 induction_phi = create_phi_node (vec_dest, iv_loop->header); 7804 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi); 7805 induc_def = PHI_RESULT (induction_phi); 7806 7807 /* Create the iv update inside the loop */ 7808 stmts = NULL; 7809 vec_def = gimple_convert (&stmts, step_vectype, induc_def); 7810 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step); 7811 vec_def = gimple_convert (&stmts, vectype, vec_def); 7812 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 7813 new_stmt = SSA_NAME_DEF_STMT (vec_def); 7814 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7815 7816 /* Set the arguments of the phi node: */ 7817 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); 7818 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), 7819 UNKNOWN_LOCATION); 7820 7821 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info; 7822 7823 /* In case that vectorization factor (VF) is bigger than the number 7824 of elements that we can fit in a vectype (nunits), we have to generate 7825 more than one vector stmt - i.e - we need to "unroll" the 7826 vector stmt by a factor VF/nunits. For more details see documentation 7827 in vectorizable_operation. */ 7828 7829 if (ncopies > 1) 7830 { 7831 gimple_seq seq = NULL; 7832 stmt_vec_info prev_stmt_vinfo; 7833 /* FORNOW. This restriction should be relaxed. */ 7834 gcc_assert (!nested_in_vect_loop); 7835 7836 /* Create the vector that holds the step of the induction. 
*/ 7837 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))) 7838 { 7839 expr = build_int_cst (integer_type_node, nunits); 7840 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr); 7841 } 7842 else 7843 expr = build_int_cst (TREE_TYPE (step_expr), nunits); 7844 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), 7845 expr, step_expr); 7846 if (seq) 7847 { 7848 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 7849 gcc_assert (!new_bb); 7850 } 7851 7852 t = unshare_expr (new_name); 7853 gcc_assert (CONSTANT_CLASS_P (new_name) 7854 || TREE_CODE (new_name) == SSA_NAME); 7855 new_vec = build_vector_from_val (step_vectype, t); 7856 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL); 7857 7858 vec_def = induc_def; 7859 prev_stmt_vinfo = induction_phi_info; 7860 for (i = 1; i < ncopies; i++) 7861 { 7862 /* vec_i = vec_prev + vec_step */ 7863 gimple_seq stmts = NULL; 7864 vec_def = gimple_convert (&stmts, step_vectype, vec_def); 7865 vec_def = gimple_build (&stmts, 7866 PLUS_EXPR, step_vectype, vec_def, vec_step); 7867 vec_def = gimple_convert (&stmts, vectype, vec_def); 7868 7869 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT); 7870 new_stmt = SSA_NAME_DEF_STMT (vec_def); 7871 new_stmt_info = loop_vinfo->add_stmt (new_stmt); 7872 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info; 7873 prev_stmt_vinfo = new_stmt_info; 7874 } 7875 } 7876 7877 if (nested_in_vect_loop) 7878 { 7879 /* Find the loop-closed exit-phi of the induction, and record 7880 the final vector of induction results: */ 7881 exit_phi = NULL; 7882 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) 7883 { 7884 gimple *use_stmt = USE_STMT (use_p); 7885 if (is_gimple_debug (use_stmt)) 7886 continue; 7887 7888 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt))) 7889 { 7890 exit_phi = use_stmt; 7891 break; 7892 } 7893 } 7894 if (exit_phi) 7895 { 7896 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi); 7897 /* FORNOW. 
Currently not supporting the case that an inner-loop induction 7898 is not used in the outer-loop (i.e. only outside the outer-loop). */ 7899 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) 7900 && !STMT_VINFO_LIVE_P (stmt_vinfo)); 7901 7902 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info; 7903 if (dump_enabled_p ()) 7904 dump_printf_loc (MSG_NOTE, vect_location, 7905 "vector of inductions after inner-loop:%G", 7906 new_stmt); 7907 } 7908 } 7909 7910 7911 if (dump_enabled_p ()) 7912 dump_printf_loc (MSG_NOTE, vect_location, 7913 "transform induction: created def-use cycle: %G%G", 7914 induction_phi, SSA_NAME_DEF_STMT (vec_def)); 7915 7916 return true; 7917} 7918 7919/* Function vectorizable_live_operation. 7920 7921 STMT_INFO computes a value that is used outside the loop. Check if 7922 it can be supported. */ 7923 7924bool 7925vectorizable_live_operation (stmt_vec_info stmt_info, 7926 gimple_stmt_iterator *gsi, 7927 slp_tree slp_node, slp_instance slp_node_instance, 7928 int slp_index, bool vec_stmt_p, 7929 stmt_vector_for_cost *) 7930{ 7931 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 7932 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 7933 imm_use_iterator imm_iter; 7934 tree lhs, lhs_type, bitsize, vec_bitsize; 7935 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 7936 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype); 7937 int ncopies; 7938 gimple *use_stmt; 7939 auto_vec<tree> vec_oprnds; 7940 int vec_entry = 0; 7941 poly_uint64 vec_index = 0; 7942 7943 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); 7944 7945 /* Due to how we generate code for SLP_TREE_TWO_OPERATORS we cannot 7946 vectorize live operations out of it. */ 7947 if (slp_node && SLP_TREE_TWO_OPERATORS (slp_node)) 7948 return false; 7949 7950 /* If a stmt of a reduction is live, vectorize it via 7951 vect_create_epilog_for_reduction. vectorizable_reduction assessed 7952 validity so just trigger the transform here. 
*/ 7953 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))) 7954 { 7955 if (!vec_stmt_p) 7956 return true; 7957 if (slp_node) 7958 { 7959 /* For reduction chains the meta-info is attached to 7960 the group leader. */ 7961 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)) 7962 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info); 7963 /* For SLP reductions we vectorize the epilogue for 7964 all involved stmts together. */ 7965 else if (slp_index != 0) 7966 return true; 7967 } 7968 stmt_vec_info reduc_info = info_for_reduction (stmt_info); 7969 gcc_assert (reduc_info->is_reduc_info); 7970 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION 7971 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION) 7972 return true; 7973 vect_create_epilog_for_reduction (stmt_info, slp_node, 7974 slp_node_instance); 7975 return true; 7976 } 7977 7978 /* FORNOW. CHECKME. */ 7979 if (nested_in_vect_loop_p (loop, stmt_info)) 7980 return false; 7981 7982 /* If STMT is not relevant and it is a simple assignment and its inputs are 7983 invariant then it can remain in place, unvectorized. The original last 7984 scalar value that it computes will be used. */ 7985 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 7986 { 7987 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo)); 7988 if (dump_enabled_p ()) 7989 dump_printf_loc (MSG_NOTE, vect_location, 7990 "statement is simple and uses invariant. Leaving in " 7991 "place.\n"); 7992 return true; 7993 } 7994 7995 if (slp_node) 7996 ncopies = 1; 7997 else 7998 ncopies = vect_get_num_copies (loop_vinfo, vectype); 7999 8000 if (slp_node) 8001 { 8002 gcc_assert (slp_index >= 0); 8003 8004 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length (); 8005 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); 8006 8007 /* Get the last occurrence of the scalar index from the concatenation of 8008 all the slp vectors. Calculate which slp vector it is and the index 8009 within. 
*/ 8010 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index; 8011 8012 /* Calculate which vector contains the result, and which lane of 8013 that vector we need. */ 8014 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index)) 8015 { 8016 if (dump_enabled_p ()) 8017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8018 "Cannot determine which vector holds the" 8019 " final result.\n"); 8020 return false; 8021 } 8022 } 8023 8024 if (!vec_stmt_p) 8025 { 8026 /* No transformation required. */ 8027 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)) 8028 { 8029 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype, 8030 OPTIMIZE_FOR_SPEED)) 8031 { 8032 if (dump_enabled_p ()) 8033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8034 "can't use a fully-masked loop because " 8035 "the target doesn't support extract last " 8036 "reduction.\n"); 8037 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8038 } 8039 else if (slp_node) 8040 { 8041 if (dump_enabled_p ()) 8042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8043 "can't use a fully-masked loop because an " 8044 "SLP statement is live after the loop.\n"); 8045 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8046 } 8047 else if (ncopies > 1) 8048 { 8049 if (dump_enabled_p ()) 8050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 8051 "can't use a fully-masked loop because" 8052 " ncopies is greater than 1.\n"); 8053 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false; 8054 } 8055 else 8056 { 8057 gcc_assert (ncopies == 1 && !slp_node); 8058 vect_record_loop_mask (loop_vinfo, 8059 &LOOP_VINFO_MASKS (loop_vinfo), 8060 1, vectype, NULL); 8061 } 8062 } 8063 return true; 8064 } 8065 8066 /* Use the lhs of the original scalar statement. */ 8067 gimple *stmt = vect_orig_stmt (stmt_info)->stmt; 8068 8069 lhs = (is_a <gphi *> (stmt)) ? 
gimple_phi_result (stmt) 8070 : gimple_get_lhs (stmt); 8071 lhs_type = TREE_TYPE (lhs); 8072 8073 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype) 8074 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype))) 8075 : TYPE_SIZE (TREE_TYPE (vectype))); 8076 vec_bitsize = TYPE_SIZE (vectype); 8077 8078 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */ 8079 tree vec_lhs, bitstart; 8080 if (slp_node) 8081 { 8082 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8083 8084 /* Get the correct slp vectorized stmt. */ 8085 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt; 8086 if (gphi *phi = dyn_cast <gphi *> (vec_stmt)) 8087 vec_lhs = gimple_phi_result (phi); 8088 else 8089 vec_lhs = gimple_get_lhs (vec_stmt); 8090 8091 /* Get entry to use. */ 8092 bitstart = bitsize_int (vec_index); 8093 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart); 8094 } 8095 else 8096 { 8097 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info); 8098 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt); 8099 gcc_checking_assert (ncopies == 1 8100 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)); 8101 8102 /* For multiple copies, get the last copy. */ 8103 for (int i = 1; i < ncopies; ++i) 8104 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs); 8105 8106 /* Get the last lane in the vector. */ 8107 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize); 8108 } 8109 8110 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI 8111 requirement, insert one phi node for it. 
It looks like: 8112 loop; 8113 BB: 8114 # lhs' = PHI <lhs> 8115 ==> 8116 loop; 8117 BB: 8118 # vec_lhs' = PHI <vec_lhs> 8119 new_tree = lane_extract <vec_lhs', ...>; 8120 lhs' = new_tree; */ 8121 8122 basic_block exit_bb = single_exit (loop)->dest; 8123 gcc_assert (single_pred_p (exit_bb)); 8124 8125 tree vec_lhs_phi = copy_ssa_name (vec_lhs); 8126 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb); 8127 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs); 8128 8129 gimple_seq stmts = NULL; 8130 tree new_tree; 8131 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 8132 { 8133 /* Emit: 8134 8135 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK> 8136 8137 where VEC_LHS is the vectorized live-out result and MASK is 8138 the loop mask for the final iteration. */ 8139 gcc_assert (ncopies == 1 && !slp_node); 8140 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info)); 8141 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1, 8142 vectype, 0); 8143 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type, 8144 mask, vec_lhs_phi); 8145 8146 /* Convert the extracted vector element to the required scalar type. */ 8147 new_tree = gimple_convert (&stmts, lhs_type, scalar_res); 8148 } 8149 else 8150 { 8151 tree bftype = TREE_TYPE (vectype); 8152 if (VECTOR_BOOLEAN_TYPE_P (vectype)) 8153 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1); 8154 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart); 8155 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), 8156 &stmts, true, NULL_TREE); 8157 } 8158 8159 if (stmts) 8160 { 8161 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb); 8162 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT); 8163 8164 /* Remove existing phi from lhs and create one copy from new_tree. 
*/ 8165 tree lhs_phi = NULL_TREE; 8166 gimple_stmt_iterator gsi; 8167 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi)) 8168 { 8169 gimple *phi = gsi_stmt (gsi); 8170 if ((gimple_phi_arg_def (phi, 0) == lhs)) 8171 { 8172 remove_phi_node (&gsi, false); 8173 lhs_phi = gimple_phi_result (phi); 8174 gimple *copy = gimple_build_assign (lhs_phi, new_tree); 8175 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT); 8176 break; 8177 } 8178 } 8179 } 8180 8181 /* Replace use of lhs with newly computed result. If the use stmt is a 8182 single arg PHI, just replace all uses of PHI result. It's necessary 8183 because lcssa PHI defining lhs may be before newly inserted stmt. */ 8184 use_operand_p use_p; 8185 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs) 8186 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) 8187 && !is_gimple_debug (use_stmt)) 8188 { 8189 if (gimple_code (use_stmt) == GIMPLE_PHI 8190 && gimple_phi_num_args (use_stmt) == 1) 8191 { 8192 replace_uses_by (gimple_phi_result (use_stmt), new_tree); 8193 } 8194 else 8195 { 8196 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 8197 SET_USE (use_p, new_tree); 8198 } 8199 update_stmt (use_stmt); 8200 } 8201 8202 return true; 8203} 8204 8205/* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. 
*/

static void
vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
{
  ssa_op_iter op_iter;
  imm_use_iterator imm_iter;
  def_operand_p def_p;
  gimple *ustmt;

  FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
    {
      FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
	{
	  basic_block bb;

	  if (!is_gimple_debug (ustmt))
	    continue;

	  bb = gimple_bb (ustmt);

	  if (!flow_bb_inside_loop_p (loop, bb))
	    {
	      if (gimple_debug_bind_p (ustmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
                                     "killing debug use\n");

		  /* Reset the bind's value rather than removing the stmt,
		     so the debug variable is marked as optimized out.  */
		  gimple_debug_bind_reset_value (ustmt);
		  update_stmt (ustmt);
		}
	      else
		gcc_unreachable ();
	    }
	}
    }
}

/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      /* If NITERSM1 + 1 wrapped, NITERS would compare <= NITERSM1.  */
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
	return true;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      /* MAX is the latch execution count (niters - 1), so niters itself
	 fits in TYPE whenever MAX is strictly below TYPE's maximum.  */
      if (max < type_max)
	return true;
    }
  return false;
}

/* Return a mask type with half the number of elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
  return build_truth_vector_type_for_mode (nunits, new_mode);
}

/* Return a mask type with twice as many elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_double_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
  return build_truth_vector_type_for_mode (nunits, new_mode);
}

/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
		       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);
  if (masks->length () < nvectors)
    masks->safe_grow_cleared (nvectors);
  /* Rgroups are indexed by NVECTORS - 1.  */
  rgroup_masks *rgm = &(*masks)[nvectors - 1];
  /* The number of scalars per iteration and the number of vectors are
     both compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (scalar_mask)
    {
      /* Remember the scalar condition so equivalent masked accesses can
	 share masks later.  */
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  if (rgm->max_nscalars_per_iter < nscalars_per_iter)
    {
      rgm->max_nscalars_per_iter = nscalars_per_iter;
      rgm->mask_type = truth_type_for (vectype);
    }
}

/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  rgroup_masks *rgm = &(*masks)[nvectors - 1];
  tree mask_type = rgm->mask_type;

  /* Populate the rgroup's mask array, if this is the first time we've
     used it.  */
  if (rgm->masks.is_empty ())
    {
      rgm->masks.safe_grow_cleared (nvectors);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	  rgm->masks[i] = mask;
	}
    }

  tree mask = rgm->masks[index];
  if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		TYPE_VECTOR_SUBPARTS (vectype)))
    {
      /* A loop mask for data type X can be reused for data type Y
	 if X has N times more elements than Y and if Y's elements
	 are N times bigger than X's.  In this case each sequence
	 of N elements in the loop mask will be all-zero or all-one.
	 We can then view-convert the mask so that each sequence of
	 N elements is replaced by a single element.  */
      gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
			      TYPE_VECTOR_SUBPARTS (vectype)));
      gimple_seq seq = NULL;
      mask_type = truth_type_for (vectype);
      mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
    }
  return mask;
}

/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor VF.  */

static void
scale_profile_for_vect_loop (class loop *loop, unsigned vf)
{
  edge preheader = loop_preheader_edge (loop);
  /* Reduce loop iterations by the vectorization factor.  */
  gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
  profile_count freq_h = loop->header->count, freq_e = preheader->count ();

  if (freq_h.nonzero_p ())
    {
      profile_probability p;

      /* Avoid dropping loop body profile counter to 0 because of zero count
	 in loop's preheader.  */
      if (!(freq_e == profile_count::zero ()))
        freq_e = freq_e.force_nonzero ();
      p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
      scale_loop_frequencies (loop, p);
    }

  edge exit_e = single_exit (loop);
  /* The exit is now taken once per NEW_EST_NITER + 1 header executions.  */
  exit_e->probability = profile_probability::always ()
			 .apply_scale (1, new_est_niter + 1);

  edge exit_l = single_pred_edge (loop->latch);
  profile_probability prob = exit_l->probability;
  exit_l->probability = exit_e->probability.invert ();
  if (prob.initialized_p () && exit_l->probability.initialized_p ())
    scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
}

/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.
*/ 8414 8415static void 8416maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo, 8417 stmt_vec_info def_stmt_info) 8418{ 8419 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt); 8420 if (!def || TREE_CODE (def) != SSA_NAME) 8421 return; 8422 stmt_vec_info phi_info; 8423 imm_use_iterator iter; 8424 use_operand_p use_p; 8425 FOR_EACH_IMM_USE_FAST (use_p, iter, def) 8426 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p))) 8427 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi) 8428 && (phi_info = loop_vinfo->lookup_stmt (phi)) 8429 && STMT_VINFO_RELEVANT_P (phi_info) 8430 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info)) 8431 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION 8432 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION) 8433 { 8434 loop_p loop = gimple_bb (phi)->loop_father; 8435 edge e = loop_latch_edge (loop); 8436 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def) 8437 { 8438 stmt_vec_info phi_vec_info = STMT_VINFO_VEC_STMT (phi_info); 8439 stmt_vec_info def_vec_info = STMT_VINFO_VEC_STMT (def_stmt_info); 8440 do 8441 { 8442 add_phi_arg (as_a <gphi *> (phi_vec_info->stmt), 8443 gimple_get_lhs (def_vec_info->stmt), e, 8444 gimple_phi_arg_location (phi, e->dest_idx)); 8445 phi_vec_info = STMT_VINFO_RELATED_STMT (phi_vec_info); 8446 def_vec_info = STMT_VINFO_RELATED_STMT (def_vec_info); 8447 } 8448 while (phi_vec_info); 8449 gcc_assert (!def_vec_info); 8450 } 8451 } 8452} 8453 8454/* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI. 8455 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its 8456 stmt_vec_info. 
*/ 8457 8458static bool 8459vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, 8460 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store) 8461{ 8462 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8463 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8464 8465 if (dump_enabled_p ()) 8466 dump_printf_loc (MSG_NOTE, vect_location, 8467 "------>vectorizing statement: %G", stmt_info->stmt); 8468 8469 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8470 vect_loop_kill_debug_uses (loop, stmt_info); 8471 8472 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8473 && !STMT_VINFO_LIVE_P (stmt_info)) 8474 return false; 8475 8476 if (STMT_VINFO_VECTYPE (stmt_info)) 8477 { 8478 poly_uint64 nunits 8479 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); 8480 if (!STMT_SLP_TYPE (stmt_info) 8481 && maybe_ne (nunits, vf) 8482 && dump_enabled_p ()) 8483 /* For SLP VF is set according to unrolling factor, and not 8484 to vector size, hence for SLP this print is not valid. */ 8485 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8486 } 8487 8488 /* Pure SLP statements have already been vectorized. We still need 8489 to apply loop vectorization to hybrid SLP statements. */ 8490 if (PURE_SLP_STMT (stmt_info)) 8491 return false; 8492 8493 if (dump_enabled_p ()) 8494 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n"); 8495 8496 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL)) 8497 *seen_store = stmt_info; 8498 8499 return true; 8500} 8501 8502/* Helper function to pass to simplify_replace_tree to enable replacing tree's 8503 in the hash_map with its corresponding values. */ 8504 8505static tree 8506find_in_mapping (tree t, void *context) 8507{ 8508 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context; 8509 8510 tree *value = mapping->get (t); 8511 return value ? *value : t; 8512} 8513 8514/* Update EPILOGUE's loop_vec_info. 
EPILOGUE was constructed as a copy of the 8515 original loop that has now been vectorized. 8516 8517 The inits of the data_references need to be advanced with the number of 8518 iterations of the main loop. This has been computed in vect_do_peeling and 8519 is stored in parameter ADVANCE. We first restore the data_references 8520 initial offset with the values recored in ORIG_DRS_INIT. 8521 8522 Since the loop_vec_info of this EPILOGUE was constructed for the original 8523 loop, its stmt_vec_infos all point to the original statements. These need 8524 to be updated to point to their corresponding copies as well as the SSA_NAMES 8525 in their PATTERN_DEF_SEQs and RELATED_STMTs. 8526 8527 The data_reference's connections also need to be updated. Their 8528 corresponding dr_vec_info need to be reconnected to the EPILOGUE's 8529 stmt_vec_infos, their statements need to point to their corresponding copy, 8530 if they are gather loads or scatter stores then their reference needs to be 8531 updated to point to its corresponding copy and finally we set 8532 'base_misaligned' to false as we have already peeled for alignment in the 8533 prologue of the main loop. */ 8534 8535static void 8536update_epilogue_loop_vinfo (class loop *epilogue, tree advance) 8537{ 8538 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); 8539 auto_vec<gimple *> stmt_worklist; 8540 hash_map<tree,tree> mapping; 8541 gimple *orig_stmt, *new_stmt; 8542 gimple_stmt_iterator epilogue_gsi; 8543 gphi_iterator epilogue_phi_gsi; 8544 stmt_vec_info stmt_vinfo = NULL, related_vinfo; 8545 basic_block *epilogue_bbs = get_loop_body (epilogue); 8546 unsigned i; 8547 8548 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; 8549 8550 /* Advance data_reference's with the number of iterations of the previous 8551 loop and its prologue. 
*/ 8552 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); 8553 8554 8555 /* The EPILOGUE loop is a copy of the original loop so they share the same 8556 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to 8557 point to the copied statements. We also create a mapping of all LHS' in 8558 the original loop and all the LHS' in the EPILOGUE and create worklists to 8559 update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */ 8560 for (unsigned i = 0; i < epilogue->num_nodes; ++i) 8561 { 8562 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); 8563 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) 8564 { 8565 new_stmt = epilogue_phi_gsi.phi (); 8566 8567 gcc_assert (gimple_uid (new_stmt) > 0); 8568 stmt_vinfo 8569 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; 8570 8571 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); 8572 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; 8573 8574 mapping.put (gimple_phi_result (orig_stmt), 8575 gimple_phi_result (new_stmt)); 8576 /* PHI nodes can not have patterns or related statements. 
*/ 8577 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL 8578 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); 8579 } 8580 8581 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); 8582 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) 8583 { 8584 new_stmt = gsi_stmt (epilogue_gsi); 8585 8586 gcc_assert (gimple_uid (new_stmt) > 0); 8587 stmt_vinfo 8588 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; 8589 8590 orig_stmt = STMT_VINFO_STMT (stmt_vinfo); 8591 STMT_VINFO_STMT (stmt_vinfo) = new_stmt; 8592 8593 if (tree old_lhs = gimple_get_lhs (orig_stmt)) 8594 mapping.put (old_lhs, gimple_get_lhs (new_stmt)); 8595 8596 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) 8597 { 8598 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); 8599 for (gimple_stmt_iterator gsi = gsi_start (seq); 8600 !gsi_end_p (gsi); gsi_next (&gsi)) 8601 stmt_worklist.safe_push (gsi_stmt (gsi)); 8602 } 8603 8604 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); 8605 if (related_vinfo != NULL && related_vinfo != stmt_vinfo) 8606 { 8607 gimple *stmt = STMT_VINFO_STMT (related_vinfo); 8608 stmt_worklist.safe_push (stmt); 8609 /* Set BB such that the assert in 8610 'get_initial_def_for_reduction' is able to determine that 8611 the BB of the related stmt is inside this loop. */ 8612 gimple_set_bb (stmt, 8613 gimple_bb (new_stmt)); 8614 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); 8615 gcc_assert (related_vinfo == NULL 8616 || related_vinfo == stmt_vinfo); 8617 } 8618 } 8619 } 8620 8621 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed 8622 using the original main loop and thus need to be updated to refer to the 8623 cloned variables used in the epilogue. 
*/ 8624 for (unsigned i = 0; i < stmt_worklist.length (); ++i) 8625 { 8626 gimple *stmt = stmt_worklist[i]; 8627 tree *new_op; 8628 8629 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) 8630 { 8631 tree op = gimple_op (stmt, j); 8632 if ((new_op = mapping.get(op))) 8633 gimple_set_op (stmt, j, *new_op); 8634 else 8635 { 8636 /* PR92429: The last argument of simplify_replace_tree disables 8637 folding when replacing arguments. This is required as 8638 otherwise you might end up with different statements than the 8639 ones analyzed in vect_loop_analyze, leading to different 8640 vectorization. */ 8641 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, 8642 &find_in_mapping, &mapping, false); 8643 gimple_set_op (stmt, j, op); 8644 } 8645 } 8646 } 8647 8648 struct data_reference *dr; 8649 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs; 8650 FOR_EACH_VEC_ELT (datarefs, i, dr) 8651 { 8652 orig_stmt = DR_STMT (dr); 8653 gcc_assert (gimple_uid (orig_stmt) > 0); 8654 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; 8655 /* Data references for gather loads and scatter stores do not use the 8656 updated offset we set using ADVANCE. Instead we have to make sure the 8657 reference in the data references point to the corresponding copy of 8658 the original in the epilogue. */ 8659 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo)) 8660 == VMAT_GATHER_SCATTER) 8661 { 8662 DR_REF (dr) 8663 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, 8664 &find_in_mapping, &mapping); 8665 DR_BASE_ADDRESS (dr) 8666 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, 8667 &find_in_mapping, &mapping); 8668 } 8669 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); 8670 stmt_vinfo->dr_aux.stmt = stmt_vinfo; 8671 /* The vector size of the epilogue is smaller than that of the main loop 8672 so the alignment is either the same or lower. This means the dr will 8673 thus by definition be aligned. 
*/ 8674 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; 8675 } 8676 8677 epilogue_vinfo->shared->datarefs_copy.release (); 8678 epilogue_vinfo->shared->save_datarefs (); 8679} 8680 8681/* Function vect_transform_loop. 8682 8683 The analysis phase has determined that the loop is vectorizable. 8684 Vectorize the loop - created vectorized stmts to replace the scalar 8685 stmts in the loop, and update the loop exit condition. 8686 Returns scalar epilogue loop if any. */ 8687 8688class loop * 8689vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) 8690{ 8691 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 8692 class loop *epilogue = NULL; 8693 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); 8694 int nbbs = loop->num_nodes; 8695 int i; 8696 tree niters_vector = NULL_TREE; 8697 tree step_vector = NULL_TREE; 8698 tree niters_vector_mult_vf = NULL_TREE; 8699 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 8700 unsigned int lowest_vf = constant_lower_bound (vf); 8701 gimple *stmt; 8702 bool check_profitability = false; 8703 unsigned int th; 8704 8705 DUMP_VECT_SCOPE ("vec_transform_loop"); 8706 8707 loop_vinfo->shared->check_datarefs (); 8708 8709 /* Use the more conservative vectorization threshold. If the number 8710 of iterations is constant assume the cost check has been performed 8711 by our caller. If the threshold makes all loops profitable that 8712 run at least the (estimated) vectorization factor number of times 8713 checking is pointless, too. */ 8714 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); 8715 if (vect_apply_runtime_profitability_check_p (loop_vinfo)) 8716 { 8717 if (dump_enabled_p ()) 8718 dump_printf_loc (MSG_NOTE, vect_location, 8719 "Profitability threshold is %d loop iterations.\n", 8720 th); 8721 check_profitability = true; 8722 } 8723 8724 /* Make sure there exists a single-predecessor exit bb. Do this before 8725 versioning. */ 8726 edge e = single_exit (loop); 8727 if (! 
single_pred_p (e->dest)) 8728 { 8729 split_loop_exit_edge (e, true); 8730 if (dump_enabled_p ()) 8731 dump_printf (MSG_NOTE, "split exit edge\n"); 8732 } 8733 8734 /* Version the loop first, if required, so the profitability check 8735 comes first. */ 8736 8737 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) 8738 { 8739 class loop *sloop 8740 = vect_loop_versioning (loop_vinfo, loop_vectorized_call); 8741 sloop->force_vectorize = false; 8742 check_profitability = false; 8743 } 8744 8745 /* Make sure there exists a single-predecessor exit bb also on the 8746 scalar loop copy. Do this after versioning but before peeling 8747 so CFG structure is fine for both scalar and if-converted loop 8748 to make slpeel_duplicate_current_defs_from_edges face matched 8749 loop closed PHI nodes on the exit. */ 8750 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) 8751 { 8752 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)); 8753 if (! single_pred_p (e->dest)) 8754 { 8755 split_loop_exit_edge (e, true); 8756 if (dump_enabled_p ()) 8757 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n"); 8758 } 8759 } 8760 8761 tree niters = vect_build_loop_niters (loop_vinfo); 8762 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; 8763 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); 8764 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); 8765 tree advance; 8766 drs_init_vec orig_drs_init; 8767 8768 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, 8769 &step_vector, &niters_vector_mult_vf, th, 8770 check_profitability, niters_no_overflow, 8771 &advance); 8772 8773 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) 8774 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) 8775 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), 8776 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo)); 8777 8778 if (niters_vector == NULL_TREE) 8779 { 8780 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 8781 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8782 && known_eq 
(lowest_vf, vf)) 8783 { 8784 niters_vector 8785 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), 8786 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf); 8787 step_vector = build_one_cst (TREE_TYPE (niters)); 8788 } 8789 else 8790 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector, 8791 &step_vector, niters_no_overflow); 8792 } 8793 8794 /* 1) Make sure the loop header has exactly two entries 8795 2) Make sure we have a preheader basic block. */ 8796 8797 gcc_assert (EDGE_COUNT (loop->header->preds) == 2); 8798 8799 split_edge (loop_preheader_edge (loop)); 8800 8801 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 8802 && vect_use_loop_mask_for_alignment_p (loop_vinfo)) 8803 /* This will deal with any possible peeling. */ 8804 vect_prepare_for_masked_peels (loop_vinfo); 8805 8806 /* Schedule the SLP instances first, then handle loop vectorization 8807 below. */ 8808 if (!loop_vinfo->slp_instances.is_empty ()) 8809 { 8810 DUMP_VECT_SCOPE ("scheduling SLP instances"); 8811 vect_schedule_slp (loop_vinfo); 8812 } 8813 8814 /* FORNOW: the vectorizer supports only loops which body consist 8815 of one basic block (header + empty latch). When the vectorizer will 8816 support more involved loop forms, the order by which the BBs are 8817 traversed need to be reconsidered. 
*/ 8818 8819 for (i = 0; i < nbbs; i++) 8820 { 8821 basic_block bb = bbs[i]; 8822 stmt_vec_info stmt_info; 8823 8824 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 8825 gsi_next (&si)) 8826 { 8827 gphi *phi = si.phi (); 8828 if (dump_enabled_p ()) 8829 dump_printf_loc (MSG_NOTE, vect_location, 8830 "------>vectorizing phi: %G", phi); 8831 stmt_info = loop_vinfo->lookup_stmt (phi); 8832 if (!stmt_info) 8833 continue; 8834 8835 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info)) 8836 vect_loop_kill_debug_uses (loop, stmt_info); 8837 8838 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8839 && !STMT_VINFO_LIVE_P (stmt_info)) 8840 continue; 8841 8842 if (STMT_VINFO_VECTYPE (stmt_info) 8843 && (maybe_ne 8844 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf)) 8845 && dump_enabled_p ()) 8846 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n"); 8847 8848 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 8849 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 8850 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def 8851 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle 8852 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) 8853 && ! 
PURE_SLP_STMT (stmt_info)) 8854 { 8855 if (dump_enabled_p ()) 8856 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n"); 8857 vect_transform_stmt (stmt_info, NULL, NULL, NULL); 8858 } 8859 } 8860 8861 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si); 8862 gsi_next (&si)) 8863 { 8864 gphi *phi = si.phi (); 8865 stmt_info = loop_vinfo->lookup_stmt (phi); 8866 if (!stmt_info) 8867 continue; 8868 8869 if (!STMT_VINFO_RELEVANT_P (stmt_info) 8870 && !STMT_VINFO_LIVE_P (stmt_info)) 8871 continue; 8872 8873 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def 8874 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 8875 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def 8876 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle 8877 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def) 8878 && ! PURE_SLP_STMT (stmt_info)) 8879 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info); 8880 } 8881 8882 for (gimple_stmt_iterator si = gsi_start_bb (bb); 8883 !gsi_end_p (si);) 8884 { 8885 stmt = gsi_stmt (si); 8886 /* During vectorization remove existing clobber stmts. */ 8887 if (gimple_clobber_p (stmt)) 8888 { 8889 unlink_stmt_vdef (stmt); 8890 gsi_remove (&si, true); 8891 release_defs (stmt); 8892 } 8893 else 8894 { 8895 stmt_info = loop_vinfo->lookup_stmt (stmt); 8896 8897 /* vector stmts created in the outer-loop during vectorization of 8898 stmts in an inner-loop may not have a stmt_info, and do not 8899 need to be vectorized. 
*/ 8900 stmt_vec_info seen_store = NULL; 8901 if (stmt_info) 8902 { 8903 if (STMT_VINFO_IN_PATTERN_P (stmt_info)) 8904 { 8905 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info); 8906 for (gimple_stmt_iterator subsi = gsi_start (def_seq); 8907 !gsi_end_p (subsi); gsi_next (&subsi)) 8908 { 8909 stmt_vec_info pat_stmt_info 8910 = loop_vinfo->lookup_stmt (gsi_stmt (subsi)); 8911 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, 8912 &si, &seen_store); 8913 } 8914 stmt_vec_info pat_stmt_info 8915 = STMT_VINFO_RELATED_STMT (stmt_info); 8916 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, 8917 &si, &seen_store)) 8918 maybe_set_vectorized_backedge_value (loop_vinfo, 8919 pat_stmt_info); 8920 } 8921 else 8922 { 8923 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si, 8924 &seen_store)) 8925 maybe_set_vectorized_backedge_value (loop_vinfo, 8926 stmt_info); 8927 } 8928 } 8929 gsi_next (&si); 8930 if (seen_store) 8931 { 8932 if (STMT_VINFO_GROUPED_ACCESS (seen_store)) 8933 /* Interleaving. If IS_STORE is TRUE, the 8934 vectorization of the interleaving chain was 8935 completed - free all the stores in the chain. */ 8936 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store)); 8937 else 8938 /* Free the attached stmt_vec_info and remove the stmt. */ 8939 loop_vinfo->remove_stmt (stmt_info); 8940 } 8941 } 8942 } 8943 8944 /* Stub out scalar statements that must not survive vectorization. 8945 Doing this here helps with grouped statements, or statements that 8946 are involved in patterns. 
*/ 8947 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); 8948 !gsi_end_p (gsi); gsi_next (&gsi)) 8949 { 8950 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi)); 8951 if (!call || !gimple_call_internal_p (call)) 8952 continue; 8953 internal_fn ifn = gimple_call_internal_fn (call); 8954 if (ifn == IFN_MASK_LOAD) 8955 { 8956 tree lhs = gimple_get_lhs (call); 8957 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 8958 { 8959 tree zero = build_zero_cst (TREE_TYPE (lhs)); 8960 gimple *new_stmt = gimple_build_assign (lhs, zero); 8961 gsi_replace (&gsi, new_stmt, true); 8962 } 8963 } 8964 else if (conditional_internal_fn_code (ifn) != ERROR_MARK) 8965 { 8966 tree lhs = gimple_get_lhs (call); 8967 if (!VECTOR_TYPE_P (TREE_TYPE (lhs))) 8968 { 8969 tree else_arg 8970 = gimple_call_arg (call, gimple_call_num_args (call) - 1); 8971 gimple *new_stmt = gimple_build_assign (lhs, else_arg); 8972 gsi_replace (&gsi, new_stmt, true); 8973 } 8974 } 8975 } 8976 } /* BBs in loop */ 8977 8978 /* The vectorization factor is always > 1, so if we use an IV increment of 1. 8979 a zero NITERS becomes a nonzero NITERS_VECTOR. */ 8980 if (integer_onep (step_vector)) 8981 niters_no_overflow = true; 8982 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector, 8983 niters_vector_mult_vf, !niters_no_overflow); 8984 8985 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 8986 scale_profile_for_vect_loop (loop, assumed_vf); 8987 8988 /* True if the final iteration might not handle a full vector's 8989 worth of scalar iterations. */ 8990 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); 8991 /* The minimum number of iterations performed by the epilogue. This 8992 is 1 when peeling for gaps because we always need a final scalar 8993 iteration. */ 8994 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 
1 : 0; 8995 /* +1 to convert latch counts to loop iteration counts, 8996 -min_epilogue_iters to remove iterations that cannot be performed 8997 by the vector code. */ 8998 int bias_for_lowest = 1 - min_epilogue_iters; 8999 int bias_for_assumed = bias_for_lowest; 9000 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 9001 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 9002 { 9003 /* When the amount of peeling is known at compile time, the first 9004 iteration will have exactly alignment_npeels active elements. 9005 In the worst case it will have at least one. */ 9006 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1); 9007 bias_for_lowest += lowest_vf - min_first_active; 9008 bias_for_assumed += assumed_vf - min_first_active; 9009 } 9010 /* In these calculations the "- 1" converts loop iteration counts 9011 back to latch counts. */ 9012 if (loop->any_upper_bound) 9013 loop->nb_iterations_upper_bound 9014 = (final_iter_may_be_partial 9015 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest, 9016 lowest_vf) - 1 9017 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest, 9018 lowest_vf) - 1); 9019 if (loop->any_likely_upper_bound) 9020 loop->nb_iterations_likely_upper_bound 9021 = (final_iter_may_be_partial 9022 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound 9023 + bias_for_lowest, lowest_vf) - 1 9024 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound 9025 + bias_for_lowest, lowest_vf) - 1); 9026 if (loop->any_estimate) 9027 loop->nb_iterations_estimate 9028 = (final_iter_may_be_partial 9029 ? 
wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed, 9030 assumed_vf) - 1 9031 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed, 9032 assumed_vf) - 1); 9033 9034 if (dump_enabled_p ()) 9035 { 9036 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 9037 { 9038 dump_printf_loc (MSG_NOTE, vect_location, 9039 "LOOP VECTORIZED\n"); 9040 if (loop->inner) 9041 dump_printf_loc (MSG_NOTE, vect_location, 9042 "OUTER LOOP VECTORIZED\n"); 9043 dump_printf (MSG_NOTE, "\n"); 9044 } 9045 else 9046 dump_printf_loc (MSG_NOTE, vect_location, 9047 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n", 9048 GET_MODE_NAME (loop_vinfo->vector_mode)); 9049 } 9050 9051 /* Loops vectorized with a variable factor won't benefit from 9052 unrolling/peeling. */ 9053 if (!vf.is_constant ()) 9054 { 9055 loop->unroll = 1; 9056 if (dump_enabled_p ()) 9057 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to" 9058 " variable-length vectorization factor\n"); 9059 } 9060 /* Free SLP instances here because otherwise stmt reference counting 9061 won't work. */ 9062 slp_instance instance; 9063 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance) 9064 vect_free_slp_instance (instance, true); 9065 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release (); 9066 /* Clear-up safelen field since its value is invalid after vectorization 9067 since vectorized loop can have loop-carried dependencies. */ 9068 loop->safelen = 0; 9069 9070 if (epilogue) 9071 { 9072 update_epilogue_loop_vinfo (epilogue, advance); 9073 9074 epilogue->simduid = loop->simduid; 9075 epilogue->force_vectorize = loop->force_vectorize; 9076 epilogue->dont_vectorize = false; 9077 } 9078 9079 return epilogue; 9080} 9081 9082/* The code below is trying to perform simple optimization - revert 9083 if-conversion for masked stores, i.e. if the mask of a store is zero 9084 do not perform it and all stored value producers also if possible. 
For example,
     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] +2;
	}
   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
*/

void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.
	 NOTE(review): the comment and the code disagree here — EFALSE
	 (the edge into STORE_BB) is assigned an *unlikely* probability;
	 confirm which is intended.  */
      efalse->probability = profile_probability::unlikely ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store; stop at the first statement that cannot be
	     sunk.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* Sinking past a statement with a different VUSE would
		 change what memory state it observes.  */
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
   return the value that the induction variable must be able to hold
   in order to ensure that the loop ends with an all-false mask.
   Return -1 otherwise.  */
widest_int
vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}
