1/* Data References Analysis and Manipulation Utilities for Vectorization. 2 Copyright (C) 2003-2020 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6This file is part of GCC. 7 8GCC is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 3, or (at your option) any later 11version. 12 13GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License 19along with GCC; see the file COPYING3. If not see 20<http://www.gnu.org/licenses/>. */ 21 22#include "config.h" 23#include "system.h" 24#include "coretypes.h" 25#include "backend.h" 26#include "target.h" 27#include "rtl.h" 28#include "tree.h" 29#include "gimple.h" 30#include "predict.h" 31#include "memmodel.h" 32#include "tm_p.h" 33#include "ssa.h" 34#include "optabs-tree.h" 35#include "cgraph.h" 36#include "dumpfile.h" 37#include "alias.h" 38#include "fold-const.h" 39#include "stor-layout.h" 40#include "tree-eh.h" 41#include "gimplify.h" 42#include "gimple-iterator.h" 43#include "gimplify-me.h" 44#include "tree-ssa-loop-ivopts.h" 45#include "tree-ssa-loop-manip.h" 46#include "tree-ssa-loop.h" 47#include "cfgloop.h" 48#include "tree-scalar-evolution.h" 49#include "tree-vectorizer.h" 50#include "expr.h" 51#include "builtins.h" 52#include "tree-cfg.h" 53#include "tree-hash-traits.h" 54#include "vec-perm-indices.h" 55#include "internal-fn.h" 56 57/* Return true if load- or store-lanes optab OPTAB is implemented for 58 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */ 59 60static bool 61vect_lanes_optab_supported_p (const char *name, convert_optab optab, 62 tree vectype, unsigned HOST_WIDE_INT count) 63{ 64 machine_mode mode, array_mode; 65 bool limit_p; 66 67 mode = TYPE_MODE (vectype); 68 if (!targetm.array_mode (mode, count).exists (&array_mode)) 69 { 70 poly_uint64 bits = count * GET_MODE_BITSIZE (mode); 71 limit_p = !targetm.array_mode_supported_p (mode, count); 72 if (!int_mode_for_size (bits, limit_p).exists (&array_mode)) 73 { 74 if (dump_enabled_p ()) 75 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 76 "no array mode for %s[%wu]\n", 77 GET_MODE_NAME (mode), count); 78 return false; 79 } 80 } 81 82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing) 83 { 84 if (dump_enabled_p ()) 85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 86 "cannot use %s<%s><%s>\n", name, 87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode)); 88 return false; 89 } 90 91 if (dump_enabled_p ()) 92 dump_printf_loc (MSG_NOTE, vect_location, 93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode), 94 GET_MODE_NAME (mode)); 95 96 return true; 97} 98 99 100/* Return the smallest scalar part of STMT_INFO. 101 This is used to determine the vectype of the stmt. We generally set the 102 vectype according to the type of the result (lhs). For stmts whose 103 result-type is different than the type of the arguments (e.g., demotion, 104 promotion), vectype will be reset appropriately (later). Note that we have 105 to visit the smallest datatype in this function, because that determines the 106 VF. If the smallest datatype in the loop is present only as the rhs of a 107 promotion operation - we'd miss it. 
108 Such a case, where a variable of this datatype does not appear in the lhs 109 anywhere in the loop, can only occur if it's an invariant: e.g.: 110 'int_x = (int) short_inv', which we'd expect to have been optimized away by 111 invariant motion. However, we cannot rely on invariant motion to always 112 take invariants out of the loop, and so in the case of promotion we also 113 have to check the rhs. 114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding 115 types. */ 116 117tree 118vect_get_smallest_scalar_type (stmt_vec_info stmt_info, 119 HOST_WIDE_INT *lhs_size_unit, 120 HOST_WIDE_INT *rhs_size_unit) 121{ 122 tree scalar_type = gimple_expr_type (stmt_info->stmt); 123 HOST_WIDE_INT lhs, rhs; 124 125 /* During the analysis phase, this function is called on arbitrary 126 statements that might not have scalar results. */ 127 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type))) 128 return scalar_type; 129 130 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 131 132 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt); 133 if (assign 134 && (gimple_assign_cast_p (assign) 135 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR 136 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR 137 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR 138 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR 139 || gimple_assign_rhs_code (assign) == FLOAT_EXPR)) 140 { 141 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign)); 142 143 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 144 if (rhs < lhs) 145 scalar_type = rhs_type; 146 } 147 else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt)) 148 { 149 unsigned int i = 0; 150 if (gimple_call_internal_p (call)) 151 { 152 internal_fn ifn = gimple_call_internal_fn (call); 153 if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn)) 154 /* gimple_expr_type already picked the type of the loaded 155 or stored data. */ 156 i = ~0U; 157 else if (internal_fn_mask_index (ifn) == 0) 158 i = 1; 159 } 160 if (i < gimple_call_num_args (call)) 161 { 162 tree rhs_type = TREE_TYPE (gimple_call_arg (call, i)); 163 if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type))) 164 { 165 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type)); 166 if (rhs < lhs) 167 scalar_type = rhs_type; 168 } 169 } 170 } 171 172 *lhs_size_unit = lhs; 173 *rhs_size_unit = rhs; 174 return scalar_type; 175} 176 177 178/* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be 179 tested at run-time. Return TRUE if DDR was successfully inserted. 180 Return false if versioning is not supported. */ 181 182static opt_result 183vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo) 184{ 185 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 186 187 if ((unsigned) param_vect_max_version_for_alias_checks == 0) 188 return opt_result::failure_at (vect_location, 189 "will not create alias checks, as" 190 " --param vect-max-version-for-alias-checks" 191 " == 0\n"); 192 193 opt_result res 194 = runtime_alias_check_p (ddr, loop, 195 optimize_loop_nest_for_speed_p (loop)); 196 if (!res) 197 return res; 198 199 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr); 200 return opt_result::success (); 201} 202 203/* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. 
*/ 204 205static void 206vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value) 207{ 208 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo); 209 for (unsigned int i = 0; i < checks.length(); ++i) 210 if (checks[i] == value) 211 return; 212 213 if (dump_enabled_p ()) 214 dump_printf_loc (MSG_NOTE, vect_location, 215 "need run-time check that %T is nonzero\n", 216 value); 217 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value); 218} 219 220/* Return true if we know that the order of vectorized DR_INFO_A and 221 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and 222 DR_INFO_B. At least one of the accesses is a write. */ 223 224static bool 225vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b) 226{ 227 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 228 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 229 230 /* Single statements are always kept in their original order. */ 231 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a) 232 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)) 233 return true; 234 235 /* STMT_A and STMT_B belong to overlapping groups. All loads in a 236 SLP group are emitted at the position of the last scalar load and 237 all loads in an interleaving group are emitted at the position 238 of the first scalar load. 239 Stores in a group are emitted at the position of the last scalar store. 240 Compute that position and check whether the resulting order matches 241 the current one. 242 We have not yet decided between SLP and interleaving so we have 243 to conservatively assume both. */ 244 stmt_vec_info il_a; 245 stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a); 246 if (last_a) 247 { 248 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s; 249 s = DR_GROUP_NEXT_ELEMENT (s)) 250 last_a = get_later_stmt (last_a, s); 251 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a))) 252 { 253 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s; 254 s = DR_GROUP_NEXT_ELEMENT (s)) 255 if (get_later_stmt (il_a, s) == il_a) 256 il_a = s; 257 } 258 else 259 il_a = last_a; 260 } 261 else 262 last_a = il_a = stmtinfo_a; 263 stmt_vec_info il_b; 264 stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b); 265 if (last_b) 266 { 267 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s; 268 s = DR_GROUP_NEXT_ELEMENT (s)) 269 last_b = get_later_stmt (last_b, s); 270 if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b))) 271 { 272 for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s; 273 s = DR_GROUP_NEXT_ELEMENT (s)) 274 if (get_later_stmt (il_b, s) == il_b) 275 il_b = s; 276 } 277 else 278 il_b = last_b; 279 } 280 else 281 last_b = il_b = stmtinfo_b; 282 bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a); 283 return (/* SLP */ 284 (get_later_stmt (last_a, last_b) == last_a) == a_after_b 285 /* Interleaving */ 286 && (get_later_stmt (il_a, il_b) == il_a) == a_after_b 287 /* Mixed */ 288 && (get_later_stmt (il_a, last_b) == il_a) == a_after_b 289 && (get_later_stmt (last_a, il_b) == last_a) == a_after_b); 290} 291 292/* A subroutine of vect_analyze_data_ref_dependence. Handle 293 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence 294 distances. These distances are conservatively correct but they don't 295 reflect a guaranteed dependence. 296 297 Return true if this function does all the work necessary to avoid 298 an alias or false if the caller should use the dependence distances 299 to limit the vectorization factor in the usual way. 
LOOP_DEPTH is 300 the depth of the loop described by LOOP_VINFO and the other arguments 301 are as for vect_analyze_data_ref_dependence. */ 302 303static bool 304vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr, 305 loop_vec_info loop_vinfo, 306 int loop_depth, unsigned int *max_vf) 307{ 308 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 309 lambda_vector dist_v; 310 unsigned int i; 311 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 312 { 313 int dist = dist_v[loop_depth]; 314 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr))) 315 { 316 /* If the user asserted safelen >= DIST consecutive iterations 317 can be executed concurrently, assume independence. 318 319 ??? An alternative would be to add the alias check even 320 in this case, and vectorize the fallback loop with the 321 maximum VF set to safelen. However, if the user has 322 explicitly given a length, it's less likely that that 323 would be a win. */ 324 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen) 325 { 326 if ((unsigned int) loop->safelen < *max_vf) 327 *max_vf = loop->safelen; 328 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 329 continue; 330 } 331 332 /* For dependence distances of 2 or more, we have the option 333 of limiting VF or checking for an alias at runtime. 334 Prefer to check at runtime if we can, to avoid limiting 335 the VF unnecessarily when the bases are in fact independent. 336 337 Note that the alias checks will be removed if the VF ends up 338 being small enough. */ 339 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); 340 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); 341 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt) 342 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt) 343 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo)); 344 } 345 } 346 return true; 347} 348 349 350/* Function vect_analyze_data_ref_dependence. 351 352 FIXME: I needed to change the sense of the returned flag. 353 354 Return FALSE if there (might) exist a dependence between a memory-reference 355 DRA and a memory-reference DRB. When versioning for alias may check a 356 dependence at run-time, return TRUE. Adjust *MAX_VF according to 357 the data dependence. */ 358 359static opt_result 360vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr, 361 loop_vec_info loop_vinfo, 362 unsigned int *max_vf) 363{ 364 unsigned int i; 365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 366 struct data_reference *dra = DDR_A (ddr); 367 struct data_reference *drb = DDR_B (ddr); 368 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra); 369 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb); 370 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 371 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 372 lambda_vector dist_v; 373 unsigned int loop_depth; 374 375 /* In loop analysis all data references should be vectorizable. */ 376 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 377 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b)) 378 gcc_unreachable (); 379 380 /* Independent data accesses. */ 381 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 382 return opt_result::success (); 383 384 if (dra == drb 385 || (DR_IS_READ (dra) && DR_IS_READ (drb))) 386 return opt_result::success (); 387 388 /* We do not have to consider dependences between accesses that belong 389 to the same group, unless the stride could be smaller than the 390 group size. 
*/ 391 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) 392 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a) 393 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b)) 394 && !STMT_VINFO_STRIDED_P (stmtinfo_a)) 395 return opt_result::success (); 396 397 /* Even if we have an anti-dependence then, as the vectorized loop covers at 398 least two scalar iterations, there is always also a true dependence. 399 As the vectorizer does not re-order loads and stores we can ignore 400 the anti-dependence if TBAA can disambiguate both DRs similar to the 401 case with known negative distance anti-dependences (positive 402 distance anti-dependences would violate TBAA constraints). */ 403 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb)) 404 || (DR_IS_WRITE (dra) && DR_IS_READ (drb))) 405 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)), 406 get_alias_set (DR_REF (drb)))) 407 return opt_result::success (); 408 409 /* Unknown data dependence. */ 410 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 411 { 412 /* If user asserted safelen consecutive iterations can be 413 executed concurrently, assume independence. */ 414 if (loop->safelen >= 2) 415 { 416 if ((unsigned int) loop->safelen < *max_vf) 417 *max_vf = loop->safelen; 418 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 419 return opt_result::success (); 420 } 421 422 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 423 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 424 return opt_result::failure_at 425 (stmtinfo_a->stmt, 426 "versioning for alias not supported for: " 427 "can't determine dependence between %T and %T\n", 428 DR_REF (dra), DR_REF (drb)); 429 430 if (dump_enabled_p ()) 431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, 432 "versioning for alias required: " 433 "can't determine dependence between %T and %T\n", 434 DR_REF (dra), DR_REF (drb)); 435 436 /* Add to list of ddrs that need to be tested at run-time. */ 437 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 438 } 439 440 /* Known data dependence. */ 441 if (DDR_NUM_DIST_VECTS (ddr) == 0) 442 { 443 /* If user asserted safelen consecutive iterations can be 444 executed concurrently, assume independence. */ 445 if (loop->safelen >= 2) 446 { 447 if ((unsigned int) loop->safelen < *max_vf) 448 *max_vf = loop->safelen; 449 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false; 450 return opt_result::success (); 451 } 452 453 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 454 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 455 return opt_result::failure_at 456 (stmtinfo_a->stmt, 457 "versioning for alias not supported for: " 458 "bad dist vector for %T and %T\n", 459 DR_REF (dra), DR_REF (drb)); 460 461 if (dump_enabled_p ()) 462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt, 463 "versioning for alias required: " 464 "bad dist vector for %T and %T\n", 465 DR_REF (dra), DR_REF (drb)); 466 /* Add to list of ddrs that need to be tested at run-time. 
*/ 467 return vect_mark_for_runtime_alias_test (ddr, loop_vinfo); 468 } 469 470 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr)); 471 472 if (DDR_COULD_BE_INDEPENDENT_P (ddr) 473 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo, 474 loop_depth, max_vf)) 475 return opt_result::success (); 476 477 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 478 { 479 int dist = dist_v[loop_depth]; 480 481 if (dump_enabled_p ()) 482 dump_printf_loc (MSG_NOTE, vect_location, 483 "dependence distance = %d.\n", dist); 484 485 if (dist == 0) 486 { 487 if (dump_enabled_p ()) 488 dump_printf_loc (MSG_NOTE, vect_location, 489 "dependence distance == 0 between %T and %T\n", 490 DR_REF (dra), DR_REF (drb)); 491 492 /* When we perform grouped accesses and perform implicit CSE 493 by detecting equal accesses and doing disambiguation with 494 runtime alias tests like for 495 .. = a[i]; 496 .. = a[i+1]; 497 a[i] = ..; 498 a[i+1] = ..; 499 *p = ..; 500 .. = a[i]; 501 .. = a[i+1]; 502 where we will end up loading { a[i], a[i+1] } once, make 503 sure that inserting group loads before the first load and 504 stores after the last store will do the right thing. 505 Similar for groups like 506 a[i] = ...; 507 ... = a[i]; 508 a[i+1] = ...; 509 where loads from the group interleave with the store. */ 510 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) 511 return opt_result::failure_at (stmtinfo_a->stmt, 512 "READ_WRITE dependence" 513 " in interleaving.\n"); 514 515 if (loop->safelen < 2) 516 { 517 tree indicator = dr_zero_step_indicator (dra); 518 if (!indicator || integer_zerop (indicator)) 519 return opt_result::failure_at (stmtinfo_a->stmt, 520 "access also has a zero step\n"); 521 else if (TREE_CODE (indicator) != INTEGER_CST) 522 vect_check_nonzero_value (loop_vinfo, indicator); 523 } 524 continue; 525 } 526 527 if (dist > 0 && DDR_REVERSED_P (ddr)) 528 { 529 /* If DDR_REVERSED_P the order of the data-refs in DDR was 530 reversed (to make distance vector positive), and the actual 531 distance is negative. */ 532 if (dump_enabled_p ()) 533 dump_printf_loc (MSG_NOTE, vect_location, 534 "dependence distance negative.\n"); 535 /* When doing outer loop vectorization, we need to check if there is 536 a backward dependence at the inner loop level if the dependence 537 at the outer loop is reversed. See PR81740. */ 538 if (nested_in_vect_loop_p (loop, stmtinfo_a) 539 || nested_in_vect_loop_p (loop, stmtinfo_b)) 540 { 541 unsigned inner_depth = index_in_loop_nest (loop->inner->num, 542 DDR_LOOP_NEST (ddr)); 543 if (dist_v[inner_depth] < 0) 544 return opt_result::failure_at (stmtinfo_a->stmt, 545 "not vectorized, dependence " 546 "between data-refs %T and %T\n", 547 DR_REF (dra), DR_REF (drb)); 548 } 549 /* Record a negative dependence distance to later limit the 550 amount of stmt copying / unrolling we can perform. 551 Only need to handle read-after-write dependence. */ 552 if (DR_IS_READ (drb) 553 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0 554 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist)) 555 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist; 556 continue; 557 } 558 559 unsigned int abs_dist = abs (dist); 560 if (abs_dist >= 2 && abs_dist < *max_vf) 561 { 562 /* The dependence distance requires reduction of the maximal 563 vectorization factor. 
*/ 564 *max_vf = abs_dist; 565 if (dump_enabled_p ()) 566 dump_printf_loc (MSG_NOTE, vect_location, 567 "adjusting maximal vectorization factor to %i\n", 568 *max_vf); 569 } 570 571 if (abs_dist >= *max_vf) 572 { 573 /* Dependence distance does not create dependence, as far as 574 vectorization is concerned, in this case. */ 575 if (dump_enabled_p ()) 576 dump_printf_loc (MSG_NOTE, vect_location, 577 "dependence distance >= VF.\n"); 578 continue; 579 } 580 581 return opt_result::failure_at (stmtinfo_a->stmt, 582 "not vectorized, possible dependence " 583 "between data-refs %T and %T\n", 584 DR_REF (dra), DR_REF (drb)); 585 } 586 587 return opt_result::success (); 588} 589 590/* Function vect_analyze_data_ref_dependences. 591 592 Examine all the data references in the loop, and make sure there do not 593 exist any data dependences between them. Set *MAX_VF according to 594 the maximum vectorization factor the data dependences allow. */ 595 596opt_result 597vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, 598 unsigned int *max_vf) 599{ 600 unsigned int i; 601 struct data_dependence_relation *ddr; 602 603 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences"); 604 605 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ()) 606 { 607 LOOP_VINFO_DDRS (loop_vinfo) 608 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length () 609 * LOOP_VINFO_DATAREFS (loop_vinfo).length ()); 610 /* We need read-read dependences to compute 611 STMT_VINFO_SAME_ALIGN_REFS. */ 612 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo), 613 &LOOP_VINFO_DDRS (loop_vinfo), 614 LOOP_VINFO_LOOP_NEST (loop_vinfo), 615 true); 616 gcc_assert (res); 617 } 618 619 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true; 620 621 /* For epilogues we either have no aliases or alias versioning 622 was applied to original loop. Therefore we may just get max_vf 623 using VF of original loop. */ 624 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) 625 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo); 626 else 627 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr) 628 { 629 opt_result res 630 = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf); 631 if (!res) 632 return res; 633 } 634 635 return opt_result::success (); 636} 637 638 639/* Function vect_slp_analyze_data_ref_dependence. 640 641 Return TRUE if there (might) exist a dependence between a memory-reference 642 DRA and a memory-reference DRB for VINFO. When versioning for alias 643 may check a dependence at run-time, return FALSE. Adjust *MAX_VF 644 according to the data dependence. */ 645 646static bool 647vect_slp_analyze_data_ref_dependence (vec_info *vinfo, 648 struct data_dependence_relation *ddr) 649{ 650 struct data_reference *dra = DDR_A (ddr); 651 struct data_reference *drb = DDR_B (ddr); 652 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 653 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 654 655 /* We need to check dependences of statements marked as unvectorizable 656 as well, they still can prohibit vectorization. */ 657 658 /* Independent data accesses. */ 659 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 660 return false; 661 662 if (dra == drb) 663 return false; 664 665 /* Read-read is OK. */ 666 if (DR_IS_READ (dra) && DR_IS_READ (drb)) 667 return false; 668 669 /* If dra and drb are part of the same interleaving chain consider 670 them independent. 
*/ 671 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt) 672 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt) 673 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt))) 674 return false; 675 676 /* Unknown data dependence. */ 677 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) 678 { 679 if (dump_enabled_p ()) 680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 681 "can't determine dependence between %T and %T\n", 682 DR_REF (dra), DR_REF (drb)); 683 } 684 else if (dump_enabled_p ()) 685 dump_printf_loc (MSG_NOTE, vect_location, 686 "determined dependence between %T and %T\n", 687 DR_REF (dra), DR_REF (drb)); 688 689 return true; 690} 691 692 693/* Analyze dependences involved in the transform of SLP NODE. STORES 694 contain the vector of scalar stores of this instance if we are 695 disambiguating the loads. */ 696 697static bool 698vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node, 699 vec<stmt_vec_info> stores, 700 stmt_vec_info last_store_info) 701{ 702 /* This walks over all stmts involved in the SLP load/store done 703 in NODE verifying we can sink them up to the last stmt in the 704 group. */ 705 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node); 706 vec_info *vinfo = last_access_info->vinfo; 707 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 708 { 709 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k]; 710 if (access_info == last_access_info) 711 continue; 712 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info); 713 ao_ref ref; 714 bool ref_initialized_p = false; 715 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt); 716 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi)) 717 { 718 gimple *stmt = gsi_stmt (gsi); 719 if (! gimple_vuse (stmt) 720 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt))) 721 continue; 722 723 /* If we couldn't record a (single) data reference for this 724 stmt we have to resort to the alias oracle. */ 725 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt); 726 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info); 727 if (!dr_b) 728 { 729 /* We are moving a store or sinking a load - this means 730 we cannot use TBAA for disambiguation. */ 731 if (!ref_initialized_p) 732 ao_ref_init (&ref, DR_REF (dr_a)); 733 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false) 734 || ref_maybe_used_by_stmt_p (stmt, &ref, false)) 735 return false; 736 continue; 737 } 738 739 bool dependent = false; 740 /* If we run into a store of this same instance (we've just 741 marked those) then delay dependence checking until we run 742 into the last store because this is where it will have 743 been sunk to (and we verify if we can do that as well). */ 744 if (gimple_visited_p (stmt)) 745 { 746 if (stmt_info != last_store_info) 747 continue; 748 unsigned i; 749 stmt_vec_info store_info; 750 FOR_EACH_VEC_ELT (stores, i, store_info) 751 { 752 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info); 753 ddr_p ddr = initialize_data_dependence_relation 754 (dr_a, store_dr, vNULL); 755 dependent 756 = vect_slp_analyze_data_ref_dependence (vinfo, ddr); 757 free_dependence_relation (ddr); 758 if (dependent) 759 break; 760 } 761 } 762 else 763 { 764 ddr_p ddr = initialize_data_dependence_relation (dr_a, 765 dr_b, vNULL); 766 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr); 767 free_dependence_relation (ddr); 768 } 769 if (dependent) 770 return false; 771 } 772 } 773 return true; 774} 775 776 777/* Function vect_analyze_data_ref_dependences. 
778 779 Examine all the data references in the basic-block, and make sure there 780 do not exist any data dependences between them. Set *MAX_VF according to 781 the maximum vectorization factor the data dependences allow. */ 782 783bool 784vect_slp_analyze_instance_dependence (slp_instance instance) 785{ 786 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence"); 787 788 /* The stores of this instance are at the root of the SLP tree. */ 789 slp_tree store = SLP_INSTANCE_TREE (instance); 790 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0])) 791 store = NULL; 792 793 /* Verify we can sink stores to the vectorized stmt insert location. */ 794 stmt_vec_info last_store_info = NULL; 795 if (store) 796 { 797 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL)) 798 return false; 799 800 /* Mark stores in this instance and remember the last one. */ 801 last_store_info = vect_find_last_scalar_stmt_in_slp (store); 802 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 803 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true); 804 } 805 806 bool res = true; 807 808 /* Verify we can sink loads to the vectorized stmt insert location, 809 special-casing stores of this instance. */ 810 slp_tree load; 811 unsigned int i; 812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load) 813 if (! vect_slp_analyze_node_dependences (instance, load, 814 store 815 ? SLP_TREE_SCALAR_STMTS (store) 816 : vNULL, last_store_info)) 817 { 818 res = false; 819 break; 820 } 821 822 /* Unset the visited flag. */ 823 if (store) 824 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k) 825 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false); 826 827 return res; 828} 829 830/* Record the base alignment guarantee given by DRB, which occurs 831 in STMT_INFO. */ 832 833static void 834vect_record_base_alignment (stmt_vec_info stmt_info, 835 innermost_loop_behavior *drb) 836{ 837 vec_info *vinfo = stmt_info->vinfo; 838 bool existed; 839 innermost_loop_behavior *&entry 840 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed); 841 if (!existed || entry->base_alignment < drb->base_alignment) 842 { 843 entry = drb; 844 if (dump_enabled_p ()) 845 dump_printf_loc (MSG_NOTE, vect_location, 846 "recording new base alignment for %T\n" 847 " alignment: %d\n" 848 " misalignment: %d\n" 849 " based on: %G", 850 drb->base_address, 851 drb->base_alignment, 852 drb->base_misalignment, 853 stmt_info->stmt); 854 } 855} 856 857/* If the region we're going to vectorize is reached, all unconditional 858 data references occur at least once. We can therefore pool the base 859 alignment guarantees from each unconditional reference. Do this by 860 going through all the data references in VINFO and checking whether 861 the containing statement makes the reference unconditionally. If so, 862 record the alignment of the base address in VINFO so that it can be 863 used for all other references with the same base. */ 864 865void 866vect_record_base_alignments (vec_info *vinfo) 867{ 868 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo); 869 class loop *loop = loop_vinfo ? 
LOOP_VINFO_LOOP (loop_vinfo) : NULL; 870 data_reference *dr; 871 unsigned int i; 872 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr) 873 { 874 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 875 stmt_vec_info stmt_info = dr_info->stmt; 876 if (!DR_IS_CONDITIONAL_IN_STMT (dr) 877 && STMT_VINFO_VECTORIZABLE (stmt_info) 878 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 879 { 880 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr)); 881 882 /* If DR is nested in the loop that is being vectorized, we can also 883 record the alignment of the base wrt the outer loop. */ 884 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 885 vect_record_base_alignment 886 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info)); 887 } 888 } 889} 890 891/* Return the target alignment for the vectorized form of DR_INFO. */ 892 893static poly_uint64 894vect_calculate_target_alignment (dr_vec_info *dr_info) 895{ 896 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt); 897 return targetm.vectorize.preferred_vector_alignment (vectype); 898} 899 900/* Function vect_compute_data_ref_alignment 901 902 Compute the misalignment of the data reference DR_INFO. 903 904 Output: 905 1. DR_MISALIGNMENT (DR_INFO) is defined. 906 907 FOR NOW: No analysis is actually performed. Misalignment is calculated 908 only for trivial cases. TODO. */ 909 910static void 911vect_compute_data_ref_alignment (dr_vec_info *dr_info) 912{ 913 stmt_vec_info stmt_info = dr_info->stmt; 914 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments; 915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 916 class loop *loop = NULL; 917 tree ref = DR_REF (dr_info->dr); 918 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 919 920 if (dump_enabled_p ()) 921 dump_printf_loc (MSG_NOTE, vect_location, 922 "vect_compute_data_ref_alignment:\n"); 923 924 if (loop_vinfo) 925 loop = LOOP_VINFO_LOOP (loop_vinfo); 926 927 /* Initialize misalignment to unknown. */ 928 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN); 929 930 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 931 return; 932 933 innermost_loop_behavior *drb = vect_dr_behavior (dr_info); 934 bool step_preserves_misalignment_p; 935 936 poly_uint64 vector_alignment 937 = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT); 938 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment; 939 940 /* If the main loop has peeled for alignment we have no way of knowing 941 whether the data accesses in the epilogues are aligned. We can't at 942 compile time answer the question whether we have entered the main loop or 943 not. Fixes PR 92351. */ 944 if (loop_vinfo) 945 { 946 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); 947 if (orig_loop_vinfo 948 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo) != 0) 949 return; 950 } 951 952 unsigned HOST_WIDE_INT vect_align_c; 953 if (!vector_alignment.is_constant (&vect_align_c)) 954 return; 955 956 /* No step for BB vectorization. */ 957 if (!loop) 958 { 959 gcc_assert (integer_zerop (drb->step)); 960 step_preserves_misalignment_p = true; 961 } 962 963 /* In case the dataref is in an inner-loop of the loop that is being 964 vectorized (LOOP), we use the base and misalignment information 965 relative to the outer-loop (LOOP). This is ok only if the misalignment 966 stays the same throughout the execution of the inner-loop, which is why 967 we have to check that the stride of the dataref in the inner-loop evenly 968 divides by the vector alignment. 
*/ 969 else if (nested_in_vect_loop_p (loop, stmt_info)) 970 { 971 step_preserves_misalignment_p 972 = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0; 973 974 if (dump_enabled_p ()) 975 { 976 if (step_preserves_misalignment_p) 977 dump_printf_loc (MSG_NOTE, vect_location, 978 "inner step divides the vector alignment.\n"); 979 else 980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 981 "inner step doesn't divide the vector" 982 " alignment.\n"); 983 } 984 } 985 986 /* Similarly we can only use base and misalignment information relative to 987 an innermost loop if the misalignment stays the same throughout the 988 execution of the loop. As above, this is the case if the stride of 989 the dataref evenly divides by the alignment. */ 990 else 991 { 992 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 993 step_preserves_misalignment_p 994 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c); 995 996 if (!step_preserves_misalignment_p && dump_enabled_p ()) 997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 998 "step doesn't divide the vector alignment.\n"); 999 } 1000 1001 unsigned int base_alignment = drb->base_alignment; 1002 unsigned int base_misalignment = drb->base_misalignment; 1003 1004 /* Calculate the maximum of the pooled base address alignment and the 1005 alignment that we can compute for DR itself. */ 1006 innermost_loop_behavior **entry = base_alignments->get (drb->base_address); 1007 if (entry && base_alignment < (*entry)->base_alignment) 1008 { 1009 base_alignment = (*entry)->base_alignment; 1010 base_misalignment = (*entry)->base_misalignment; 1011 } 1012 1013 if (drb->offset_alignment < vect_align_c 1014 || !step_preserves_misalignment_p 1015 /* We need to know whether the step wrt the vectorized loop is 1016 negative when computing the starting misalignment below. */ 1017 || TREE_CODE (drb->step) != INTEGER_CST) 1018 { 1019 if (dump_enabled_p ()) 1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1021 "Unknown alignment for access: %T\n", ref); 1022 return; 1023 } 1024 1025 if (base_alignment < vect_align_c) 1026 { 1027 unsigned int max_alignment; 1028 tree base = get_base_for_alignment (drb->base_address, &max_alignment); 1029 if (max_alignment < vect_align_c 1030 || !vect_can_force_dr_alignment_p (base, 1031 vect_align_c * BITS_PER_UNIT)) 1032 { 1033 if (dump_enabled_p ()) 1034 dump_printf_loc (MSG_NOTE, vect_location, 1035 "can't force alignment of ref: %T\n", ref); 1036 return; 1037 } 1038 1039 /* Force the alignment of the decl. 1040 NOTE: This is the only change to the code we make during 1041 the analysis phase, before deciding to vectorize the loop. */ 1042 if (dump_enabled_p ()) 1043 dump_printf_loc (MSG_NOTE, vect_location, 1044 "force alignment of %T\n", ref); 1045 1046 dr_info->base_decl = base; 1047 dr_info->base_misaligned = true; 1048 base_misalignment = 0; 1049 } 1050 poly_int64 misalignment 1051 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi (); 1052 1053 /* If this is a backward running DR then first access in the larger 1054 vectype actually is N-1 elements before the address in the DR. 1055 Adjust misalign accordingly. */ 1056 if (tree_int_cst_sgn (drb->step) < 0) 1057 /* PLUS because STEP is negative. 
     */
    misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
                     * -TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype))));

  unsigned int const_misalignment;
  if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Non-constant misalignment for access: %T\n", ref);
      return;
    }

  SET_DR_MISALIGNMENT (dr_info, const_misalignment);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                     "misalign = %d bytes of ref %T\n",
                     DR_MISALIGNMENT (dr_info), ref);

  return;
}

/* Function vect_update_misalignment_for_peel.
   Sets DR_INFO's misalignment
   - to 0 if it has the same alignment as DR_PEEL_INFO,
   - to the misalignment computed using NPEEL if DR_INFO's misalignment is
     known,
   - to -1 (unknown) otherwise.

   DR_INFO - the data reference whose misalignment is to be adjusted.
   DR_PEEL_INFO - the data reference whose misalignment is being made
                  zero in the vector loop by the peel.
   NPEEL - the number of iterations in the peel loop if the misalignment
           of DR_PEEL_INFO is known at compile time.  */
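/* A purely illustrative example of the adjustment performed below (the
   numbers are not taken from any particular target): with a 16-byte target
   alignment, DR_MISALIGNMENT (DR_INFO) == 8 and DR_STEP == 4, peeling
   NPEEL == 2 iterations gives (8 + 2 * 4) & (16 - 1) == 0, i.e. DR_INFO
   becomes aligned as well.  */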
"not vectorized: unsupported unaligned load: %T\n" 1149 : "not vectorized: unsupported unaligned store: %T\n", 1150 DR_REF (dr_info->dr)); 1151 1152 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ()) 1153 dump_printf_loc (MSG_NOTE, vect_location, 1154 "Vectorizing an unaligned access.\n"); 1155 1156 return opt_result::success (); 1157} 1158 1159/* Function vect_verify_datarefs_alignment 1160 1161 Return TRUE if all data references in the loop can be 1162 handled with respect to alignment. */ 1163 1164opt_result 1165vect_verify_datarefs_alignment (loop_vec_info vinfo) 1166{ 1167 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 1168 struct data_reference *dr; 1169 unsigned int i; 1170 1171 FOR_EACH_VEC_ELT (datarefs, i, dr) 1172 { 1173 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 1174 stmt_vec_info stmt_info = dr_info->stmt; 1175 1176 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1177 continue; 1178 1179 /* For interleaving, only the alignment of the first access matters. */ 1180 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1181 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1182 continue; 1183 1184 /* Strided accesses perform only component accesses, alignment is 1185 irrelevant for them. */ 1186 if (STMT_VINFO_STRIDED_P (stmt_info) 1187 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1188 continue; 1189 1190 opt_result res = verify_data_ref_alignment (dr_info); 1191 if (!res) 1192 return res; 1193 } 1194 1195 return opt_result::success (); 1196} 1197 1198/* Given an memory reference EXP return whether its alignment is less 1199 than its size. */ 1200 1201static bool 1202not_size_aligned (tree exp) 1203{ 1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp)))) 1205 return true; 1206 1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp))) 1208 > get_object_alignment (exp)); 1209} 1210 1211/* Function vector_alignment_reachable_p 1212 1213 Return true if vector alignment for DR_INFO is reachable by peeling 1214 a few loop iterations. Return false otherwise. */ 1215 1216static bool 1217vector_alignment_reachable_p (dr_vec_info *dr_info) 1218{ 1219 stmt_vec_info stmt_info = dr_info->stmt; 1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1221 1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1223 { 1224 /* For interleaved access we peel only if number of iterations in 1225 the prolog loop ({VF - misalignment}), is a multiple of the 1226 number of the interleaved accesses. */ 1227 int elem_size, mis_in_elements; 1228 1229 /* FORNOW: handle only known alignment. */ 1230 if (!known_alignment_for_access_p (dr_info)) 1231 return false; 1232 1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype); 1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype)); 1235 elem_size = vector_element_size (vector_size, nelements); 1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size; 1237 1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info))) 1239 return false; 1240 } 1241 1242 /* If misalignment is known at the compile time then allow peeling 1243 only if natural alignment is reachable through peeling. */ 1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info)) 1245 { 1246 HOST_WIDE_INT elmsize = 1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1248 if (dump_enabled_p ()) 1249 { 1250 dump_printf_loc (MSG_NOTE, vect_location, 1251 "data size = %wd. 
misalignment = %d.\n", elmsize, 1252 DR_MISALIGNMENT (dr_info)); 1253 } 1254 if (DR_MISALIGNMENT (dr_info) % elmsize) 1255 { 1256 if (dump_enabled_p ()) 1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1258 "data size does not divide the misalignment.\n"); 1259 return false; 1260 } 1261 } 1262 1263 if (!known_alignment_for_access_p (dr_info)) 1264 { 1265 tree type = TREE_TYPE (DR_REF (dr_info->dr)); 1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr)); 1267 if (dump_enabled_p ()) 1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1269 "Unknown misalignment, %snaturally aligned\n", 1270 is_packed ? "not " : ""); 1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed); 1272 } 1273 1274 return true; 1275} 1276 1277 1278/* Calculate the cost of the memory access represented by DR_INFO. */ 1279 1280static void 1281vect_get_data_access_cost (dr_vec_info *dr_info, 1282 unsigned int *inside_cost, 1283 unsigned int *outside_cost, 1284 stmt_vector_for_cost *body_cost_vec, 1285 stmt_vector_for_cost *prologue_cost_vec) 1286{ 1287 stmt_vec_info stmt_info = dr_info->stmt; 1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1289 int ncopies; 1290 1291 if (PURE_SLP_STMT (stmt_info)) 1292 ncopies = 1; 1293 else 1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info)); 1295 1296 if (DR_IS_READ (dr_info->dr)) 1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost, 1298 prologue_cost_vec, body_cost_vec, false); 1299 else 1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec); 1301 1302 if (dump_enabled_p ()) 1303 dump_printf_loc (MSG_NOTE, vect_location, 1304 "vect_get_data_access_cost: inside_cost = %d, " 1305 "outside_cost = %d.\n", *inside_cost, *outside_cost); 1306} 1307 1308 1309typedef struct _vect_peel_info 1310{ 1311 dr_vec_info *dr_info; 1312 int npeel; 1313 unsigned int count; 1314} *vect_peel_info; 1315 1316typedef struct _vect_peel_extended_info 1317{ 1318 struct _vect_peel_info peel_info; 1319 unsigned int inside_cost; 1320 unsigned int outside_cost; 1321} *vect_peel_extended_info; 1322 1323 1324/* Peeling hashtable helpers. */ 1325 1326struct peel_info_hasher : free_ptr_hash <_vect_peel_info> 1327{ 1328 static inline hashval_t hash (const _vect_peel_info *); 1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *); 1330}; 1331 1332inline hashval_t 1333peel_info_hasher::hash (const _vect_peel_info *peel_info) 1334{ 1335 return (hashval_t) peel_info->npeel; 1336} 1337 1338inline bool 1339peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b) 1340{ 1341 return (a->npeel == b->npeel); 1342} 1343 1344 1345/* Insert DR_INFO into peeling hash table with NPEEL as key. 
*/ 1346 1347static void 1348vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab, 1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info, 1350 int npeel) 1351{ 1352 struct _vect_peel_info elem, *slot; 1353 _vect_peel_info **new_slot; 1354 bool supportable_dr_alignment 1355 = vect_supportable_dr_alignment (dr_info, true); 1356 1357 elem.npeel = npeel; 1358 slot = peeling_htab->find (&elem); 1359 if (slot) 1360 slot->count++; 1361 else 1362 { 1363 slot = XNEW (struct _vect_peel_info); 1364 slot->npeel = npeel; 1365 slot->dr_info = dr_info; 1366 slot->count = 1; 1367 new_slot = peeling_htab->find_slot (slot, INSERT); 1368 *new_slot = slot; 1369 } 1370 1371 if (!supportable_dr_alignment 1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1373 slot->count += VECT_MAX_COST; 1374} 1375 1376 1377/* Traverse peeling hash table to find peeling option that aligns maximum 1378 number of data accesses. */ 1379 1380int 1381vect_peeling_hash_get_most_frequent (_vect_peel_info **slot, 1382 _vect_peel_extended_info *max) 1383{ 1384 vect_peel_info elem = *slot; 1385 1386 if (elem->count > max->peel_info.count 1387 || (elem->count == max->peel_info.count 1388 && max->peel_info.npeel > elem->npeel)) 1389 { 1390 max->peel_info.npeel = elem->npeel; 1391 max->peel_info.count = elem->count; 1392 max->peel_info.dr_info = elem->dr_info; 1393 } 1394 1395 return 1; 1396} 1397 1398/* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking 1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true, 1400 we assume DR0_INFO's misalignment will be zero after peeling. */ 1401 1402static void 1403vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo, 1404 dr_vec_info *dr0_info, 1405 unsigned int *inside_cost, 1406 unsigned int *outside_cost, 1407 stmt_vector_for_cost *body_cost_vec, 1408 stmt_vector_for_cost *prologue_cost_vec, 1409 unsigned int npeel, 1410 bool unknown_misalignment) 1411{ 1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1413 unsigned i; 1414 data_reference *dr; 1415 1416 FOR_EACH_VEC_ELT (datarefs, i, dr) 1417 { 1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1419 stmt_vec_info stmt_info = dr_info->stmt; 1420 if (!STMT_VINFO_RELEVANT_P (stmt_info)) 1421 continue; 1422 1423 /* For interleaving, only the alignment of the first access 1424 matters. */ 1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info) 1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info) 1427 continue; 1428 1429 /* Strided accesses perform only component accesses, alignment is 1430 irrelevant for them. */ 1431 if (STMT_VINFO_STRIDED_P (stmt_info) 1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 1433 continue; 1434 1435 int save_misalignment; 1436 save_misalignment = DR_MISALIGNMENT (dr_info); 1437 if (npeel == 0) 1438 ; 1439 else if (unknown_misalignment && dr_info == dr0_info) 1440 SET_DR_MISALIGNMENT (dr_info, 0); 1441 else 1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); 1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost, 1444 body_cost_vec, prologue_cost_vec); 1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment); 1446 } 1447} 1448 1449/* Traverse peeling hash table and calculate cost for each peeling option. 1450 Find the one with the lowest cost. 
*/ 1451 1452int 1453vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot, 1454 _vect_peel_extended_info *min) 1455{ 1456 vect_peel_info elem = *slot; 1457 int dummy; 1458 unsigned int inside_cost = 0, outside_cost = 0; 1459 stmt_vec_info stmt_info = elem->dr_info->stmt; 1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, 1462 epilogue_cost_vec; 1463 1464 prologue_cost_vec.create (2); 1465 body_cost_vec.create (2); 1466 epilogue_cost_vec.create (2); 1467 1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost, 1469 &outside_cost, &body_cost_vec, 1470 &prologue_cost_vec, elem->npeel, false); 1471 1472 body_cost_vec.release (); 1473 1474 outside_cost += vect_get_known_peeling_cost 1475 (loop_vinfo, elem->npeel, &dummy, 1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1477 &prologue_cost_vec, &epilogue_cost_vec); 1478 1479 /* Prologue and epilogue costs are added to the target model later. 1480 These costs depend only on the scalar iteration cost, the 1481 number of peeling iterations finally chosen, and the number of 1482 misaligned statements. So discard the information found here. */ 1483 prologue_cost_vec.release (); 1484 epilogue_cost_vec.release (); 1485 1486 if (inside_cost < min->inside_cost 1487 || (inside_cost == min->inside_cost 1488 && outside_cost < min->outside_cost)) 1489 { 1490 min->inside_cost = inside_cost; 1491 min->outside_cost = outside_cost; 1492 min->peel_info.dr_info = elem->dr_info; 1493 min->peel_info.npeel = elem->npeel; 1494 min->peel_info.count = elem->count; 1495 } 1496 1497 return 1; 1498} 1499 1500 1501/* Choose best peeling option by traversing peeling hash table and either 1502 choosing an option with the lowest cost (if cost model is enabled) or the 1503 option that aligns as many accesses as possible. */ 1504 1505static struct _vect_peel_extended_info 1506vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab, 1507 loop_vec_info loop_vinfo) 1508{ 1509 struct _vect_peel_extended_info res; 1510 1511 res.peel_info.dr_info = NULL; 1512 1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) 1514 { 1515 res.inside_cost = INT_MAX; 1516 res.outside_cost = INT_MAX; 1517 peeling_htab->traverse <_vect_peel_extended_info *, 1518 vect_peeling_hash_get_lowest_cost> (&res); 1519 } 1520 else 1521 { 1522 res.peel_info.count = 0; 1523 peeling_htab->traverse <_vect_peel_extended_info *, 1524 vect_peeling_hash_get_most_frequent> (&res); 1525 res.inside_cost = 0; 1526 res.outside_cost = 0; 1527 } 1528 1529 return res; 1530} 1531 1532/* Return true if the new peeling NPEEL is supported. */ 1533 1534static bool 1535vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info, 1536 unsigned npeel) 1537{ 1538 unsigned i; 1539 struct data_reference *dr = NULL; 1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1541 enum dr_alignment_support supportable_dr_alignment; 1542 1543 /* Ensure that all data refs can be vectorized after the peel. */ 1544 FOR_EACH_VEC_ELT (datarefs, i, dr) 1545 { 1546 int save_misalignment; 1547 1548 if (dr == dr0_info->dr) 1549 continue; 1550 1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1552 stmt_vec_info stmt_info = dr_info->stmt; 1553 /* For interleaving, only the alignment of the first access 1554 matters. 
         */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
          && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
        continue;

      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
          && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;

      save_misalignment = DR_MISALIGNMENT (dr_info);
      vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
      supportable_dr_alignment
        = vect_supportable_dr_alignment (dr_info, false);
      SET_DR_MISALIGNMENT (dr_info, save_misalignment);

      if (!supportable_dr_alignment)
        return false;
    }

  return true;
}

/* Function vect_enhance_data_refs_alignment

   This pass will use loop versioning and loop peeling in order to enhance
   the alignment of data references in the loop.

   FOR NOW: we assume that whatever versioning/peeling takes place, only the
   original loop is to be vectorized.  Any other loops that are created by
   the transformations performed in this pass - are not supposed to be
   vectorized.  This restriction will be relaxed.

   This pass will require a cost model to guide it whether to apply peeling
   or versioning or a combination of the two.  For example, the scheme that
   Intel uses when given a loop with several memory accesses is as follows:
   choose one memory access ('p') whose alignment you want to force by doing
   peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
   other accesses are not necessarily aligned, or (2) use loop versioning to
   generate one loop in which all accesses are aligned, and another loop in
   which only 'p' is necessarily aligned.

   ("Automatic Intra-Register Vectorization for the Intel Architecture",
   Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
   Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)

   Devising a cost model is the most critical aspect of this work.  It will
   guide us on which access to peel for, whether to use loop versioning, how
   many versions to create, etc.  The cost model will probably consist of
   generic considerations as well as target specific considerations (on
   powerpc for example, misaligned stores are more painful than misaligned
   loads).

   Here are the general steps involved in alignment enhancements:

     -- original loop, before alignment analysis:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = unknown
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- After vect_compute_data_refs_alignment:
        for (i=0; i<N; i++){
          x = q[i];                     # DR_MISALIGNMENT(q) = 3
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 1: we do loop versioning:
        if (p is aligned) {
          for (i=0; i<N; i++){          # loop 1A
            x = q[i];                   # DR_MISALIGNMENT(q) = 3
            p[i] = y;                   # DR_MISALIGNMENT(p) = 0
          }
        }
        else {
          for (i=0; i<N; i++){          # loop 1B
            x = q[i];                   # DR_MISALIGNMENT(q) = 3
            p[i] = y;                   # DR_MISALIGNMENT(p) = unaligned
          }
        }

     -- Possibility 2: we do loop peeling:
        for (i = 0; i < 3; i++){        # (scalar loop, not to be vectorized).
          x = q[i];
          p[i] = y;
        }
        for (i = 3; i < N; i++){        # loop 2A
          x = q[i];                     # DR_MISALIGNMENT(q) = 0
          p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
        }

     -- Possibility 3: combination of loop peeling and versioning:
        for (i = 0; i < 3; i++){        # (scalar loop, not to be vectorized).
          x = q[i];
          p[i] = y;
        }
        if (p is aligned) {
          for (i = 3; i<N; i++){        # loop 3A
            x = q[i];                   # DR_MISALIGNMENT(q) = 0
            p[i] = y;                   # DR_MISALIGNMENT(p) = 0
          }
        }
        else {
          for (i = 3; i<N; i++){        # loop 3B
            x = q[i];                   # DR_MISALIGNMENT(q) = 0
            p[i] = y;                   # DR_MISALIGNMENT(p) = unaligned
          }
        }

   These loops are later passed to loop_transform to be vectorized.  The
   vectorizer will use the alignment information to guide the transformation
   (whether to generate regular loads/stores, or with special handling for
   misalignment).  */

opt_result
vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
{
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum dr_alignment_support supportable_dr_alignment;
  dr_vec_info *first_store = NULL;
  dr_vec_info *dr0_info = NULL;
  struct data_reference *dr;
  unsigned int i, j;
  bool do_peeling = false;
  bool do_versioning = false;
  unsigned int npeel = 0;
  bool one_misalignment_known = false;
  bool one_misalignment_unknown = false;
  bool one_dr_unsupportable = false;
  dr_vec_info *unsupportable_dr_info = NULL;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned possible_npeel_number = 1;
  tree vectype;
  unsigned int mis, same_align_drs_max = 0;
  hash_table<peel_info_hasher> peeling_htab (1);

  DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");

  /* Reset data so we can safely be called multiple times.  */
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;

  /* While cost model enhancements are expected in the future, the high level
     view of the code at this time is as follows:

     A) If there is a misaligned access then see if peeling to align
        this access can make all data references satisfy
        vect_supportable_dr_alignment.  If so, update data structures
        as needed and return true.

     B) If peeling wasn't possible and there is a data reference with an
        unknown misalignment that does not satisfy vect_supportable_dr_alignment
        then see if loop versioning checks can be used to make all data
        references satisfy vect_supportable_dr_alignment.  If so, update
        data structures as needed and return true.

     C) If neither peeling nor versioning was successful then return false if
        any data reference does not satisfy vect_supportable_dr_alignment.

     D) Return true (all data references satisfy vect_supportable_dr_alignment).

     Note, Possibility 3 above (which is peeling and versioning together) is not
     being done at this time.  */

  /* (1) Peeling to force alignment.  */

  /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
     Considerations:
     + How many accesses will become aligned due to the peeling
     - How many accesses will become unaligned due to the peeling,
       and the cost of misaligned accesses.
     - The cost of peeling (the extra runtime checks, the increase
       in code size).  */
*/

  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
      stmt_vec_info stmt_info = dr_info->stmt;

      if (!STMT_VINFO_RELEVANT_P (stmt_info))
	continue;

      /* For interleaving, only the alignment of the first access
	 matters.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
	  && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
	continue;

      /* For scatter-gather or invariant accesses there is nothing
	 to enhance.  */
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	  || integer_zerop (DR_STEP (dr)))
	continue;

      /* Strided accesses perform only component accesses, alignment is
	 irrelevant for them.  */
      if (STMT_VINFO_STRIDED_P (stmt_info)
	  && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
	continue;

      supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
      do_peeling = vector_alignment_reachable_p (dr_info);
      if (do_peeling)
	{
	  if (known_alignment_for_access_p (dr_info))
	    {
	      unsigned int npeel_tmp = 0;
	      bool negative = tree_int_cst_compare (DR_STEP (dr),
						    size_zero_node) < 0;

	      vectype = STMT_VINFO_VECTYPE (stmt_info);
	      /* If known_alignment_for_access_p then we have set
		 DR_MISALIGNMENT, which is only done if we know it at compile
		 time, so it is safe to assume the target alignment is
		 constant.  */
	      unsigned int target_align =
		DR_TARGET_ALIGNMENT (dr_info).to_constant ();
	      unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
	      mis = (negative
		     ? DR_MISALIGNMENT (dr_info)
		     : -DR_MISALIGNMENT (dr_info));
	      if (DR_MISALIGNMENT (dr_info) != 0)
		npeel_tmp = (mis & (target_align - 1)) / dr_size;

	      /* For multiple types, it is possible that the bigger type access
		 will have more than one peeling option.  E.g., a loop with two
		 types: one of size (vector size / 4), and the other one of
		 size (vector size / 8).  The vectorization factor will be 8.
		 If both accesses are misaligned by 3, the first one needs one
		 scalar iteration to be aligned, and the second one needs 5.
		 But the first one will also be aligned by peeling 5 scalar
		 iterations, and in that case both accesses will be aligned.
		 Hence, except for the immediate peeling amount, we also want
		 to try adding a full vector size, as long as we don't exceed
		 the vectorization factor.
		 We do this automatically for the cost model, since we
		 calculate the cost for every peeling option.  */
	      if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
		{
		  poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
					  ? vf * DR_GROUP_SIZE (stmt_info) : vf);
		  possible_npeel_number
		    = vect_get_num_vectors (nscalars, vectype);

		  /* NPEEL_TMP is 0 when there is no misalignment, but also
		     allow peeling NELEMENTS.  */
		  if (DR_MISALIGNMENT (dr_info) == 0)
		    possible_npeel_number++;
		}

	      /* Save info about DR in the hash table.  Also include peeling
		 amounts according to the explanation above.  */
	      for (j = 0; j < possible_npeel_number; j++)
		{
		  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
					    dr_info, npeel_tmp);
		  npeel_tmp += target_align / dr_size;
		}

	      one_misalignment_known = true;
	    }
	  else
	    {
	      /* If we don't know any misalignment values, we prefer
		 peeling for the data-ref that has the maximum number of
		 data-refs with the same alignment, unless the target
		 prefers to align stores over loads.
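		 As an illustration (data-ref names made up): if loads
		 q0[i], q1[i] and q2[i] are known to share the same (unknown)
		 misalignment while the store p[i] stands alone, peeling for
		 q0 lines up three references at once, so q0 is preferred
		 unless misaligned stores are expensive enough on the target
		 that the store wins the cost comparison done below.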
*/ 1823 unsigned same_align_drs 1824 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length (); 1825 if (!dr0_info 1826 || same_align_drs_max < same_align_drs) 1827 { 1828 same_align_drs_max = same_align_drs; 1829 dr0_info = dr_info; 1830 } 1831 /* For data-refs with the same number of related 1832 accesses prefer the one where the misalign 1833 computation will be invariant in the outermost loop. */ 1834 else if (same_align_drs_max == same_align_drs) 1835 { 1836 class loop *ivloop0, *ivloop; 1837 ivloop0 = outermost_invariant_loop_for_expr 1838 (loop, DR_BASE_ADDRESS (dr0_info->dr)); 1839 ivloop = outermost_invariant_loop_for_expr 1840 (loop, DR_BASE_ADDRESS (dr)); 1841 if ((ivloop && !ivloop0) 1842 || (ivloop && ivloop0 1843 && flow_loop_nested_p (ivloop, ivloop0))) 1844 dr0_info = dr_info; 1845 } 1846 1847 one_misalignment_unknown = true; 1848 1849 /* Check for data refs with unsupportable alignment that 1850 can be peeled. */ 1851 if (!supportable_dr_alignment) 1852 { 1853 one_dr_unsupportable = true; 1854 unsupportable_dr_info = dr_info; 1855 } 1856 1857 if (!first_store && DR_IS_WRITE (dr)) 1858 first_store = dr_info; 1859 } 1860 } 1861 else 1862 { 1863 if (!aligned_access_p (dr_info)) 1864 { 1865 if (dump_enabled_p ()) 1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1867 "vector alignment may not be reachable\n"); 1868 break; 1869 } 1870 } 1871 } 1872 1873 /* Check if we can possibly peel the loop. */ 1874 if (!vect_can_advance_ivs_p (loop_vinfo) 1875 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)) 1876 || loop->inner) 1877 do_peeling = false; 1878 1879 struct _vect_peel_extended_info peel_for_known_alignment; 1880 struct _vect_peel_extended_info peel_for_unknown_alignment; 1881 struct _vect_peel_extended_info best_peel; 1882 1883 peel_for_unknown_alignment.inside_cost = INT_MAX; 1884 peel_for_unknown_alignment.outside_cost = INT_MAX; 1885 peel_for_unknown_alignment.peel_info.count = 0; 1886 1887 if (do_peeling 1888 && one_misalignment_unknown) 1889 { 1890 /* Check if the target requires to prefer stores over loads, i.e., if 1891 misaligned stores are more expensive than misaligned loads (taking 1892 drs with same alignment into account). 
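     With purely hypothetical costs: if peeling for the first store is
     estimated at inside cost 40 while peeling for the current load
     candidate costs 50, DR0_INFO is switched to FIRST_STORE below;
     outside costs only break ties.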
*/ 1893 unsigned int load_inside_cost = 0; 1894 unsigned int load_outside_cost = 0; 1895 unsigned int store_inside_cost = 0; 1896 unsigned int store_outside_cost = 0; 1897 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2; 1898 1899 stmt_vector_for_cost dummy; 1900 dummy.create (2); 1901 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info, 1902 &load_inside_cost, 1903 &load_outside_cost, 1904 &dummy, &dummy, estimated_npeels, true); 1905 dummy.release (); 1906 1907 if (first_store) 1908 { 1909 dummy.create (2); 1910 vect_get_peeling_costs_all_drs (loop_vinfo, first_store, 1911 &store_inside_cost, 1912 &store_outside_cost, 1913 &dummy, &dummy, 1914 estimated_npeels, true); 1915 dummy.release (); 1916 } 1917 else 1918 { 1919 store_inside_cost = INT_MAX; 1920 store_outside_cost = INT_MAX; 1921 } 1922 1923 if (load_inside_cost > store_inside_cost 1924 || (load_inside_cost == store_inside_cost 1925 && load_outside_cost > store_outside_cost)) 1926 { 1927 dr0_info = first_store; 1928 peel_for_unknown_alignment.inside_cost = store_inside_cost; 1929 peel_for_unknown_alignment.outside_cost = store_outside_cost; 1930 } 1931 else 1932 { 1933 peel_for_unknown_alignment.inside_cost = load_inside_cost; 1934 peel_for_unknown_alignment.outside_cost = load_outside_cost; 1935 } 1936 1937 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 1938 prologue_cost_vec.create (2); 1939 epilogue_cost_vec.create (2); 1940 1941 int dummy2; 1942 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost 1943 (loop_vinfo, estimated_npeels, &dummy2, 1944 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 1945 &prologue_cost_vec, &epilogue_cost_vec); 1946 1947 prologue_cost_vec.release (); 1948 epilogue_cost_vec.release (); 1949 1950 peel_for_unknown_alignment.peel_info.count = 1 1951 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length (); 1952 } 1953 1954 peel_for_unknown_alignment.peel_info.npeel = 0; 1955 peel_for_unknown_alignment.peel_info.dr_info = dr0_info; 1956 1957 best_peel = peel_for_unknown_alignment; 1958 1959 peel_for_known_alignment.inside_cost = INT_MAX; 1960 peel_for_known_alignment.outside_cost = INT_MAX; 1961 peel_for_known_alignment.peel_info.count = 0; 1962 peel_for_known_alignment.peel_info.dr_info = NULL; 1963 1964 if (do_peeling && one_misalignment_known) 1965 { 1966 /* Peeling is possible, but there is no data access that is not supported 1967 unless aligned. So we try to choose the best possible peeling from 1968 the hash table. */ 1969 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling 1970 (&peeling_htab, loop_vinfo); 1971 } 1972 1973 /* Compare costs of peeling for known and unknown alignment. */ 1974 if (peel_for_known_alignment.peel_info.dr_info != NULL 1975 && peel_for_unknown_alignment.inside_cost 1976 >= peel_for_known_alignment.inside_cost) 1977 { 1978 best_peel = peel_for_known_alignment; 1979 1980 /* If the best peeling for known alignment has NPEEL == 0, perform no 1981 peeling at all except if there is an unsupportable dr that we can 1982 align. */ 1983 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable) 1984 do_peeling = false; 1985 } 1986 1987 /* If there is an unsupportable data ref, prefer this over all choices so far 1988 since we'd have to discard a chosen peeling except when it accidentally 1989 aligned the unsupportable data ref. */ 1990 if (one_dr_unsupportable) 1991 dr0_info = unsupportable_dr_info; 1992 else if (do_peeling) 1993 { 1994 /* Calculate the penalty for no peeling, i.e. leaving everything as-is. 
1995 TODO: Use nopeel_outside_cost or get rid of it? */ 1996 unsigned nopeel_inside_cost = 0; 1997 unsigned nopeel_outside_cost = 0; 1998 1999 stmt_vector_for_cost dummy; 2000 dummy.create (2); 2001 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost, 2002 &nopeel_outside_cost, &dummy, &dummy, 2003 0, false); 2004 dummy.release (); 2005 2006 /* Add epilogue costs. As we do not peel for alignment here, no prologue 2007 costs will be recorded. */ 2008 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec; 2009 prologue_cost_vec.create (2); 2010 epilogue_cost_vec.create (2); 2011 2012 int dummy2; 2013 nopeel_outside_cost += vect_get_known_peeling_cost 2014 (loop_vinfo, 0, &dummy2, 2015 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), 2016 &prologue_cost_vec, &epilogue_cost_vec); 2017 2018 prologue_cost_vec.release (); 2019 epilogue_cost_vec.release (); 2020 2021 npeel = best_peel.peel_info.npeel; 2022 dr0_info = best_peel.peel_info.dr_info; 2023 2024 /* If no peeling is not more expensive than the best peeling we 2025 have so far, don't perform any peeling. */ 2026 if (nopeel_inside_cost <= best_peel.inside_cost) 2027 do_peeling = false; 2028 } 2029 2030 if (do_peeling) 2031 { 2032 stmt_vec_info stmt_info = dr0_info->stmt; 2033 vectype = STMT_VINFO_VECTYPE (stmt_info); 2034 2035 if (known_alignment_for_access_p (dr0_info)) 2036 { 2037 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr), 2038 size_zero_node) < 0; 2039 if (!npeel) 2040 { 2041 /* Since it's known at compile time, compute the number of 2042 iterations in the peeled loop (the peeling factor) for use in 2043 updating DR_MISALIGNMENT values. The peeling factor is the 2044 vectorization factor minus the misalignment as an element 2045 count. */ 2046 mis = (negative 2047 ? DR_MISALIGNMENT (dr0_info) 2048 : -DR_MISALIGNMENT (dr0_info)); 2049 /* If known_alignment_for_access_p then we have set 2050 DR_MISALIGNMENT which is only done if we know it at compiler 2051 time, so it is safe to assume target alignment is constant. 2052 */ 2053 unsigned int target_align = 2054 DR_TARGET_ALIGNMENT (dr0_info).to_constant (); 2055 npeel = ((mis & (target_align - 1)) 2056 / vect_get_scalar_dr_size (dr0_info)); 2057 } 2058 2059 /* For interleaved data access every iteration accesses all the 2060 members of the group, therefore we divide the number of iterations 2061 by the group size. */ 2062 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2063 npeel /= DR_GROUP_SIZE (stmt_info); 2064 2065 if (dump_enabled_p ()) 2066 dump_printf_loc (MSG_NOTE, vect_location, 2067 "Try peeling by %d\n", npeel); 2068 } 2069 2070 /* Ensure that all datarefs can be vectorized after the peel. */ 2071 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel)) 2072 do_peeling = false; 2073 2074 /* Check if all datarefs are supportable and log. */ 2075 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0) 2076 { 2077 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2078 if (!stat) 2079 do_peeling = false; 2080 else 2081 return stat; 2082 } 2083 2084 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. 
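	 A hypothetical example: with --param vect-max-peeling-for-alignment=2,
	 an unknown peeling amount (NPEEL == 0), a 16-byte target alignment and
	 4-byte elements, the worst case is MAX_PEEL = 16/4 - 1 = 3 > 2, so
	 peeling is disabled; the cheap cost model forces the limit to 0, which
	 normally disables peeling here altogether.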
*/ 2085 if (do_peeling) 2086 { 2087 unsigned max_allowed_peel 2088 = param_vect_max_peeling_for_alignment; 2089 if (flag_vect_cost_model == VECT_COST_MODEL_CHEAP) 2090 max_allowed_peel = 0; 2091 if (max_allowed_peel != (unsigned)-1) 2092 { 2093 unsigned max_peel = npeel; 2094 if (max_peel == 0) 2095 { 2096 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info); 2097 unsigned HOST_WIDE_INT target_align_c; 2098 if (target_align.is_constant (&target_align_c)) 2099 max_peel = 2100 target_align_c / vect_get_scalar_dr_size (dr0_info) - 1; 2101 else 2102 { 2103 do_peeling = false; 2104 if (dump_enabled_p ()) 2105 dump_printf_loc (MSG_NOTE, vect_location, 2106 "Disable peeling, max peels set and vector" 2107 " alignment unknown\n"); 2108 } 2109 } 2110 if (max_peel > max_allowed_peel) 2111 { 2112 do_peeling = false; 2113 if (dump_enabled_p ()) 2114 dump_printf_loc (MSG_NOTE, vect_location, 2115 "Disable peeling, max peels reached: %d\n", max_peel); 2116 } 2117 } 2118 } 2119 2120 /* Cost model #2 - if peeling may result in a remaining loop not 2121 iterating enough to be vectorized then do not peel. Since this 2122 is a cost heuristic rather than a correctness decision, use the 2123 most likely runtime value for variable vectorization factors. */ 2124 if (do_peeling 2125 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) 2126 { 2127 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo); 2128 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel; 2129 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo) 2130 < assumed_vf + max_peel) 2131 do_peeling = false; 2132 } 2133 2134 if (do_peeling) 2135 { 2136 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i. 2137 If the misalignment of DR_i is identical to that of dr0 then set 2138 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and 2139 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i) 2140 by the peeling factor times the element size of DR_i (MOD the 2141 vectorization factor times the size). Otherwise, the 2142 misalignment of DR_i must be set to unknown. */ 2143 FOR_EACH_VEC_ELT (datarefs, i, dr) 2144 if (dr != dr0_info->dr) 2145 { 2146 /* Strided accesses perform only component accesses, alignment 2147 is irrelevant for them. */ 2148 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 2149 stmt_info = dr_info->stmt; 2150 if (STMT_VINFO_STRIDED_P (stmt_info) 2151 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2152 continue; 2153 2154 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel); 2155 } 2156 2157 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info; 2158 if (npeel) 2159 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel; 2160 else 2161 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) 2162 = DR_MISALIGNMENT (dr0_info); 2163 SET_DR_MISALIGNMENT (dr0_info, 0); 2164 if (dump_enabled_p ()) 2165 { 2166 dump_printf_loc (MSG_NOTE, vect_location, 2167 "Alignment of access forced using peeling.\n"); 2168 dump_printf_loc (MSG_NOTE, vect_location, 2169 "Peeling for alignment will be applied.\n"); 2170 } 2171 2172 /* The inside-loop cost will be accounted for in vectorizable_load 2173 and vectorizable_store correctly with adjusted alignments. 2174 Drop the body_cst_vec on the floor here. */ 2175 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2176 gcc_assert (stat); 2177 return stat; 2178 } 2179 } 2180 2181 /* (2) Versioning to force alignment. 
*/ 2182 2183 /* Try versioning if: 2184 1) optimize loop for speed and the cost-model is not cheap 2185 2) there is at least one unsupported misaligned data ref with an unknown 2186 misalignment, and 2187 3) all misaligned data refs with a known misalignment are supported, and 2188 4) the number of runtime alignment checks is within reason. */ 2189 2190 do_versioning 2191 = (optimize_loop_nest_for_speed_p (loop) 2192 && !loop->inner /* FORNOW */ 2193 && flag_vect_cost_model != VECT_COST_MODEL_CHEAP); 2194 2195 if (do_versioning) 2196 { 2197 FOR_EACH_VEC_ELT (datarefs, i, dr) 2198 { 2199 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 2200 stmt_vec_info stmt_info = dr_info->stmt; 2201 2202 /* For interleaving, only the alignment of the first access 2203 matters. */ 2204 if (aligned_access_p (dr_info) 2205 || (STMT_VINFO_GROUPED_ACCESS (stmt_info) 2206 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)) 2207 continue; 2208 2209 if (STMT_VINFO_STRIDED_P (stmt_info)) 2210 { 2211 /* Strided loads perform only component accesses, alignment is 2212 irrelevant for them. */ 2213 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info)) 2214 continue; 2215 do_versioning = false; 2216 break; 2217 } 2218 2219 supportable_dr_alignment 2220 = vect_supportable_dr_alignment (dr_info, false); 2221 2222 if (!supportable_dr_alignment) 2223 { 2224 int mask; 2225 tree vectype; 2226 2227 if (known_alignment_for_access_p (dr_info) 2228 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length () 2229 >= (unsigned) param_vect_max_version_for_alignment_checks) 2230 { 2231 do_versioning = false; 2232 break; 2233 } 2234 2235 vectype = STMT_VINFO_VECTYPE (stmt_info); 2236 gcc_assert (vectype); 2237 2238 /* At present we don't support versioning for alignment 2239 with variable VF, since there's no guarantee that the 2240 VF is a power of two. We could relax this if we added 2241 a way of enforcing a power-of-two size. */ 2242 unsigned HOST_WIDE_INT size; 2243 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size)) 2244 { 2245 do_versioning = false; 2246 break; 2247 } 2248 2249 /* Forcing alignment in the first iteration is no good if 2250 we don't keep it across iterations. For now, just disable 2251 versioning in this case. 2252 ?? We could actually unroll the loop to achieve the required 2253 overall step alignment, and forcing the alignment could be 2254 done by doing some iterations of the non-vectorized loop. */ 2255 if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 2256 * DR_STEP_ALIGNMENT (dr), 2257 DR_TARGET_ALIGNMENT (dr_info))) 2258 { 2259 do_versioning = false; 2260 break; 2261 } 2262 2263 /* The rightmost bits of an aligned address must be zeros. 2264 Construct the mask needed for this test. For example, 2265 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the 2266 mask must be 15 = 0xf. */ 2267 mask = size - 1; 2268 2269 /* FORNOW: use the same mask to test all potentially unaligned 2270 references in the loop. */ 2271 if (LOOP_VINFO_PTR_MASK (loop_vinfo) 2272 && LOOP_VINFO_PTR_MASK (loop_vinfo) != mask) 2273 { 2274 do_versioning = false; 2275 break; 2276 } 2277 2278 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask; 2279 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info); 2280 } 2281 } 2282 2283 /* Versioning requires at least one misaligned data reference. 
*/ 2284 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) 2285 do_versioning = false; 2286 else if (!do_versioning) 2287 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0); 2288 } 2289 2290 if (do_versioning) 2291 { 2292 vec<stmt_vec_info> may_misalign_stmts 2293 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 2294 stmt_vec_info stmt_info; 2295 2296 /* It can now be assumed that the data references in the statements 2297 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version 2298 of the loop being vectorized. */ 2299 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) 2300 { 2301 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 2302 SET_DR_MISALIGNMENT (dr_info, 0); 2303 if (dump_enabled_p ()) 2304 dump_printf_loc (MSG_NOTE, vect_location, 2305 "Alignment of access forced using versioning.\n"); 2306 } 2307 2308 if (dump_enabled_p ()) 2309 dump_printf_loc (MSG_NOTE, vect_location, 2310 "Versioning for alignment will be applied.\n"); 2311 2312 /* Peeling and versioning can't be done together at this time. */ 2313 gcc_assert (! (do_peeling && do_versioning)); 2314 2315 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2316 gcc_assert (stat); 2317 return stat; 2318 } 2319 2320 /* This point is reached if neither peeling nor versioning is being done. */ 2321 gcc_assert (! (do_peeling || do_versioning)); 2322 2323 opt_result stat = vect_verify_datarefs_alignment (loop_vinfo); 2324 return stat; 2325} 2326 2327 2328/* Function vect_find_same_alignment_drs. 2329 2330 Update group and alignment relations in VINFO according to the chosen 2331 vectorization factor. */ 2332 2333static void 2334vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr) 2335{ 2336 struct data_reference *dra = DDR_A (ddr); 2337 struct data_reference *drb = DDR_B (ddr); 2338 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 2339 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 2340 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 2341 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 2342 2343 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 2344 return; 2345 2346 if (dra == drb) 2347 return; 2348 2349 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a) 2350 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2351 return; 2352 2353 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0) 2354 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0) 2355 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0)) 2356 return; 2357 2358 /* Two references with distance zero have the same alignment. */ 2359 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra)) 2360 - wi::to_poly_offset (DR_INIT (drb))); 2361 if (maybe_ne (diff, 0)) 2362 { 2363 /* Get the wider of the two alignments. */ 2364 poly_uint64 align_a = 2365 exact_div (vect_calculate_target_alignment (dr_info_a), 2366 BITS_PER_UNIT); 2367 poly_uint64 align_b = 2368 exact_div (vect_calculate_target_alignment (dr_info_b), 2369 BITS_PER_UNIT); 2370 unsigned HOST_WIDE_INT align_a_c, align_b_c; 2371 if (!align_a.is_constant (&align_a_c) 2372 || !align_b.is_constant (&align_b_c)) 2373 return; 2374 2375 unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c); 2376 2377 /* Require the gap to be a multiple of the larger vector alignment. 
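     For instance, assuming 4-byte elements and a 16-byte target alignment
     for both references: accesses to a[i] and a[i+4] are 16 bytes apart,
     a multiple of the alignment, so they are recorded below as having the
     same alignment; a distance of 8 bytes would not qualify.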
*/ 2378 if (!multiple_p (diff, max_align)) 2379 return; 2380 } 2381 2382 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb); 2383 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra); 2384 if (dump_enabled_p ()) 2385 dump_printf_loc (MSG_NOTE, vect_location, 2386 "accesses have the same alignment: %T and %T\n", 2387 DR_REF (dra), DR_REF (drb)); 2388} 2389 2390 2391/* Function vect_analyze_data_refs_alignment 2392 2393 Analyze the alignment of the data-references in the loop. 2394 Return FALSE if a data reference is found that cannot be vectorized. */ 2395 2396opt_result 2397vect_analyze_data_refs_alignment (loop_vec_info vinfo) 2398{ 2399 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment"); 2400 2401 /* Mark groups of data references with same alignment using 2402 data dependence information. */ 2403 vec<ddr_p> ddrs = vinfo->shared->ddrs; 2404 struct data_dependence_relation *ddr; 2405 unsigned int i; 2406 2407 FOR_EACH_VEC_ELT (ddrs, i, ddr) 2408 vect_find_same_alignment_drs (vinfo, ddr); 2409 2410 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 2411 struct data_reference *dr; 2412 2413 vect_record_base_alignments (vinfo); 2414 FOR_EACH_VEC_ELT (datarefs, i, dr) 2415 { 2416 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 2417 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)) 2418 vect_compute_data_ref_alignment (dr_info); 2419 } 2420 2421 return opt_result::success (); 2422} 2423 2424 2425/* Analyze alignment of DRs of stmts in NODE. */ 2426 2427static bool 2428vect_slp_analyze_and_verify_node_alignment (slp_tree node) 2429{ 2430 /* We vectorize from the first scalar stmt in the node unless 2431 the node is permuted in which case we start from the first 2432 element in the group. */ 2433 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; 2434 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info); 2435 if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) 2436 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); 2437 2438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info); 2439 vect_compute_data_ref_alignment (dr_info); 2440 /* For creating the data-ref pointer we need alignment of the 2441 first element anyway. */ 2442 if (dr_info != first_dr_info) 2443 vect_compute_data_ref_alignment (first_dr_info); 2444 if (! verify_data_ref_alignment (dr_info)) 2445 { 2446 if (dump_enabled_p ()) 2447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2448 "not vectorized: bad data alignment in basic " 2449 "block.\n"); 2450 return false; 2451 } 2452 2453 return true; 2454} 2455 2456/* Function vect_slp_analyze_instance_alignment 2457 2458 Analyze the alignment of the data-references in the SLP instance. 2459 Return FALSE if a data reference is found that cannot be vectorized. */ 2460 2461bool 2462vect_slp_analyze_and_verify_instance_alignment (slp_instance instance) 2463{ 2464 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment"); 2465 2466 slp_tree node; 2467 unsigned i; 2468 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node) 2469 if (! vect_slp_analyze_and_verify_node_alignment (node)) 2470 return false; 2471 2472 node = SLP_INSTANCE_TREE (instance); 2473 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0]) 2474 && ! vect_slp_analyze_and_verify_node_alignment 2475 (SLP_INSTANCE_TREE (instance))) 2476 return false; 2477 2478 return true; 2479} 2480 2481 2482/* Analyze groups of accesses: check that DR_INFO belongs to a group of 2483 accesses of legal size, step, etc. 
Detect gaps, single element 2484 interleaving, and other special cases. Set grouped access info. 2485 Collect groups of strided stores for further use in SLP analysis. 2486 Worker for vect_analyze_group_access. */ 2487 2488static bool 2489vect_analyze_group_access_1 (dr_vec_info *dr_info) 2490{ 2491 data_reference *dr = dr_info->dr; 2492 tree step = DR_STEP (dr); 2493 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2494 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type)); 2495 stmt_vec_info stmt_info = dr_info->stmt; 2496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2497 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 2498 HOST_WIDE_INT dr_step = -1; 2499 HOST_WIDE_INT groupsize, last_accessed_element = 1; 2500 bool slp_impossible = false; 2501 2502 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the 2503 size of the interleaving group (including gaps). */ 2504 if (tree_fits_shwi_p (step)) 2505 { 2506 dr_step = tree_to_shwi (step); 2507 /* Check that STEP is a multiple of type size. Otherwise there is 2508 a non-element-sized gap at the end of the group which we 2509 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE. 2510 ??? As we can handle non-constant step fine here we should 2511 simply remove uses of DR_GROUP_GAP between the last and first 2512 element and instead rely on DR_STEP. DR_GROUP_SIZE then would 2513 simply not include that gap. */ 2514 if ((dr_step % type_size) != 0) 2515 { 2516 if (dump_enabled_p ()) 2517 dump_printf_loc (MSG_NOTE, vect_location, 2518 "Step %T is not a multiple of the element size" 2519 " for %T\n", 2520 step, DR_REF (dr)); 2521 return false; 2522 } 2523 groupsize = absu_hwi (dr_step) / type_size; 2524 } 2525 else 2526 groupsize = 0; 2527 2528 /* Not consecutive access is possible only if it is a part of interleaving. */ 2529 if (!DR_GROUP_FIRST_ELEMENT (stmt_info)) 2530 { 2531 /* Check if it this DR is a part of interleaving, and is a single 2532 element of the group that is accessed in the loop. */ 2533 2534 /* Gaps are supported only for loads. STEP must be a multiple of the type 2535 size. */ 2536 if (DR_IS_READ (dr) 2537 && (dr_step % type_size) == 0 2538 && groupsize > 0) 2539 { 2540 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info; 2541 DR_GROUP_SIZE (stmt_info) = groupsize; 2542 DR_GROUP_GAP (stmt_info) = groupsize - 1; 2543 if (dump_enabled_p ()) 2544 dump_printf_loc (MSG_NOTE, vect_location, 2545 "Detected single element interleaving %T" 2546 " step %T\n", 2547 DR_REF (dr), step); 2548 2549 return true; 2550 } 2551 2552 if (dump_enabled_p ()) 2553 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2554 "not consecutive access %G", stmt_info->stmt); 2555 2556 if (bb_vinfo) 2557 { 2558 /* Mark the statement as unvectorizable. */ 2559 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 2560 return true; 2561 } 2562 2563 if (dump_enabled_p ()) 2564 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n"); 2565 STMT_VINFO_STRIDED_P (stmt_info) = true; 2566 return true; 2567 } 2568 2569 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info) 2570 { 2571 /* First stmt in the interleaving chain. Check the chain. */ 2572 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2573 struct data_reference *data_ref = dr; 2574 unsigned int count = 1; 2575 tree prev_init = DR_INIT (data_ref); 2576 HOST_WIDE_INT diff, gaps = 0; 2577 2578 /* By construction, all group members have INTEGER_CST DR_INITs. */ 2579 while (next) 2580 { 2581 /* We never have the same DR multiple times. 
*/ 2582 gcc_assert (tree_int_cst_compare (DR_INIT (data_ref), 2583 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0); 2584 2585 data_ref = STMT_VINFO_DATA_REF (next); 2586 2587 /* All group members have the same STEP by construction. */ 2588 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0)); 2589 2590 /* Check that the distance between two accesses is equal to the type 2591 size. Otherwise, we have gaps. */ 2592 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref)) 2593 - TREE_INT_CST_LOW (prev_init)) / type_size; 2594 if (diff != 1) 2595 { 2596 /* FORNOW: SLP of accesses with gaps is not supported. */ 2597 slp_impossible = true; 2598 if (DR_IS_WRITE (data_ref)) 2599 { 2600 if (dump_enabled_p ()) 2601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2602 "interleaved store with gaps\n"); 2603 return false; 2604 } 2605 2606 gaps += diff - 1; 2607 } 2608 2609 last_accessed_element += diff; 2610 2611 /* Store the gap from the previous member of the group. If there is no 2612 gap in the access, DR_GROUP_GAP is always 1. */ 2613 DR_GROUP_GAP (next) = diff; 2614 2615 prev_init = DR_INIT (data_ref); 2616 next = DR_GROUP_NEXT_ELEMENT (next); 2617 /* Count the number of data-refs in the chain. */ 2618 count++; 2619 } 2620 2621 if (groupsize == 0) 2622 groupsize = count + gaps; 2623 2624 /* This could be UINT_MAX but as we are generating code in a very 2625 inefficient way we have to cap earlier. See PR78699 for example. */ 2626 if (groupsize > 4096) 2627 { 2628 if (dump_enabled_p ()) 2629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2630 "group is too large\n"); 2631 return false; 2632 } 2633 2634 /* Check that the size of the interleaving is equal to count for stores, 2635 i.e., that there are no gaps. */ 2636 if (groupsize != count 2637 && !DR_IS_READ (dr)) 2638 { 2639 groupsize = count; 2640 STMT_VINFO_STRIDED_P (stmt_info) = true; 2641 } 2642 2643 /* If there is a gap after the last load in the group it is the 2644 difference between the groupsize and the last accessed 2645 element. 2646 When there is no gap, this difference should be 0. */ 2647 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element; 2648 2649 DR_GROUP_SIZE (stmt_info) = groupsize; 2650 if (dump_enabled_p ()) 2651 { 2652 dump_printf_loc (MSG_NOTE, vect_location, 2653 "Detected interleaving "); 2654 if (DR_IS_READ (dr)) 2655 dump_printf (MSG_NOTE, "load "); 2656 else if (STMT_VINFO_STRIDED_P (stmt_info)) 2657 dump_printf (MSG_NOTE, "strided store "); 2658 else 2659 dump_printf (MSG_NOTE, "store "); 2660 dump_printf (MSG_NOTE, "of size %u\n", 2661 (unsigned)groupsize); 2662 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt); 2663 next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2664 while (next) 2665 { 2666 if (DR_GROUP_GAP (next) != 1) 2667 dump_printf_loc (MSG_NOTE, vect_location, 2668 "\t<gap of %d elements>\n", 2669 DR_GROUP_GAP (next) - 1); 2670 dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt); 2671 next = DR_GROUP_NEXT_ELEMENT (next); 2672 } 2673 if (DR_GROUP_GAP (stmt_info) != 0) 2674 dump_printf_loc (MSG_NOTE, vect_location, 2675 "\t<gap of %d elements>\n", 2676 DR_GROUP_GAP (stmt_info)); 2677 } 2678 2679 /* SLP: create an SLP data structure for every interleaving group of 2680 stores for further analysis in vect_analyse_slp. 
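     E.g. (illustrative): the interleaved stores p[2*i] = x; p[2*i+1] = y;
     form a group of size 2 whose first statement is pushed here and later
     seeds an SLP instance.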
*/ 2681 if (DR_IS_WRITE (dr) && !slp_impossible) 2682 { 2683 if (loop_vinfo) 2684 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info); 2685 if (bb_vinfo) 2686 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info); 2687 } 2688 } 2689 2690 return true; 2691} 2692 2693/* Analyze groups of accesses: check that DR_INFO belongs to a group of 2694 accesses of legal size, step, etc. Detect gaps, single element 2695 interleaving, and other special cases. Set grouped access info. 2696 Collect groups of strided stores for further use in SLP analysis. */ 2697 2698static bool 2699vect_analyze_group_access (dr_vec_info *dr_info) 2700{ 2701 if (!vect_analyze_group_access_1 (dr_info)) 2702 { 2703 /* Dissolve the group if present. */ 2704 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt); 2705 while (stmt_info) 2706 { 2707 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info); 2708 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2709 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL; 2710 stmt_info = next; 2711 } 2712 return false; 2713 } 2714 return true; 2715} 2716 2717/* Analyze the access pattern of the data-reference DR_INFO. 2718 In case of non-consecutive accesses call vect_analyze_group_access() to 2719 analyze groups of accesses. */ 2720 2721static bool 2722vect_analyze_data_ref_access (dr_vec_info *dr_info) 2723{ 2724 data_reference *dr = dr_info->dr; 2725 tree step = DR_STEP (dr); 2726 tree scalar_type = TREE_TYPE (DR_REF (dr)); 2727 stmt_vec_info stmt_info = dr_info->stmt; 2728 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 2729 class loop *loop = NULL; 2730 2731 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) 2732 return true; 2733 2734 if (loop_vinfo) 2735 loop = LOOP_VINFO_LOOP (loop_vinfo); 2736 2737 if (loop_vinfo && !step) 2738 { 2739 if (dump_enabled_p ()) 2740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 2741 "bad data-ref access in loop\n"); 2742 return false; 2743 } 2744 2745 /* Allow loads with zero step in inner-loop vectorization. */ 2746 if (loop_vinfo && integer_zerop (step)) 2747 { 2748 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2749 if (!nested_in_vect_loop_p (loop, stmt_info)) 2750 return DR_IS_READ (dr); 2751 /* Allow references with zero step for outer loops marked 2752 with pragma omp simd only - it guarantees absence of 2753 loop-carried dependencies between inner loop iterations. */ 2754 if (loop->safelen < 2) 2755 { 2756 if (dump_enabled_p ()) 2757 dump_printf_loc (MSG_NOTE, vect_location, 2758 "zero step in inner loop of nest\n"); 2759 return false; 2760 } 2761 } 2762 2763 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 2764 { 2765 /* Interleaved accesses are not yet supported within outer-loop 2766 vectorization for references in the inner-loop. */ 2767 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2768 2769 /* For the rest of the analysis we use the outer-loop step. */ 2770 step = STMT_VINFO_DR_STEP (stmt_info); 2771 if (integer_zerop (step)) 2772 { 2773 if (dump_enabled_p ()) 2774 dump_printf_loc (MSG_NOTE, vect_location, 2775 "zero step in outer loop.\n"); 2776 return DR_IS_READ (dr); 2777 } 2778 } 2779 2780 /* Consecutive? */ 2781 if (TREE_CODE (step) == INTEGER_CST) 2782 { 2783 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step); 2784 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type)) 2785 || (dr_step < 0 2786 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step))) 2787 { 2788 /* Mark that it is not interleaving. 
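	 E.g. (illustrative): a 4-byte int access whose DR_STEP is 4, or -4
	 in a loop that walks the array backwards, is simply consecutive and
	 needs no group bookkeeping.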
*/ 2789 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL; 2790 return true; 2791 } 2792 } 2793 2794 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 2795 { 2796 if (dump_enabled_p ()) 2797 dump_printf_loc (MSG_NOTE, vect_location, 2798 "grouped access in outer loop.\n"); 2799 return false; 2800 } 2801 2802 2803 /* Assume this is a DR handled by non-constant strided load case. */ 2804 if (TREE_CODE (step) != INTEGER_CST) 2805 return (STMT_VINFO_STRIDED_P (stmt_info) 2806 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info) 2807 || vect_analyze_group_access (dr_info))); 2808 2809 /* Not consecutive access - check if it's a part of interleaving group. */ 2810 return vect_analyze_group_access (dr_info); 2811} 2812 2813/* Compare two data-references DRA and DRB to group them into chunks 2814 suitable for grouping. */ 2815 2816static int 2817dr_group_sort_cmp (const void *dra_, const void *drb_) 2818{ 2819 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_); 2820 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_); 2821 int cmp; 2822 2823 /* Stabilize sort. */ 2824 if (dra == drb) 2825 return 0; 2826 2827 /* DRs in different loops never belong to the same group. */ 2828 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father; 2829 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father; 2830 if (loopa != loopb) 2831 return loopa->num < loopb->num ? -1 : 1; 2832 2833 /* Ordering of DRs according to base. */ 2834 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2835 DR_BASE_ADDRESS (drb)); 2836 if (cmp != 0) 2837 return cmp; 2838 2839 /* And according to DR_OFFSET. */ 2840 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)); 2841 if (cmp != 0) 2842 return cmp; 2843 2844 /* Put reads before writes. */ 2845 if (DR_IS_READ (dra) != DR_IS_READ (drb)) 2846 return DR_IS_READ (dra) ? -1 : 1; 2847 2848 /* Then sort after access size. */ 2849 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 2850 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)))); 2851 if (cmp != 0) 2852 return cmp; 2853 2854 /* And after step. */ 2855 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)); 2856 if (cmp != 0) 2857 return cmp; 2858 2859 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */ 2860 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)); 2861 if (cmp == 0) 2862 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1; 2863 return cmp; 2864} 2865 2866/* If OP is the result of a conversion, return the unconverted value, 2867 otherwise return null. */ 2868 2869static tree 2870strip_conversion (tree op) 2871{ 2872 if (TREE_CODE (op) != SSA_NAME) 2873 return NULL_TREE; 2874 gimple *stmt = SSA_NAME_DEF_STMT (op); 2875 if (!is_gimple_assign (stmt) 2876 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt))) 2877 return NULL_TREE; 2878 return gimple_assign_rhs1 (stmt); 2879} 2880 2881/* Return true if vectorizable_* routines can handle statements STMT1_INFO 2882 and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can 2883 be grouped in SLP mode. */ 2884 2885static bool 2886can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, 2887 bool allow_slp_p) 2888{ 2889 if (gimple_assign_single_p (stmt1_info->stmt)) 2890 return gimple_assign_single_p (stmt2_info->stmt); 2891 2892 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt); 2893 if (call1 && gimple_call_internal_p (call1)) 2894 { 2895 /* Check for two masked loads or two masked stores. 
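	 E.g. (illustrative gimple, SSA names made up):
	   _1 = .MASK_LOAD (p_10, 32B, mask_21);
	   _2 = .MASK_LOAD (p_11, 32B, mask_21);
	 use the same internal function and the same mask operand, so they
	 pass this check; pairing a .MASK_LOAD with a .MASK_STORE does not.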
*/ 2896 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt); 2897 if (!call2 || !gimple_call_internal_p (call2)) 2898 return false; 2899 internal_fn ifn = gimple_call_internal_fn (call1); 2900 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE) 2901 return false; 2902 if (ifn != gimple_call_internal_fn (call2)) 2903 return false; 2904 2905 /* Check that the masks are the same. Cope with casts of masks, 2906 like those created by build_mask_conversion. */ 2907 tree mask1 = gimple_call_arg (call1, 2); 2908 tree mask2 = gimple_call_arg (call2, 2); 2909 if (!operand_equal_p (mask1, mask2, 0) 2910 && (ifn == IFN_MASK_STORE || !allow_slp_p)) 2911 { 2912 mask1 = strip_conversion (mask1); 2913 if (!mask1) 2914 return false; 2915 mask2 = strip_conversion (mask2); 2916 if (!mask2) 2917 return false; 2918 if (!operand_equal_p (mask1, mask2, 0)) 2919 return false; 2920 } 2921 return true; 2922 } 2923 2924 return false; 2925} 2926 2927/* Function vect_analyze_data_ref_accesses. 2928 2929 Analyze the access pattern of all the data references in the loop. 2930 2931 FORNOW: the only access pattern that is considered vectorizable is a 2932 simple step 1 (consecutive) access. 2933 2934 FORNOW: handle only arrays and pointer accesses. */ 2935 2936opt_result 2937vect_analyze_data_ref_accesses (vec_info *vinfo) 2938{ 2939 unsigned int i; 2940 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 2941 struct data_reference *dr; 2942 2943 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses"); 2944 2945 if (datarefs.is_empty ()) 2946 return opt_result::success (); 2947 2948 /* Sort the array of datarefs to make building the interleaving chains 2949 linear. Don't modify the original vector's order, it is needed for 2950 determining what dependencies are reversed. */ 2951 vec<data_reference_p> datarefs_copy = datarefs.copy (); 2952 datarefs_copy.qsort (dr_group_sort_cmp); 2953 hash_set<stmt_vec_info> to_fixup; 2954 2955 /* Build the interleaving chains. */ 2956 for (i = 0; i < datarefs_copy.length () - 1;) 2957 { 2958 data_reference_p dra = datarefs_copy[i]; 2959 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra); 2960 stmt_vec_info stmtinfo_a = dr_info_a->stmt; 2961 stmt_vec_info lastinfo = NULL; 2962 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a) 2963 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)) 2964 { 2965 ++i; 2966 continue; 2967 } 2968 for (i = i + 1; i < datarefs_copy.length (); ++i) 2969 { 2970 data_reference_p drb = datarefs_copy[i]; 2971 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb); 2972 stmt_vec_info stmtinfo_b = dr_info_b->stmt; 2973 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b) 2974 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b)) 2975 break; 2976 2977 /* ??? Imperfect sorting (non-compatible types, non-modulo 2978 accesses, same accesses) can lead to a group to be artificially 2979 split here as we don't just skip over those. If it really 2980 matters we can push those to a worklist and re-iterate 2981 over them. The we can just skip ahead to the next DR here. */ 2982 2983 /* DRs in a different loop should not be put into the same 2984 interleaving group. */ 2985 if (gimple_bb (DR_STMT (dra))->loop_father 2986 != gimple_bb (DR_STMT (drb))->loop_father) 2987 break; 2988 2989 /* Check that the data-refs have same first location (except init) 2990 and they are both either store or load (not load and store, 2991 not masked loads or stores). 
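	     E.g. (illustrative): a[i].x and a[i].y share DR_BASE_ADDRESS and
	     DR_OFFSET and differ only in DR_INIT, so they are candidates for
	     one interleaving group; a load paired with a store, or references
	     with different bases, end the chain here.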
*/ 2992 if (DR_IS_READ (dra) != DR_IS_READ (drb) 2993 || data_ref_compare_tree (DR_BASE_ADDRESS (dra), 2994 DR_BASE_ADDRESS (drb)) != 0 2995 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 2996 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true)) 2997 break; 2998 2999 /* Check that the data-refs have the same constant size. */ 3000 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))); 3001 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))); 3002 if (!tree_fits_uhwi_p (sza) 3003 || !tree_fits_uhwi_p (szb) 3004 || !tree_int_cst_equal (sza, szb)) 3005 break; 3006 3007 /* Check that the data-refs have the same step. */ 3008 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0) 3009 break; 3010 3011 /* Check the types are compatible. 3012 ??? We don't distinguish this during sorting. */ 3013 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)), 3014 TREE_TYPE (DR_REF (drb)))) 3015 break; 3016 3017 /* Check that the DR_INITs are compile-time constants. */ 3018 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST 3019 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST) 3020 break; 3021 3022 /* Different .GOMP_SIMD_LANE calls still give the same lane, 3023 just hold extra information. */ 3024 if (STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_a) 3025 && STMT_VINFO_SIMD_LANE_ACCESS_P (stmtinfo_b) 3026 && data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb)) == 0) 3027 break; 3028 3029 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */ 3030 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra)); 3031 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb)); 3032 HOST_WIDE_INT init_prev 3033 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1])); 3034 gcc_assert (init_a <= init_b 3035 && init_a <= init_prev 3036 && init_prev <= init_b); 3037 3038 /* Do not place the same access in the interleaving chain twice. */ 3039 if (init_b == init_prev) 3040 { 3041 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1])) 3042 < gimple_uid (DR_STMT (drb))); 3043 /* Simply link in duplicates and fix up the chain below. */ 3044 } 3045 else 3046 { 3047 /* If init_b == init_a + the size of the type * k, we have an 3048 interleaving, and DRA is accessed before DRB. */ 3049 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza); 3050 if (type_size_a == 0 3051 || (init_b - init_a) % type_size_a != 0) 3052 break; 3053 3054 /* If we have a store, the accesses are adjacent. This splits 3055 groups into chunks we support (we don't support vectorization 3056 of stores with gaps). */ 3057 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a) 3058 break; 3059 3060 /* If the step (if not zero or non-constant) is greater than the 3061 difference between data-refs' inits this splits groups into 3062 suitable sizes. */ 3063 if (tree_fits_shwi_p (DR_STEP (dra))) 3064 { 3065 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra)); 3066 if (step != 0 && step <= (init_b - init_a)) 3067 break; 3068 } 3069 } 3070 3071 if (dump_enabled_p ()) 3072 dump_printf_loc (MSG_NOTE, vect_location, 3073 DR_IS_READ (dra) 3074 ? "Detected interleaving load %T and %T\n" 3075 : "Detected interleaving store %T and %T\n", 3076 DR_REF (dra), DR_REF (drb)); 3077 3078 /* Link the found element into the group list. 
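	     E.g. (illustrative): after grouping loads from a[3*i], a[3*i+1]
	     and a[3*i+2], DR_GROUP_FIRST_ELEMENT of all three statements
	     points at the a[3*i] statement and DR_GROUP_NEXT_ELEMENT links
	     them in ascending DR_INIT order.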
*/ 3079 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) 3080 { 3081 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a; 3082 lastinfo = stmtinfo_a; 3083 } 3084 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a; 3085 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; 3086 lastinfo = stmtinfo_b; 3087 3088 STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) 3089 = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false); 3090 3091 if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) 3092 dump_printf_loc (MSG_NOTE, vect_location, 3093 "Load suitable for SLP vectorization only.\n"); 3094 3095 if (init_b == init_prev 3096 && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) 3097 && dump_enabled_p ()) 3098 dump_printf_loc (MSG_NOTE, vect_location, 3099 "Queuing group with duplicate access for fixup\n"); 3100 } 3101 } 3102 3103 /* Fixup groups with duplicate entries by splitting it. */ 3104 while (1) 3105 { 3106 hash_set<stmt_vec_info>::iterator it = to_fixup.begin (); 3107 if (!(it != to_fixup.end ())) 3108 break; 3109 stmt_vec_info grp = *it; 3110 to_fixup.remove (grp); 3111 3112 /* Find the earliest duplicate group member. */ 3113 unsigned first_duplicate = -1u; 3114 stmt_vec_info next, g = grp; 3115 while ((next = DR_GROUP_NEXT_ELEMENT (g))) 3116 { 3117 if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr), 3118 DR_INIT (STMT_VINFO_DR_INFO (g)->dr)) 3119 && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate) 3120 first_duplicate = gimple_uid (STMT_VINFO_STMT (next)); 3121 g = next; 3122 } 3123 if (first_duplicate == -1U) 3124 continue; 3125 3126 /* Then move all stmts after the first duplicate to a new group. 3127 Note this is a heuristic but one with the property that *it 3128 is fixed up completely. */ 3129 g = grp; 3130 stmt_vec_info newgroup = NULL, ng = grp; 3131 while ((next = DR_GROUP_NEXT_ELEMENT (g))) 3132 { 3133 if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate) 3134 { 3135 DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next); 3136 if (!newgroup) 3137 newgroup = next; 3138 else 3139 DR_GROUP_NEXT_ELEMENT (ng) = next; 3140 ng = next; 3141 DR_GROUP_FIRST_ELEMENT (ng) = newgroup; 3142 } 3143 else 3144 g = DR_GROUP_NEXT_ELEMENT (g); 3145 } 3146 DR_GROUP_NEXT_ELEMENT (ng) = NULL; 3147 3148 /* Fixup the new group which still may contain duplicates. */ 3149 to_fixup.add (newgroup); 3150 } 3151 3152 FOR_EACH_VEC_ELT (datarefs_copy, i, dr) 3153 { 3154 dr_vec_info *dr_info = vinfo->lookup_dr (dr); 3155 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt) 3156 && !vect_analyze_data_ref_access (dr_info)) 3157 { 3158 if (dump_enabled_p ()) 3159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 3160 "not vectorized: complicated access pattern.\n"); 3161 3162 if (is_a <bb_vec_info> (vinfo)) 3163 { 3164 /* Mark the statement as not vectorizable. */ 3165 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false; 3166 continue; 3167 } 3168 else 3169 { 3170 datarefs_copy.release (); 3171 return opt_result::failure_at (dr_info->stmt->stmt, 3172 "not vectorized:" 3173 " complicated access pattern.\n"); 3174 } 3175 } 3176 } 3177 3178 datarefs_copy.release (); 3179 return opt_result::success (); 3180} 3181 3182/* Function vect_vfa_segment_size. 3183 3184 Input: 3185 DR_INFO: The data reference. 3186 LENGTH_FACTOR: segment length to consider. 3187 3188 Return a value suitable for the dr_with_seg_len::seg_len field. 3189 This is the "distance travelled" by the pointer from the first 3190 iteration in the segment to the last. 
Note that it does not include 3191 the size of the access; in effect it only describes the first byte. */ 3192 3193static tree 3194vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor) 3195{ 3196 length_factor = size_binop (MINUS_EXPR, 3197 fold_convert (sizetype, length_factor), 3198 size_one_node); 3199 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)), 3200 length_factor); 3201} 3202 3203/* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)), 3204 gives the worst-case number of bytes covered by the segment. */ 3205 3206static unsigned HOST_WIDE_INT 3207vect_vfa_access_size (dr_vec_info *dr_info) 3208{ 3209 stmt_vec_info stmt_vinfo = dr_info->stmt; 3210 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr)); 3211 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type)); 3212 unsigned HOST_WIDE_INT access_size = ref_size; 3213 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo)) 3214 { 3215 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo); 3216 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo); 3217 } 3218 if (STMT_VINFO_VEC_STMT (stmt_vinfo) 3219 && (vect_supportable_dr_alignment (dr_info, false) 3220 == dr_explicit_realign_optimized)) 3221 { 3222 /* We might access a full vector's worth. */ 3223 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); 3224 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size; 3225 } 3226 return access_size; 3227} 3228 3229/* Get the minimum alignment for all the scalar accesses that DR_INFO 3230 describes. */ 3231 3232static unsigned int 3233vect_vfa_align (dr_vec_info *dr_info) 3234{ 3235 return dr_alignment (dr_info->dr); 3236} 3237 3238/* Function vect_no_alias_p. 3239 3240 Given data references A and B with equal base and offset, see whether 3241 the alias relation can be decided at compilation time. Return 1 if 3242 it can and the references alias, 0 if it can and the references do 3243 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A, 3244 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent 3245 of dr_with_seg_len::{seg_len,access_size} for A and B. 
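   An illustrative example: with DR_INITs 0 and 8, access sizes of 4 bytes
   and segment lengths of 16 bytes for both references, the byte ranges
   [0, 20) and [8, 28) are known to overlap, so the result is 1; had the
   second reference started at byte 32, the ranges could not overlap and
   the result would be 0.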
*/ 3246 3247static int 3248vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b, 3249 tree segment_length_a, tree segment_length_b, 3250 unsigned HOST_WIDE_INT access_size_a, 3251 unsigned HOST_WIDE_INT access_size_b) 3252{ 3253 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr)); 3254 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr)); 3255 poly_uint64 const_length_a; 3256 poly_uint64 const_length_b; 3257 3258 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT 3259 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of 3260 [a, a+12) */ 3261 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0) 3262 { 3263 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi (); 3264 offset_a -= const_length_a; 3265 } 3266 else 3267 const_length_a = tree_to_poly_uint64 (segment_length_a); 3268 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0) 3269 { 3270 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi (); 3271 offset_b -= const_length_b; 3272 } 3273 else 3274 const_length_b = tree_to_poly_uint64 (segment_length_b); 3275 3276 const_length_a += access_size_a; 3277 const_length_b += access_size_b; 3278 3279 if (ranges_known_overlap_p (offset_a, const_length_a, 3280 offset_b, const_length_b)) 3281 return 1; 3282 3283 if (!ranges_maybe_overlap_p (offset_a, const_length_a, 3284 offset_b, const_length_b)) 3285 return 0; 3286 3287 return -1; 3288} 3289 3290/* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH 3291 in DDR is >= VF. */ 3292 3293static bool 3294dependence_distance_ge_vf (data_dependence_relation *ddr, 3295 unsigned int loop_depth, poly_uint64 vf) 3296{ 3297 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE 3298 || DDR_NUM_DIST_VECTS (ddr) == 0) 3299 return false; 3300 3301 /* If the dependence is exact, we should have limited the VF instead. */ 3302 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr)); 3303 3304 unsigned int i; 3305 lambda_vector dist_v; 3306 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 3307 { 3308 HOST_WIDE_INT dist = dist_v[loop_depth]; 3309 if (dist != 0 3310 && !(dist > 0 && DDR_REVERSED_P (ddr)) 3311 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf)) 3312 return false; 3313 } 3314 3315 if (dump_enabled_p ()) 3316 dump_printf_loc (MSG_NOTE, vect_location, 3317 "dependence distance between %T and %T is >= VF\n", 3318 DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr))); 3319 3320 return true; 3321} 3322 3323/* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */ 3324 3325static void 3326dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound) 3327{ 3328 dump_printf (dump_kind, "%s (%T) >= ", 3329 lower_bound.unsigned_p ? "unsigned" : "abs", 3330 lower_bound.expr); 3331 dump_dec (dump_kind, lower_bound.min_value); 3332} 3333 3334/* Record that the vectorized loop requires the vec_lower_bound described 3335 by EXPR, UNSIGNED_P and MIN_VALUE. 
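   Illustrative merging behaviour: if "abs (EXPR) >= 4" is already recorded
   and a new requirement "unsigned EXPR >= 8" arrives for the same
   expression, the entry is tightened to the larger bound 8, and the
   unsigned interpretation is kept only if both requests allow it.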
*/ 3336 3337static void 3338vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p, 3339 poly_uint64 min_value) 3340{ 3341 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3342 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3343 if (operand_equal_p (lower_bounds[i].expr, expr, 0)) 3344 { 3345 unsigned_p &= lower_bounds[i].unsigned_p; 3346 min_value = upper_bound (lower_bounds[i].min_value, min_value); 3347 if (lower_bounds[i].unsigned_p != unsigned_p 3348 || maybe_lt (lower_bounds[i].min_value, min_value)) 3349 { 3350 lower_bounds[i].unsigned_p = unsigned_p; 3351 lower_bounds[i].min_value = min_value; 3352 if (dump_enabled_p ()) 3353 { 3354 dump_printf_loc (MSG_NOTE, vect_location, 3355 "updating run-time check to "); 3356 dump_lower_bound (MSG_NOTE, lower_bounds[i]); 3357 dump_printf (MSG_NOTE, "\n"); 3358 } 3359 } 3360 return; 3361 } 3362 3363 vec_lower_bound lower_bound (expr, unsigned_p, min_value); 3364 if (dump_enabled_p ()) 3365 { 3366 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that "); 3367 dump_lower_bound (MSG_NOTE, lower_bound); 3368 dump_printf (MSG_NOTE, "\n"); 3369 } 3370 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound); 3371} 3372 3373/* Return true if it's unlikely that the step of the vectorized form of DR_INFO 3374 will span fewer than GAP bytes. */ 3375 3376static bool 3377vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info, 3378 poly_int64 gap) 3379{ 3380 stmt_vec_info stmt_info = dr_info->stmt; 3381 HOST_WIDE_INT count 3382 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 3383 if (DR_GROUP_FIRST_ELEMENT (stmt_info)) 3384 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info)); 3385 return (estimated_poly_value (gap) 3386 <= count * vect_get_scalar_dr_size (dr_info)); 3387} 3388 3389/* Return true if we know that there is no alias between DR_INFO_A and 3390 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N. 3391 When returning true, set *LOWER_BOUND_OUT to this N. */ 3392 3393static bool 3394vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b, 3395 poly_uint64 *lower_bound_out) 3396{ 3397 /* Check that there is a constant gap of known sign between DR_A 3398 and DR_B. */ 3399 data_reference *dr_a = dr_info_a->dr; 3400 data_reference *dr_b = dr_info_b->dr; 3401 poly_int64 init_a, init_b; 3402 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0) 3403 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0) 3404 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0) 3405 || !poly_int_tree_p (DR_INIT (dr_a), &init_a) 3406 || !poly_int_tree_p (DR_INIT (dr_b), &init_b) 3407 || !ordered_p (init_a, init_b)) 3408 return false; 3409 3410 /* Sort DR_A and DR_B by the address they access. */ 3411 if (maybe_lt (init_b, init_a)) 3412 { 3413 std::swap (init_a, init_b); 3414 std::swap (dr_info_a, dr_info_b); 3415 std::swap (dr_a, dr_b); 3416 } 3417 3418 /* If the two accesses could be dependent within a scalar iteration, 3419 make sure that we'd retain their order. */ 3420 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b) 3421 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b)) 3422 return false; 3423 3424 /* There is no alias if abs (DR_STEP) is greater than or equal to 3425 the bytes spanned by the combination of the two accesses. 
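     For example, for 4-byte accesses at byte offsets 0 and 12 from the same
     base, *LOWER_BOUND_OUT becomes 12 + 4 - 0 = 16: whenever
     abs (DR_STEP) >= 16, accesses from different iterations can never touch
     the same bytes.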
*/ 3426 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a; 3427 return true; 3428 } 3429 3430 /* Function vect_prune_runtime_alias_test_list. 3431 3432 Prune a list of ddrs to be tested at run-time by versioning for alias. 3433 Merge several alias checks into one if possible. 3434 Return FALSE if the resulting list of ddrs is longer than allowed by 3435 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */ 3436 3437 opt_result 3438 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) 3439 { 3440 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash; 3441 hash_set <tree_pair_hash> compared_objects; 3442 3443 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); 3444 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs 3445 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo); 3446 vec<vec_object_pair> &check_unequal_addrs 3447 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); 3448 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 3449 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); 3450 3451 ddr_p ddr; 3452 unsigned int i; 3453 tree length_factor; 3454 3455 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list"); 3456 3457 /* Step values are irrelevant for aliasing if the number of vector 3458 iterations is equal to the number of scalar iterations (which can 3459 happen for fully-SLP loops). */ 3460 bool vf_one_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U); 3461 3462 if (!vf_one_p) 3463 { 3464 /* Convert the checks for nonzero steps into bound tests. */ 3465 tree value; 3466 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value) 3467 vect_check_lower_bound (loop_vinfo, value, true, 1); 3468 } 3469 3470 if (may_alias_ddrs.is_empty ()) 3471 return opt_result::success (); 3472 3473 comp_alias_ddrs.create (may_alias_ddrs.length ()); 3474 3475 unsigned int loop_depth 3476 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num, 3477 LOOP_VINFO_LOOP_NEST (loop_vinfo)); 3478 3479 /* First, we collect all data ref pairs for aliasing checks. */ 3480 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr) 3481 { 3482 poly_uint64 lower_bound; 3483 tree segment_length_a, segment_length_b; 3484 unsigned HOST_WIDE_INT access_size_a, access_size_b; 3485 unsigned int align_a, align_b; 3486 3487 /* Ignore the alias if the VF we chose ended up being no greater 3488 than the dependence distance.
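For example, if every recorded dependence distance for the loop being vectorized is at least 8 scalar iterations and the chosen VF is 4, the conflicting iterations always fall into different vector iterations, so the pair needs no run-time check.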
*/ 3489 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor)) 3490 continue; 3491 3492 if (DDR_OBJECT_A (ddr)) 3493 { 3494 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr)); 3495 if (!compared_objects.add (new_pair)) 3496 { 3497 if (dump_enabled_p ()) 3498 dump_printf_loc (MSG_NOTE, vect_location, 3499 "checking that %T and %T" 3500 " have different addresses\n", 3501 new_pair.first, new_pair.second); 3502 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair); 3503 } 3504 continue; 3505 } 3506 3507 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr)); 3508 stmt_vec_info stmt_info_a = dr_info_a->stmt; 3509 3510 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr)); 3511 stmt_vec_info stmt_info_b = dr_info_b->stmt; 3512 3513 bool preserves_scalar_order_p 3514 = vect_preserves_scalar_order_p (dr_info_a, dr_info_b); 3515 bool ignore_step_p 3516 = (vf_one_p 3517 && (preserves_scalar_order_p 3518 || operand_equal_p (DR_STEP (dr_info_a->dr), 3519 DR_STEP (dr_info_b->dr)))); 3520 3521 /* Skip the pair if inter-iteration dependencies are irrelevant 3522 and intra-iteration dependencies are guaranteed to be honored. */ 3523 if (ignore_step_p 3524 && (preserves_scalar_order_p 3525 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b, 3526 &lower_bound))) 3527 { 3528 if (dump_enabled_p ()) 3529 dump_printf_loc (MSG_NOTE, vect_location, 3530 "no need for alias check between " 3531 "%T and %T when VF is 1\n", 3532 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); 3533 continue; 3534 } 3535 3536 /* See whether we can handle the alias using a bounds check on 3537 the step, and whether that's likely to be the best approach. 3538 (It might not be, for example, if the minimum step is much larger 3539 than the number of bytes handled by one vector iteration.) 
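As an illustration, with VF 4 and 4-byte elements a single vector iteration spans 16 bytes per reference, so a required bound such as abs (step) >= 8 is cheap to test and likely to hold at run time, whereas a required bound of several kilobytes is usually better handled by the segment overlap checks built further down.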
*/ 3540 if (!ignore_step_p 3541 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST 3542 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b, 3543 &lower_bound) 3544 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound) 3545 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound))) 3546 { 3547 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr); 3548 if (dump_enabled_p ()) 3549 { 3550 dump_printf_loc (MSG_NOTE, vect_location, "no alias between " 3551 "%T and %T when the step %T is outside ", 3552 DR_REF (dr_info_a->dr), 3553 DR_REF (dr_info_b->dr), 3554 DR_STEP (dr_info_a->dr)); 3555 if (unsigned_p) 3556 dump_printf (MSG_NOTE, "[0"); 3557 else 3558 { 3559 dump_printf (MSG_NOTE, "("); 3560 dump_dec (MSG_NOTE, poly_int64 (-lower_bound)); 3561 } 3562 dump_printf (MSG_NOTE, ", "); 3563 dump_dec (MSG_NOTE, lower_bound); 3564 dump_printf (MSG_NOTE, ")\n"); 3565 } 3566 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr), 3567 unsigned_p, lower_bound); 3568 continue; 3569 } 3570 3571 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a); 3572 if (dr_group_first_a) 3573 { 3574 stmt_info_a = dr_group_first_a; 3575 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a); 3576 } 3577 3578 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b); 3579 if (dr_group_first_b) 3580 { 3581 stmt_info_b = dr_group_first_b; 3582 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b); 3583 } 3584 3585 if (ignore_step_p) 3586 { 3587 segment_length_a = size_zero_node; 3588 segment_length_b = size_zero_node; 3589 } 3590 else 3591 { 3592 if (!operand_equal_p (DR_STEP (dr_info_a->dr), 3593 DR_STEP (dr_info_b->dr), 0)) 3594 length_factor = scalar_loop_iters; 3595 else 3596 length_factor = size_int (vect_factor); 3597 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor); 3598 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor); 3599 } 3600 access_size_a = vect_vfa_access_size (dr_info_a); 3601 access_size_b = vect_vfa_access_size (dr_info_b); 3602 align_a = vect_vfa_align (dr_info_a); 3603 align_b = vect_vfa_align (dr_info_b); 3604 3605 /* See whether the alias is known at compilation time. 
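This is possible when both references share the same DR_BASE_ADDRESS and DR_OFFSET and have compile-time constant steps and segment lengths, so that the two byte ranges can be compared directly. If the ranges are known to overlap, vectorization fails (a run-time versioning check would always be false); if they are known not to overlap, the pair is dropped; otherwise a run-time check is still required.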
*/ 3606 if (operand_equal_p (DR_BASE_ADDRESS (dr_info_a->dr), 3607 DR_BASE_ADDRESS (dr_info_b->dr), 0) 3608 && operand_equal_p (DR_OFFSET (dr_info_a->dr), 3609 DR_OFFSET (dr_info_b->dr), 0) 3610 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST 3611 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST 3612 && poly_int_tree_p (segment_length_a) 3613 && poly_int_tree_p (segment_length_b)) 3614 { 3615 int res = vect_compile_time_alias (dr_info_a, dr_info_b, 3616 segment_length_a, 3617 segment_length_b, 3618 access_size_a, 3619 access_size_b); 3620 if (res >= 0 && dump_enabled_p ()) 3621 { 3622 dump_printf_loc (MSG_NOTE, vect_location, 3623 "can tell at compile time that %T and %T", 3624 DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr)); 3625 if (res == 0) 3626 dump_printf (MSG_NOTE, " do not alias\n"); 3627 else 3628 dump_printf (MSG_NOTE, " alias\n"); 3629 } 3630 3631 if (res == 0) 3632 continue; 3633 3634 if (res == 1) 3635 return opt_result::failure_at (stmt_info_b->stmt, 3636 "not vectorized:" 3637 " compilation time alias: %G%G", 3638 stmt_info_a->stmt, 3639 stmt_info_b->stmt); 3640 } 3641 3642 dr_with_seg_len dr_a (dr_info_a->dr, segment_length_a, 3643 access_size_a, align_a); 3644 dr_with_seg_len dr_b (dr_info_b->dr, segment_length_b, 3645 access_size_b, align_b); 3646 /* Canonicalize the order to be the one that's needed for accurate 3647 RAW, WAR and WAW flags, in cases where the data references are 3648 well-ordered. The order doesn't really matter otherwise, 3649 but we might as well be consistent. */ 3650 if (get_later_stmt (stmt_info_a, stmt_info_b) == stmt_info_a) 3651 std::swap (dr_a, dr_b); 3652 3653 dr_with_seg_len_pair_t dr_with_seg_len_pair 3654 (dr_a, dr_b, (preserves_scalar_order_p 3655 ? dr_with_seg_len_pair_t::WELL_ORDERED 3656 : dr_with_seg_len_pair_t::REORDERED)); 3657 3658 comp_alias_ddrs.safe_push (dr_with_seg_len_pair); 3659 } 3660 3661 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor); 3662 3663 unsigned int count = (comp_alias_ddrs.length () 3664 + check_unequal_addrs.length ()); 3665 3666 if (dump_enabled_p ()) 3667 dump_printf_loc (MSG_NOTE, vect_location, 3668 "improved number of alias checks from %d to %d\n", 3669 may_alias_ddrs.length (), count); 3670 unsigned limit = param_vect_max_version_for_alias_checks; 3671 if (flag_simd_cost_model == VECT_COST_MODEL_CHEAP) 3672 limit = param_vect_max_version_for_alias_checks * 6 / 10; 3673 if (count > limit) 3674 return opt_result::failure_at 3675 (vect_location, 3676 "number of versioning for alias run-time tests exceeds %d " 3677 "(--param vect-max-version-for-alias-checks)\n", limit); 3678 3679 return opt_result::success (); 3680} 3681 3682/* Check whether we can use an internal function for a gather load 3683 or scatter store. READ_P is true for loads and false for stores. 3684 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is 3685 the type of the memory elements being loaded or stored. OFFSET_TYPE 3686 is the type of the offset that is being applied to the invariant 3687 base address. SCALE is the amount by which the offset should 3688 be multiplied *after* it has been converted to address width. 3689 3690 Return true if the function is supported, storing the function id in 3691 *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT. 
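If the target does not support the combination for OFFSET_TYPE itself, the loop below retries with offset types of twice the precision (keeping the signedness of OFFSET_TYPE) and gives up once the offset is already at least as wide as both a pointer and the memory element.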
*/ 3692 3693bool 3694vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p, 3695 tree vectype, tree memory_type, tree offset_type, 3696 int scale, internal_fn *ifn_out, 3697 tree *offset_vectype_out) 3698{ 3699 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type)); 3700 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype))); 3701 if (element_bits != memory_bits) 3702 /* For now the vector elements must be the same width as the 3703 memory elements. */ 3704 return false; 3705 3706 /* Work out which function we need. */ 3707 internal_fn ifn; 3708 if (read_p) 3709 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD; 3710 else 3711 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE; 3712 3713 for (;;) 3714 { 3715 tree offset_vectype = get_vectype_for_scalar_type (vinfo, offset_type); 3716 if (!offset_vectype) 3717 return false; 3718 3719 /* Test whether the target supports this combination. */ 3720 if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type, 3721 offset_vectype, scale)) 3722 { 3723 *ifn_out = ifn; 3724 *offset_vectype_out = offset_vectype; 3725 return true; 3726 } 3727 3728 if (TYPE_PRECISION (offset_type) >= POINTER_SIZE 3729 && TYPE_PRECISION (offset_type) >= element_bits) 3730 return false; 3731 3732 offset_type = build_nonstandard_integer_type 3733 (TYPE_PRECISION (offset_type) * 2, TYPE_UNSIGNED (offset_type)); 3734 } 3735} 3736 3737/* STMT_INFO is a call to an internal gather load or scatter store function. 3738 Describe the operation in INFO. */ 3739 3740static void 3741vect_describe_gather_scatter_call (stmt_vec_info stmt_info, 3742 gather_scatter_info *info) 3743{ 3744 gcall *call = as_a <gcall *> (stmt_info->stmt); 3745 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3746 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3747 3748 info->ifn = gimple_call_internal_fn (call); 3749 info->decl = NULL_TREE; 3750 info->base = gimple_call_arg (call, 0); 3751 info->offset = gimple_call_arg (call, 1); 3752 info->offset_dt = vect_unknown_def_type; 3753 info->offset_vectype = NULL_TREE; 3754 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); 3755 info->element_type = TREE_TYPE (vectype); 3756 info->memory_type = TREE_TYPE (DR_REF (dr)); 3757} 3758 3759/* Return true if a non-affine read or write in STMT_INFO is suitable for a 3760 gather load or scatter store. Describe the operation in *INFO if so. */ 3761 3762bool 3763vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo, 3764 gather_scatter_info *info) 3765{ 3766 HOST_WIDE_INT scale = 1; 3767 poly_int64 pbitpos, pbitsize; 3768 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 3769 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 3770 tree offtype = NULL_TREE; 3771 tree decl = NULL_TREE, base, off; 3772 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3773 tree memory_type = TREE_TYPE (DR_REF (dr)); 3774 machine_mode pmode; 3775 int punsignedp, reversep, pvolatilep = 0; 3776 internal_fn ifn; 3777 tree offset_vectype; 3778 bool masked_p = false; 3779 3780 /* See whether this is already a call to a gather/scatter internal function. 3781 If not, see whether it's a masked load or store. 
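If the statement is already such a call (e.g. a .GATHER_LOAD created by pattern recognition), its base, offset and scale are simply read back from the first three call arguments by vect_describe_gather_scatter_call.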
*/ 3782 gcall *call = dyn_cast <gcall *> (stmt_info->stmt); 3783 if (call && gimple_call_internal_p (call)) 3784 { 3785 ifn = gimple_call_internal_fn (call); 3786 if (internal_gather_scatter_fn_p (ifn)) 3787 { 3788 vect_describe_gather_scatter_call (stmt_info, info); 3789 return true; 3790 } 3791 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE); 3792 } 3793 3794 /* True if we should aim to use internal functions rather than 3795 built-in functions. */ 3796 bool use_ifn_p = (DR_IS_READ (dr) 3797 ? supports_vec_gather_load_p () 3798 : supports_vec_scatter_store_p ()); 3799 3800 base = DR_REF (dr); 3801 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF, 3802 see if we can use the def stmt of the address. */ 3803 if (masked_p 3804 && TREE_CODE (base) == MEM_REF 3805 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME 3806 && integer_zerop (TREE_OPERAND (base, 1)) 3807 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0))) 3808 { 3809 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0)); 3810 if (is_gimple_assign (def_stmt) 3811 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR) 3812 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); 3813 } 3814 3815 /* The gather and scatter builtins need address of the form 3816 loop_invariant + vector * {1, 2, 4, 8} 3817 or 3818 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }. 3819 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture 3820 of loop invariants/SSA_NAMEs defined in the loop, with casts, 3821 multiplications and additions in it. To get a vector, we need 3822 a single SSA_NAME that will be defined in the loop and will 3823 contain everything that is not loop invariant and that can be 3824 vectorized. The following code attempts to find such a preexistng 3825 SSA_NAME OFF and put the loop invariants into a tree BASE 3826 that can be gimplified before the loop. */ 3827 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode, 3828 &punsignedp, &reversep, &pvolatilep); 3829 if (reversep) 3830 return false; 3831 3832 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT); 3833 3834 if (TREE_CODE (base) == MEM_REF) 3835 { 3836 if (!integer_zerop (TREE_OPERAND (base, 1))) 3837 { 3838 if (off == NULL_TREE) 3839 off = wide_int_to_tree (sizetype, mem_ref_offset (base)); 3840 else 3841 off = size_binop (PLUS_EXPR, off, 3842 fold_convert (sizetype, TREE_OPERAND (base, 1))); 3843 } 3844 base = TREE_OPERAND (base, 0); 3845 } 3846 else 3847 base = build_fold_addr_expr (base); 3848 3849 if (off == NULL_TREE) 3850 off = size_zero_node; 3851 3852 /* If base is not loop invariant, either off is 0, then we start with just 3853 the constant offset in the loop invariant BASE and continue with base 3854 as OFF, otherwise give up. 3855 We could handle that case by gimplifying the addition of base + off 3856 into some SSA_NAME and use that as off, but for now punt. */ 3857 if (!expr_invariant_in_loop_p (loop, base)) 3858 { 3859 if (!integer_zerop (off)) 3860 return false; 3861 off = base; 3862 base = size_int (pbytepos); 3863 } 3864 /* Otherwise put base + constant offset into the loop invariant BASE 3865 and continue with OFF. */ 3866 else 3867 { 3868 base = fold_convert (sizetype, base); 3869 base = size_binop (PLUS_EXPR, base, size_int (pbytepos)); 3870 } 3871 3872 /* OFF at this point may be either a SSA_NAME or some tree expression 3873 from get_inner_reference. Try to peel off loop invariants from it 3874 into BASE as long as possible. 
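For instance, using illustrative SSA names: if OFF is _6 + x_4, where x_4 is loop invariant and _6 = i_3 * 4 is defined in the loop, the PLUS_EXPR case below moves x_4 (scaled by the current SCALE) into BASE and continues with OFF = _6; the MULT_EXPR case can then fold the constant 4 into SCALE, leaving OFF = i_3.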
*/ 3875 STRIP_NOPS (off); 3876 while (offtype == NULL_TREE) 3877 { 3878 enum tree_code code; 3879 tree op0, op1, add = NULL_TREE; 3880 3881 if (TREE_CODE (off) == SSA_NAME) 3882 { 3883 gimple *def_stmt = SSA_NAME_DEF_STMT (off); 3884 3885 if (expr_invariant_in_loop_p (loop, off)) 3886 return false; 3887 3888 if (gimple_code (def_stmt) != GIMPLE_ASSIGN) 3889 break; 3890 3891 op0 = gimple_assign_rhs1 (def_stmt); 3892 code = gimple_assign_rhs_code (def_stmt); 3893 op1 = gimple_assign_rhs2 (def_stmt); 3894 } 3895 else 3896 { 3897 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS) 3898 return false; 3899 code = TREE_CODE (off); 3900 extract_ops_from_tree (off, &code, &op0, &op1); 3901 } 3902 switch (code) 3903 { 3904 case POINTER_PLUS_EXPR: 3905 case PLUS_EXPR: 3906 if (expr_invariant_in_loop_p (loop, op0)) 3907 { 3908 add = op0; 3909 off = op1; 3910 do_add: 3911 add = fold_convert (sizetype, add); 3912 if (scale != 1) 3913 add = size_binop (MULT_EXPR, add, size_int (scale)); 3914 base = size_binop (PLUS_EXPR, base, add); 3915 continue; 3916 } 3917 if (expr_invariant_in_loop_p (loop, op1)) 3918 { 3919 add = op1; 3920 off = op0; 3921 goto do_add; 3922 } 3923 break; 3924 case MINUS_EXPR: 3925 if (expr_invariant_in_loop_p (loop, op1)) 3926 { 3927 add = fold_convert (sizetype, op1); 3928 add = size_binop (MINUS_EXPR, size_zero_node, add); 3929 off = op0; 3930 goto do_add; 3931 } 3932 break; 3933 case MULT_EXPR: 3934 if (scale == 1 && tree_fits_shwi_p (op1)) 3935 { 3936 int new_scale = tree_to_shwi (op1); 3937 /* Only treat this as a scaling operation if the target 3938 supports it for at least some offset type. */ 3939 if (use_ifn_p 3940 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), 3941 masked_p, vectype, memory_type, 3942 signed_char_type_node, 3943 new_scale, &ifn, 3944 &offset_vectype) 3945 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), 3946 masked_p, vectype, memory_type, 3947 unsigned_char_type_node, 3948 new_scale, &ifn, 3949 &offset_vectype)) 3950 break; 3951 scale = new_scale; 3952 off = op0; 3953 continue; 3954 } 3955 break; 3956 case SSA_NAME: 3957 off = op0; 3958 continue; 3959 CASE_CONVERT: 3960 if (!POINTER_TYPE_P (TREE_TYPE (op0)) 3961 && !INTEGRAL_TYPE_P (TREE_TYPE (op0))) 3962 break; 3963 3964 /* Don't include the conversion if the target is happy with 3965 the current offset type. */ 3966 if (use_ifn_p 3967 && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), 3968 masked_p, vectype, memory_type, 3969 TREE_TYPE (off), scale, &ifn, 3970 &offset_vectype)) 3971 break; 3972 3973 if (TYPE_PRECISION (TREE_TYPE (op0)) 3974 == TYPE_PRECISION (TREE_TYPE (off))) 3975 { 3976 off = op0; 3977 continue; 3978 } 3979 3980 if (TYPE_PRECISION (TREE_TYPE (op0)) 3981 < TYPE_PRECISION (TREE_TYPE (off))) 3982 { 3983 off = op0; 3984 offtype = TREE_TYPE (off); 3985 STRIP_NOPS (off); 3986 continue; 3987 } 3988 break; 3989 default: 3990 break; 3991 } 3992 break; 3993 } 3994 3995 /* If at the end OFF still isn't a SSA_NAME or isn't 3996 defined in the loop, punt. 
*/ 3997 if (TREE_CODE (off) != SSA_NAME 3998 || expr_invariant_in_loop_p (loop, off)) 3999 return false; 4000 4001 if (offtype == NULL_TREE) 4002 offtype = TREE_TYPE (off); 4003 4004 if (use_ifn_p) 4005 { 4006 if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p, 4007 vectype, memory_type, offtype, scale, 4008 &ifn, &offset_vectype)) 4009 return false; 4010 } 4011 else 4012 { 4013 if (DR_IS_READ (dr)) 4014 { 4015 if (targetm.vectorize.builtin_gather) 4016 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale); 4017 } 4018 else 4019 { 4020 if (targetm.vectorize.builtin_scatter) 4021 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale); 4022 } 4023 4024 if (!decl) 4025 return false; 4026 4027 ifn = IFN_LAST; 4028 /* The offset vector type will be read from DECL when needed. */ 4029 offset_vectype = NULL_TREE; 4030 } 4031 4032 info->ifn = ifn; 4033 info->decl = decl; 4034 info->base = base; 4035 info->offset = off; 4036 info->offset_dt = vect_unknown_def_type; 4037 info->offset_vectype = offset_vectype; 4038 info->scale = scale; 4039 info->element_type = TREE_TYPE (vectype); 4040 info->memory_type = memory_type; 4041 return true; 4042} 4043 4044/* Find the data references in STMT, analyze them with respect to LOOP and 4045 append them to DATAREFS. Return false if datarefs in this stmt cannot 4046 be handled. */ 4047 4048opt_result 4049vect_find_stmt_data_reference (loop_p loop, gimple *stmt, 4050 vec<data_reference_p> *datarefs) 4051{ 4052 /* We can ignore clobbers for dataref analysis - they are removed during 4053 loop vectorization and BB vectorization checks dependences with a 4054 stmt walk. */ 4055 if (gimple_clobber_p (stmt)) 4056 return opt_result::success (); 4057 4058 if (gimple_has_volatile_ops (stmt)) 4059 return opt_result::failure_at (stmt, "not vectorized: volatile type: %G", 4060 stmt); 4061 4062 if (stmt_can_throw_internal (cfun, stmt)) 4063 return opt_result::failure_at (stmt, 4064 "not vectorized:" 4065 " statement can throw an exception: %G", 4066 stmt); 4067 4068 auto_vec<data_reference_p, 2> refs; 4069 opt_result res = find_data_references_in_stmt (loop, stmt, &refs); 4070 if (!res) 4071 return res; 4072 4073 if (refs.is_empty ()) 4074 return opt_result::success (); 4075 4076 if (refs.length () > 1) 4077 return opt_result::failure_at (stmt, 4078 "not vectorized:" 4079 " more than one data ref in stmt: %G", stmt); 4080 4081 if (gcall *call = dyn_cast <gcall *> (stmt)) 4082 if (!gimple_call_internal_p (call) 4083 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD 4084 && gimple_call_internal_fn (call) != IFN_MASK_STORE)) 4085 return opt_result::failure_at (stmt, 4086 "not vectorized: dr in a call %G", stmt); 4087 4088 data_reference_p dr = refs.pop (); 4089 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF 4090 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1))) 4091 return opt_result::failure_at (stmt, 4092 "not vectorized:" 4093 " statement is bitfield access %G", stmt); 4094 4095 if (DR_BASE_ADDRESS (dr) 4096 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST) 4097 return opt_result::failure_at (stmt, 4098 "not vectorized:" 4099 " base addr of dr is a constant\n"); 4100 4101 /* Check whether this may be a SIMD lane access and adjust the 4102 DR to make it easier for us to handle it. 
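Such accesses typically come from OpenMP simd loops, where a conceptually private scalar is addressed through the result of an IFN_GOMP_SIMD_LANE call. When the pattern is recognized below, the data reference is rewritten to have a zero offset and a constant step equal to the element size, and the access is marked as a simd-lane access via the dr's aux field.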
*/ 4103 if (loop 4104 && loop->simduid 4105 && (!DR_BASE_ADDRESS (dr) 4106 || !DR_OFFSET (dr) 4107 || !DR_INIT (dr) 4108 || !DR_STEP (dr))) 4109 { 4110 struct data_reference *newdr 4111 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt, 4112 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr)); 4113 if (DR_BASE_ADDRESS (newdr) 4114 && DR_OFFSET (newdr) 4115 && DR_INIT (newdr) 4116 && DR_STEP (newdr) 4117 && TREE_CODE (DR_INIT (newdr)) == INTEGER_CST 4118 && integer_zerop (DR_STEP (newdr))) 4119 { 4120 tree base_address = DR_BASE_ADDRESS (newdr); 4121 tree off = DR_OFFSET (newdr); 4122 tree step = ssize_int (1); 4123 if (integer_zerop (off) 4124 && TREE_CODE (base_address) == POINTER_PLUS_EXPR) 4125 { 4126 off = TREE_OPERAND (base_address, 1); 4127 base_address = TREE_OPERAND (base_address, 0); 4128 } 4129 STRIP_NOPS (off); 4130 if (TREE_CODE (off) == MULT_EXPR 4131 && tree_fits_uhwi_p (TREE_OPERAND (off, 1))) 4132 { 4133 step = TREE_OPERAND (off, 1); 4134 off = TREE_OPERAND (off, 0); 4135 STRIP_NOPS (off); 4136 } 4137 if (CONVERT_EXPR_P (off) 4138 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0))) 4139 < TYPE_PRECISION (TREE_TYPE (off)))) 4140 off = TREE_OPERAND (off, 0); 4141 if (TREE_CODE (off) == SSA_NAME) 4142 { 4143 gimple *def = SSA_NAME_DEF_STMT (off); 4144 /* Look through widening conversion. */ 4145 if (is_gimple_assign (def) 4146 && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (def))) 4147 { 4148 tree rhs1 = gimple_assign_rhs1 (def); 4149 if (TREE_CODE (rhs1) == SSA_NAME 4150 && INTEGRAL_TYPE_P (TREE_TYPE (rhs1)) 4151 && (TYPE_PRECISION (TREE_TYPE (off)) 4152 > TYPE_PRECISION (TREE_TYPE (rhs1)))) 4153 def = SSA_NAME_DEF_STMT (rhs1); 4154 } 4155 if (is_gimple_call (def) 4156 && gimple_call_internal_p (def) 4157 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE)) 4158 { 4159 tree arg = gimple_call_arg (def, 0); 4160 tree reft = TREE_TYPE (DR_REF (newdr)); 4161 gcc_assert (TREE_CODE (arg) == SSA_NAME); 4162 arg = SSA_NAME_VAR (arg); 4163 if (arg == loop->simduid 4164 /* For now. */ 4165 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step)) 4166 { 4167 DR_BASE_ADDRESS (newdr) = base_address; 4168 DR_OFFSET (newdr) = ssize_int (0); 4169 DR_STEP (newdr) = step; 4170 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT; 4171 DR_STEP_ALIGNMENT (newdr) = highest_pow2_factor (step); 4172 /* Mark as simd-lane access. */ 4173 tree arg2 = gimple_call_arg (def, 1); 4174 newdr->aux = (void *) (-1 - tree_to_uhwi (arg2)); 4175 free_data_ref (dr); 4176 datarefs->safe_push (newdr); 4177 return opt_result::success (); 4178 } 4179 } 4180 } 4181 } 4182 free_data_ref (newdr); 4183 } 4184 4185 datarefs->safe_push (dr); 4186 return opt_result::success (); 4187} 4188 4189/* Function vect_analyze_data_refs. 4190 4191 Find all the data references in the loop or basic block. 4192 4193 The general structure of the analysis of data refs in the vectorizer is as 4194 follows: 4195 1- vect_analyze_data_refs(loop/bb): call 4196 compute_data_dependences_for_loop/bb to find and analyze all data-refs 4197 in the loop/bb and their dependences. 4198 2- vect_analyze_dependences(): apply dependence testing using ddrs. 4199 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok. 4200 4- vect_analyze_drs_access(): check that ref_stmt.step is ok. 
4201 4202*/ 4203 4204opt_result 4205vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf, bool *fatal) 4206{ 4207 class loop *loop = NULL; 4208 unsigned int i; 4209 struct data_reference *dr; 4210 tree scalar_type; 4211 4212 DUMP_VECT_SCOPE ("vect_analyze_data_refs"); 4213 4214 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo)) 4215 loop = LOOP_VINFO_LOOP (loop_vinfo); 4216 4217 /* Go through the data-refs, check that the analysis succeeded. Update 4218 pointer from stmt_vec_info struct to DR and vectype. */ 4219 4220 vec<data_reference_p> datarefs = vinfo->shared->datarefs; 4221 FOR_EACH_VEC_ELT (datarefs, i, dr) 4222 { 4223 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE; 4224 poly_uint64 vf; 4225 4226 gcc_assert (DR_REF (dr)); 4227 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr)); 4228 gcc_assert (!stmt_info->dr_aux.dr); 4229 stmt_info->dr_aux.dr = dr; 4230 stmt_info->dr_aux.stmt = stmt_info; 4231 4232 /* Check that analysis of the data-ref succeeded. */ 4233 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr) 4234 || !DR_STEP (dr)) 4235 { 4236 bool maybe_gather 4237 = DR_IS_READ (dr) 4238 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4239 && (targetm.vectorize.builtin_gather != NULL 4240 || supports_vec_gather_load_p ()); 4241 bool maybe_scatter 4242 = DR_IS_WRITE (dr) 4243 && !TREE_THIS_VOLATILE (DR_REF (dr)) 4244 && (targetm.vectorize.builtin_scatter != NULL 4245 || supports_vec_scatter_store_p ()); 4246 4247 /* If target supports vector gather loads or scatter stores, 4248 see if they can't be used. */ 4249 if (is_a <loop_vec_info> (vinfo) 4250 && !nested_in_vect_loop_p (loop, stmt_info)) 4251 { 4252 if (maybe_gather || maybe_scatter) 4253 { 4254 if (maybe_gather) 4255 gatherscatter = GATHER; 4256 else 4257 gatherscatter = SCATTER; 4258 } 4259 } 4260 4261 if (gatherscatter == SG_NONE) 4262 { 4263 if (dump_enabled_p ()) 4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4265 "not vectorized: data ref analysis " 4266 "failed %G", stmt_info->stmt); 4267 if (is_a <bb_vec_info> (vinfo)) 4268 { 4269 /* In BB vectorization the ref can still participate 4270 in dependence analysis, we just can't vectorize it. */ 4271 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4272 continue; 4273 } 4274 return opt_result::failure_at (stmt_info->stmt, 4275 "not vectorized:" 4276 " data ref analysis failed: %G", 4277 stmt_info->stmt); 4278 } 4279 } 4280 4281 /* See if this was detected as SIMD lane access. */ 4282 if (dr->aux == (void *)-1 4283 || dr->aux == (void *)-2 4284 || dr->aux == (void *)-3 4285 || dr->aux == (void *)-4) 4286 { 4287 if (nested_in_vect_loop_p (loop, stmt_info)) 4288 return opt_result::failure_at (stmt_info->stmt, 4289 "not vectorized:" 4290 " data ref analysis failed: %G", 4291 stmt_info->stmt); 4292 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) 4293 = -(uintptr_t) dr->aux; 4294 } 4295 4296 tree base = get_base_address (DR_REF (dr)); 4297 if (base && VAR_P (base) && DECL_NONALIASED (base)) 4298 { 4299 if (dump_enabled_p ()) 4300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4301 "not vectorized: base object not addressable " 4302 "for stmt: %G", stmt_info->stmt); 4303 if (is_a <bb_vec_info> (vinfo)) 4304 { 4305 /* In BB vectorization the ref can still participate 4306 in dependence analysis, we just can't vectorize it. 
*/ 4307 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4308 continue; 4309 } 4310 return opt_result::failure_at (stmt_info->stmt, 4311 "not vectorized: base object not" 4312 " addressable for stmt: %G", 4313 stmt_info->stmt); 4314 } 4315 4316 if (is_a <loop_vec_info> (vinfo) 4317 && DR_STEP (dr) 4318 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST) 4319 { 4320 if (nested_in_vect_loop_p (loop, stmt_info)) 4321 return opt_result::failure_at (stmt_info->stmt, 4322 "not vectorized: " 4323 "not suitable for strided load %G", 4324 stmt_info->stmt); 4325 STMT_VINFO_STRIDED_P (stmt_info) = true; 4326 } 4327 4328 /* Update DR field in stmt_vec_info struct. */ 4329 4330 /* If the dataref is in an inner-loop of the loop that is considered for 4331 for vectorization, we also want to analyze the access relative to 4332 the outer-loop (DR contains information only relative to the 4333 inner-most enclosing loop). We do that by building a reference to the 4334 first location accessed by the inner-loop, and analyze it relative to 4335 the outer-loop. */ 4336 if (loop && nested_in_vect_loop_p (loop, stmt_info)) 4337 { 4338 /* Build a reference to the first location accessed by the 4339 inner loop: *(BASE + INIT + OFFSET). By construction, 4340 this address must be invariant in the inner loop, so we 4341 can consider it as being used in the outer loop. */ 4342 tree base = unshare_expr (DR_BASE_ADDRESS (dr)); 4343 tree offset = unshare_expr (DR_OFFSET (dr)); 4344 tree init = unshare_expr (DR_INIT (dr)); 4345 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), 4346 init, offset); 4347 tree init_addr = fold_build_pointer_plus (base, init_offset); 4348 tree init_ref = build_fold_indirect_ref (init_addr); 4349 4350 if (dump_enabled_p ()) 4351 dump_printf_loc (MSG_NOTE, vect_location, 4352 "analyze in outer loop: %T\n", init_ref); 4353 4354 opt_result res 4355 = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info), 4356 init_ref, loop, stmt_info->stmt); 4357 if (!res) 4358 /* dr_analyze_innermost already explained the failure. */ 4359 return res; 4360 4361 if (dump_enabled_p ()) 4362 dump_printf_loc (MSG_NOTE, vect_location, 4363 "\touter base_address: %T\n" 4364 "\touter offset from base address: %T\n" 4365 "\touter constant offset from base address: %T\n" 4366 "\touter step: %T\n" 4367 "\touter base alignment: %d\n\n" 4368 "\touter base misalignment: %d\n" 4369 "\touter offset alignment: %d\n" 4370 "\touter step alignment: %d\n", 4371 STMT_VINFO_DR_BASE_ADDRESS (stmt_info), 4372 STMT_VINFO_DR_OFFSET (stmt_info), 4373 STMT_VINFO_DR_INIT (stmt_info), 4374 STMT_VINFO_DR_STEP (stmt_info), 4375 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info), 4376 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info), 4377 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info), 4378 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info)); 4379 } 4380 4381 /* Set vectype for STMT. */ 4382 scalar_type = TREE_TYPE (DR_REF (dr)); 4383 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type); 4384 if (!vectype) 4385 { 4386 if (dump_enabled_p ()) 4387 { 4388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 4389 "not vectorized: no vectype for stmt: %G", 4390 stmt_info->stmt); 4391 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: "); 4392 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS, 4393 scalar_type); 4394 dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); 4395 } 4396 4397 if (is_a <bb_vec_info> (vinfo)) 4398 { 4399 /* No vector type is fine, the ref can still participate 4400 in dependence analysis, we just can't vectorize it. 
*/ 4401 STMT_VINFO_VECTORIZABLE (stmt_info) = false; 4402 continue; 4403 } 4404 if (fatal) 4405 *fatal = false; 4406 return opt_result::failure_at (stmt_info->stmt, 4407 "not vectorized:" 4408 " no vectype for stmt: %G" 4409 " scalar_type: %T\n", 4410 stmt_info->stmt, scalar_type); 4411 } 4412 else 4413 { 4414 if (dump_enabled_p ()) 4415 dump_printf_loc (MSG_NOTE, vect_location, 4416 "got vectype for stmt: %G%T\n", 4417 stmt_info->stmt, vectype); 4418 } 4419 4420 /* Adjust the minimal vectorization factor according to the 4421 vector type. */ 4422 vf = TYPE_VECTOR_SUBPARTS (vectype); 4423 *min_vf = upper_bound (*min_vf, vf); 4424 4425 /* Leave the BB vectorizer to pick the vector type later, based on 4426 the final dataref group size and SLP node size. */ 4427 if (is_a <loop_vec_info> (vinfo)) 4428 STMT_VINFO_VECTYPE (stmt_info) = vectype; 4429 4430 if (gatherscatter != SG_NONE) 4431 { 4432 gather_scatter_info gs_info; 4433 if (!vect_check_gather_scatter (stmt_info, 4434 as_a <loop_vec_info> (vinfo), 4435 &gs_info) 4436 || !get_vectype_for_scalar_type (vinfo, 4437 TREE_TYPE (gs_info.offset))) 4438 { 4439 if (fatal) 4440 *fatal = false; 4441 return opt_result::failure_at 4442 (stmt_info->stmt, 4443 (gatherscatter == GATHER) 4444 ? "not vectorized: not suitable for gather load %G" 4445 : "not vectorized: not suitable for scatter store %G", 4446 stmt_info->stmt); 4447 } 4448 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter; 4449 } 4450 } 4451 4452 /* We used to stop processing and prune the list here. Verify we no 4453 longer need to. */ 4454 gcc_assert (i == datarefs.length ()); 4455 4456 return opt_result::success (); 4457} 4458 4459 4460/* Function vect_get_new_vect_var. 4461 4462 Returns a name for a new variable. The current naming scheme appends the 4463 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to 4464 the name of vectorizer generated variables, and appends that to NAME if 4465 provided. */ 4466 4467tree 4468vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) 4469{ 4470 const char *prefix; 4471 tree new_vect_var; 4472 4473 switch (var_kind) 4474 { 4475 case vect_simple_var: 4476 prefix = "vect"; 4477 break; 4478 case vect_scalar_var: 4479 prefix = "stmp"; 4480 break; 4481 case vect_mask_var: 4482 prefix = "mask"; 4483 break; 4484 case vect_pointer_var: 4485 prefix = "vectp"; 4486 break; 4487 default: 4488 gcc_unreachable (); 4489 } 4490 4491 if (name) 4492 { 4493 char* tmp = concat (prefix, "_", name, NULL); 4494 new_vect_var = create_tmp_reg (type, tmp); 4495 free (tmp); 4496 } 4497 else 4498 new_vect_var = create_tmp_reg (type, prefix); 4499 4500 return new_vect_var; 4501} 4502 4503/* Like vect_get_new_vect_var but return an SSA name. */ 4504 4505tree 4506vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name) 4507{ 4508 const char *prefix; 4509 tree new_vect_var; 4510 4511 switch (var_kind) 4512 { 4513 case vect_simple_var: 4514 prefix = "vect"; 4515 break; 4516 case vect_scalar_var: 4517 prefix = "stmp"; 4518 break; 4519 case vect_pointer_var: 4520 prefix = "vectp"; 4521 break; 4522 default: 4523 gcc_unreachable (); 4524 } 4525 4526 if (name) 4527 { 4528 char* tmp = concat (prefix, "_", name, NULL); 4529 new_vect_var = make_temp_ssa_name (type, NULL, tmp); 4530 free (tmp); 4531 } 4532 else 4533 new_vect_var = make_temp_ssa_name (type, NULL, prefix); 4534 4535 return new_vect_var; 4536} 4537 4538/* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO. 
*/ 4539 4540 static void 4541 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info) 4542 { 4543 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr)); 4544 int misalign = DR_MISALIGNMENT (dr_info); 4545 if (misalign == DR_MISALIGNMENT_UNKNOWN) 4546 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name)); 4547 else 4548 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), 4549 known_alignment (DR_TARGET_ALIGNMENT (dr_info)), 4550 misalign); 4551 } 4552 4553 /* Function vect_create_addr_base_for_vector_ref. 4554 4555 Create an expression that computes the address of the first memory location 4556 that will be accessed for a data reference. 4557 4558 Input: 4559 STMT_INFO: The statement containing the data reference. 4560 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. 4561 OFFSET: Optional. If supplied, it is added to the initial address. 4562 LOOP: Specify relative to which loop-nest the address should be computed. 4563 For example, when the dataref is in an inner-loop nested in an 4564 outer-loop that is now being vectorized, LOOP can be either the 4565 outer-loop, or the inner-loop. The first memory location accessed 4566 by the following dataref ('in' points to short): 4567 4568 for (i=0; i<N; i++) 4569 for (j=0; j<M; j++) 4570 s += in[i+j] 4571 4572 is as follows: 4573 if LOOP=i_loop: &in (relative to i_loop) 4574 if LOOP=j_loop: &in+i*2B (relative to j_loop) 4575 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the 4576 initial address. Unlike OFFSET, which is the number of elements to 4577 be added, BYTE_OFFSET is measured in bytes. 4578 4579 Output: 4580 1. Return an SSA_NAME whose value is the address of the memory location of 4581 the first vector of the data reference. 4582 2. If new_stmt_list is not NULL_TREE after return then the caller must insert 4583 these statement(s) which define the returned SSA_NAME. 4584 4585 FORNOW: We are only handling array accesses with step 1.
*/ 4586 4587tree 4588vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info, 4589 gimple_seq *new_stmt_list, 4590 tree offset, 4591 tree byte_offset) 4592{ 4593 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 4594 struct data_reference *dr = dr_info->dr; 4595 const char *base_name; 4596 tree addr_base; 4597 tree dest; 4598 gimple_seq seq = NULL; 4599 tree vect_ptr_type; 4600 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 4601 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4602 innermost_loop_behavior *drb = vect_dr_behavior (dr_info); 4603 4604 tree data_ref_base = unshare_expr (drb->base_address); 4605 tree base_offset = unshare_expr (get_dr_vinfo_offset (dr_info, true)); 4606 tree init = unshare_expr (drb->init); 4607 4608 if (loop_vinfo) 4609 base_name = get_name (data_ref_base); 4610 else 4611 { 4612 base_offset = ssize_int (0); 4613 init = ssize_int (0); 4614 base_name = get_name (DR_REF (dr)); 4615 } 4616 4617 /* Create base_offset */ 4618 base_offset = size_binop (PLUS_EXPR, 4619 fold_convert (sizetype, base_offset), 4620 fold_convert (sizetype, init)); 4621 4622 if (offset) 4623 { 4624 offset = fold_build2 (MULT_EXPR, sizetype, 4625 fold_convert (sizetype, offset), step); 4626 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4627 base_offset, offset); 4628 } 4629 if (byte_offset) 4630 { 4631 byte_offset = fold_convert (sizetype, byte_offset); 4632 base_offset = fold_build2 (PLUS_EXPR, sizetype, 4633 base_offset, byte_offset); 4634 } 4635 4636 /* base + base_offset */ 4637 if (loop_vinfo) 4638 addr_base = fold_build_pointer_plus (data_ref_base, base_offset); 4639 else 4640 { 4641 addr_base = build1 (ADDR_EXPR, 4642 build_pointer_type (TREE_TYPE (DR_REF (dr))), 4643 unshare_expr (DR_REF (dr))); 4644 } 4645 4646 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); 4647 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name); 4648 addr_base = force_gimple_operand (addr_base, &seq, true, dest); 4649 gimple_seq_add_seq (new_stmt_list, seq); 4650 4651 if (DR_PTR_INFO (dr) 4652 && TREE_CODE (addr_base) == SSA_NAME 4653 /* We should only duplicate pointer info to newly created SSA names. */ 4654 && SSA_NAME_VAR (addr_base) == dest) 4655 { 4656 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info); 4657 if (offset || byte_offset) 4658 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base)); 4659 } 4660 4661 if (dump_enabled_p ()) 4662 dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base); 4663 4664 return addr_base; 4665} 4666 4667 4668/* Function vect_create_data_ref_ptr. 4669 4670 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first 4671 location accessed in the loop by STMT_INFO, along with the def-use update 4672 chain to appropriately advance the pointer through the loop iterations. 4673 Also set aliasing information for the pointer. This pointer is used by 4674 the callers to this function to create a memory reference expression for 4675 vector load/store access. 4676 4677 Input: 4678 1. STMT_INFO: a stmt that references memory. Expected to be of the form 4679 GIMPLE_ASSIGN <name, data-ref> or 4680 GIMPLE_ASSIGN <data-ref, name>. 4681 2. AGGR_TYPE: the type of the reference, which should be either a vector 4682 or an array. 4683 3. AT_LOOP: the loop where the vector memref is to be created. 4684 4. OFFSET (optional): an offset to be added to the initial address accessed 4685 by the data-ref in STMT_INFO. 4686 5. 
BSI: location where the new stmts are to be placed if there is no loop 4687 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain 4688 pointing to the initial address. 4689 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added 4690 to the initial address accessed by the data-ref in STMT_INFO. This is 4691 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET 4692 in bytes. 4693 8. IV_STEP (optional, defaults to NULL): the amount that should be added 4694 to the IV during each iteration of the loop. NULL says to move 4695 by one copy of AGGR_TYPE up or down, depending on the step of the 4696 data reference. 4697 4698 Output: 4699 1. Declare a new ptr to vector_type, and have it point to the base of the 4700 data reference (initial addressed accessed by the data reference). 4701 For example, for vector of type V8HI, the following code is generated: 4702 4703 v8hi *ap; 4704 ap = (v8hi *)initial_address; 4705 4706 if OFFSET is not supplied: 4707 initial_address = &a[init]; 4708 if OFFSET is supplied: 4709 initial_address = &a[init + OFFSET]; 4710 if BYTE_OFFSET is supplied: 4711 initial_address = &a[init] + BYTE_OFFSET; 4712 4713 Return the initial_address in INITIAL_ADDRESS. 4714 4715 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also 4716 update the pointer in each iteration of the loop. 4717 4718 Return the increment stmt that updates the pointer in PTR_INCR. 4719 4720 3. Return the pointer. */ 4721 4722tree 4723vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type, 4724 class loop *at_loop, tree offset, 4725 tree *initial_address, gimple_stmt_iterator *gsi, 4726 gimple **ptr_incr, bool only_init, 4727 tree byte_offset, tree iv_step) 4728{ 4729 const char *base_name; 4730 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 4731 class loop *loop = NULL; 4732 bool nested_in_vect_loop = false; 4733 class loop *containing_loop = NULL; 4734 tree aggr_ptr_type; 4735 tree aggr_ptr; 4736 tree new_temp; 4737 gimple_seq new_stmt_list = NULL; 4738 edge pe = NULL; 4739 basic_block new_bb; 4740 tree aggr_ptr_init; 4741 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 4742 struct data_reference *dr = dr_info->dr; 4743 tree aptr; 4744 gimple_stmt_iterator incr_gsi; 4745 bool insert_after; 4746 tree indx_before_incr, indx_after_incr; 4747 gimple *incr; 4748 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info); 4749 4750 gcc_assert (iv_step != NULL_TREE 4751 || TREE_CODE (aggr_type) == ARRAY_TYPE 4752 || TREE_CODE (aggr_type) == VECTOR_TYPE); 4753 4754 if (loop_vinfo) 4755 { 4756 loop = LOOP_VINFO_LOOP (loop_vinfo); 4757 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); 4758 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; 4759 pe = loop_preheader_edge (loop); 4760 } 4761 else 4762 { 4763 gcc_assert (bb_vinfo); 4764 only_init = true; 4765 *ptr_incr = NULL; 4766 } 4767 4768 /* Create an expression for the first address accessed by this load 4769 in LOOP. 
*/ 4770 base_name = get_name (DR_BASE_ADDRESS (dr)); 4771 4772 if (dump_enabled_p ()) 4773 { 4774 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr)); 4775 dump_printf_loc (MSG_NOTE, vect_location, 4776 "create %s-pointer variable to type: %T", 4777 get_tree_code_name (TREE_CODE (aggr_type)), 4778 aggr_type); 4779 if (TREE_CODE (dr_base_type) == ARRAY_TYPE) 4780 dump_printf (MSG_NOTE, " vectorizing an array ref: "); 4781 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE) 4782 dump_printf (MSG_NOTE, " vectorizing a vector ref: "); 4783 else if (TREE_CODE (dr_base_type) == RECORD_TYPE) 4784 dump_printf (MSG_NOTE, " vectorizing a record based array ref: "); 4785 else 4786 dump_printf (MSG_NOTE, " vectorizing a pointer ref: "); 4787 dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr)); 4788 } 4789 4790 /* (1) Create the new aggregate-pointer variable. 4791 Vector and array types inherit the alias set of their component 4792 type by default so we need to use a ref-all pointer if the data 4793 reference does not conflict with the created aggregated data 4794 reference because it is not addressable. */ 4795 bool need_ref_all = false; 4796 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4797 get_alias_set (DR_REF (dr)))) 4798 need_ref_all = true; 4799 /* Likewise for any of the data references in the stmt group. */ 4800 else if (DR_GROUP_SIZE (stmt_info) > 1) 4801 { 4802 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info); 4803 do 4804 { 4805 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo); 4806 if (!alias_sets_conflict_p (get_alias_set (aggr_type), 4807 get_alias_set (DR_REF (sdr)))) 4808 { 4809 need_ref_all = true; 4810 break; 4811 } 4812 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo); 4813 } 4814 while (sinfo); 4815 } 4816 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode, 4817 need_ref_all); 4818 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name); 4819 4820 4821 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are 4822 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two 4823 def-use update cycles for the pointer: one relative to the outer-loop 4824 (LOOP), which is what steps (3) and (4) below do. The other is relative 4825 to the inner-loop (which is the inner-most loop containing the dataref), 4826 and this is done be step (5) below. 4827 4828 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the 4829 inner-most loop, and so steps (3),(4) work the same, and step (5) is 4830 redundant. Steps (3),(4) create the following: 4831 4832 vp0 = &base_addr; 4833 LOOP: vp1 = phi(vp0,vp2) 4834 ... 4835 ... 4836 vp2 = vp1 + step 4837 goto LOOP 4838 4839 If there is an inner-loop nested in loop, then step (5) will also be 4840 applied, and an additional update in the inner-loop will be created: 4841 4842 vp0 = &base_addr; 4843 LOOP: vp1 = phi(vp0,vp2) 4844 ... 4845 inner: vp3 = phi(vp1,vp4) 4846 vp4 = vp3 + inner_step 4847 if () goto inner 4848 ... 4849 vp2 = vp1 + step 4850 if () goto LOOP */ 4851 4852 /* (2) Calculate the initial address of the aggregate-pointer, and set 4853 the aggregate-pointer to point to it before the loop. */ 4854 4855 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. 
*/ 4856 4857 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list, 4858 offset, byte_offset); 4859 if (new_stmt_list) 4860 { 4861 if (pe) 4862 { 4863 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list); 4864 gcc_assert (!new_bb); 4865 } 4866 else 4867 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT); 4868 } 4869 4870 *initial_address = new_temp; 4871 aggr_ptr_init = new_temp; 4872 4873 /* (3) Handle the updating of the aggregate-pointer inside the loop. 4874 This is needed when ONLY_INIT is false, and also when AT_LOOP is the 4875 inner-loop nested in LOOP (during outer-loop vectorization). */ 4876 4877 /* No update in loop is required. */ 4878 if (only_init && (!loop_vinfo || at_loop == loop)) 4879 aptr = aggr_ptr_init; 4880 else 4881 { 4882 /* Accesses to invariant addresses should be handled specially 4883 by the caller. */ 4884 tree step = vect_dr_behavior (dr_info)->step; 4885 gcc_assert (!integer_zerop (step)); 4886 4887 if (iv_step == NULL_TREE) 4888 { 4889 /* The step of the aggregate pointer is the type size, 4890 negated for downward accesses. */ 4891 iv_step = TYPE_SIZE_UNIT (aggr_type); 4892 if (tree_int_cst_sgn (step) == -1) 4893 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step); 4894 } 4895 4896 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 4897 4898 create_iv (aggr_ptr_init, 4899 fold_convert (aggr_ptr_type, iv_step), 4900 aggr_ptr, loop, &incr_gsi, insert_after, 4901 &indx_before_incr, &indx_after_incr); 4902 incr = gsi_stmt (incr_gsi); 4903 loop_vinfo->add_stmt (incr); 4904 4905 /* Copy the points-to information if it exists. */ 4906 if (DR_PTR_INFO (dr)) 4907 { 4908 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); 4909 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); 4910 } 4911 if (ptr_incr) 4912 *ptr_incr = incr; 4913 4914 aptr = indx_before_incr; 4915 } 4916 4917 if (!nested_in_vect_loop || only_init) 4918 return aptr; 4919 4920 4921 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop 4922 nested in LOOP, if exists. */ 4923 4924 gcc_assert (nested_in_vect_loop); 4925 if (!only_init) 4926 { 4927 standard_iv_increment_position (containing_loop, &incr_gsi, 4928 &insert_after); 4929 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr, 4930 containing_loop, &incr_gsi, insert_after, &indx_before_incr, 4931 &indx_after_incr); 4932 incr = gsi_stmt (incr_gsi); 4933 loop_vinfo->add_stmt (incr); 4934 4935 /* Copy the points-to information if it exists. */ 4936 if (DR_PTR_INFO (dr)) 4937 { 4938 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info); 4939 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info); 4940 } 4941 if (ptr_incr) 4942 *ptr_incr = incr; 4943 4944 return indx_before_incr; 4945 } 4946 else 4947 gcc_unreachable (); 4948} 4949 4950 4951/* Function bump_vector_ptr 4952 4953 Increment a pointer (to a vector type) by vector-size. If requested, 4954 i.e. if PTR-INCR is given, then also connect the new increment stmt 4955 to the existing def-use update-chain of the pointer, by modifying 4956 the PTR_INCR as illustrated below: 4957 4958 The pointer def-use update-chain before this function: 4959 DATAREF_PTR = phi (p_0, p_2) 4960 .... 4961 PTR_INCR: p_2 = DATAREF_PTR + step 4962 4963 The pointer def-use update-chain after this function: 4964 DATAREF_PTR = phi (p_0, p_2) 4965 .... 4966 NEW_DATAREF_PTR = DATAREF_PTR + BUMP 4967 .... 
4968 PTR_INCR: p_2 = NEW_DATAREF_PTR + step 4969 4970 Input: 4971 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated 4972 in the loop. 4973 PTR_INCR - optional. The stmt that updates the pointer in each iteration of 4974 the loop. The increment amount across iterations is expected 4975 to be vector_size. 4976 BSI - location where the new update stmt is to be placed. 4977 STMT_INFO - the original scalar memory-access stmt that is being vectorized. 4978 BUMP - optional. The offset by which to bump the pointer. If not given, 4979 the offset is assumed to be vector_size. 4980 4981 Output: Return NEW_DATAREF_PTR as illustrated above. 4982 4983*/ 4984 4985tree 4986bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi, 4987 stmt_vec_info stmt_info, tree bump) 4988{ 4989 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); 4990 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 4991 tree update = TYPE_SIZE_UNIT (vectype); 4992 gassign *incr_stmt; 4993 ssa_op_iter iter; 4994 use_operand_p use_p; 4995 tree new_dataref_ptr; 4996 4997 if (bump) 4998 update = bump; 4999 5000 if (TREE_CODE (dataref_ptr) == SSA_NAME) 5001 new_dataref_ptr = copy_ssa_name (dataref_ptr); 5002 else 5003 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr)); 5004 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR, 5005 dataref_ptr, update); 5006 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi); 5007 5008 /* Copy the points-to information if it exists. */ 5009 if (DR_PTR_INFO (dr)) 5010 { 5011 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); 5012 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr)); 5013 } 5014 5015 if (!ptr_incr) 5016 return new_dataref_ptr; 5017 5018 /* Update the vector-pointer's cross-iteration increment. */ 5019 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) 5020 { 5021 tree use = USE_FROM_PTR (use_p); 5022 5023 if (use == dataref_ptr) 5024 SET_USE (use_p, new_dataref_ptr); 5025 else 5026 gcc_assert (operand_equal_p (use, update, 0)); 5027 } 5028 5029 return new_dataref_ptr; 5030} 5031 5032 5033/* Copy memory reference info such as base/clique from the SRC reference 5034 to the DEST MEM_REF. */ 5035 5036void 5037vect_copy_ref_info (tree dest, tree src) 5038{ 5039 if (TREE_CODE (dest) != MEM_REF) 5040 return; 5041 5042 tree src_base = src; 5043 while (handled_component_p (src_base)) 5044 src_base = TREE_OPERAND (src_base, 0); 5045 if (TREE_CODE (src_base) != MEM_REF 5046 && TREE_CODE (src_base) != TARGET_MEM_REF) 5047 return; 5048 5049 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base); 5050 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base); 5051} 5052 5053 5054/* Function vect_create_destination_var. 5055 5056 Create a new temporary of type VECTYPE. */ 5057 5058tree 5059vect_create_destination_var (tree scalar_dest, tree vectype) 5060{ 5061 tree vec_dest; 5062 const char *name; 5063 char *new_name; 5064 tree type; 5065 enum vect_var_kind kind; 5066 5067 kind = vectype 5068 ? VECTOR_BOOLEAN_TYPE_P (vectype) 5069 ? vect_mask_var 5070 : vect_simple_var 5071 : vect_scalar_var; 5072 type = vectype ? 
vectype : TREE_TYPE (scalar_dest); 5073 5074 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); 5075 5076 name = get_name (scalar_dest); 5077 if (name) 5078 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest)); 5079 else 5080 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest)); 5081 vec_dest = vect_get_new_vect_var (type, kind, new_name); 5082 free (new_name); 5083 5084 return vec_dest; 5085} 5086 5087/* Function vect_grouped_store_supported. 5088 5089 Returns TRUE if interleave high and interleave low permutations 5090 are supported, and FALSE otherwise. */ 5091 5092bool 5093vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count) 5094{ 5095 machine_mode mode = TYPE_MODE (vectype); 5096 5097 /* vect_permute_store_chain requires the group size to be equal to 3 or 5098 be a power of two. */ 5099 if (count != 3 && exact_log2 (count) == -1) 5100 { 5101 if (dump_enabled_p ()) 5102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5103 "the size of the group of accesses" 5104 " is not a power of 2 or not eqaul to 3\n"); 5105 return false; 5106 } 5107 5108 /* Check that the permutation is supported. */ 5109 if (VECTOR_MODE_P (mode)) 5110 { 5111 unsigned int i; 5112 if (count == 3) 5113 { 5114 unsigned int j0 = 0, j1 = 0, j2 = 0; 5115 unsigned int i, j; 5116 5117 unsigned int nelt; 5118 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5119 { 5120 if (dump_enabled_p ()) 5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5122 "cannot handle groups of 3 stores for" 5123 " variable-length vectors\n"); 5124 return false; 5125 } 5126 5127 vec_perm_builder sel (nelt, nelt, 1); 5128 sel.quick_grow (nelt); 5129 vec_perm_indices indices; 5130 for (j = 0; j < 3; j++) 5131 { 5132 int nelt0 = ((3 - j) * nelt) % 3; 5133 int nelt1 = ((3 - j) * nelt + 1) % 3; 5134 int nelt2 = ((3 - j) * nelt + 2) % 3; 5135 for (i = 0; i < nelt; i++) 5136 { 5137 if (3 * i + nelt0 < nelt) 5138 sel[3 * i + nelt0] = j0++; 5139 if (3 * i + nelt1 < nelt) 5140 sel[3 * i + nelt1] = nelt + j1++; 5141 if (3 * i + nelt2 < nelt) 5142 sel[3 * i + nelt2] = 0; 5143 } 5144 indices.new_vector (sel, 2, nelt); 5145 if (!can_vec_perm_const_p (mode, indices)) 5146 { 5147 if (dump_enabled_p ()) 5148 dump_printf (MSG_MISSED_OPTIMIZATION, 5149 "permutation op not supported by target.\n"); 5150 return false; 5151 } 5152 5153 for (i = 0; i < nelt; i++) 5154 { 5155 if (3 * i + nelt0 < nelt) 5156 sel[3 * i + nelt0] = 3 * i + nelt0; 5157 if (3 * i + nelt1 < nelt) 5158 sel[3 * i + nelt1] = 3 * i + nelt1; 5159 if (3 * i + nelt2 < nelt) 5160 sel[3 * i + nelt2] = nelt + j2++; 5161 } 5162 indices.new_vector (sel, 2, nelt); 5163 if (!can_vec_perm_const_p (mode, indices)) 5164 { 5165 if (dump_enabled_p ()) 5166 dump_printf (MSG_MISSED_OPTIMIZATION, 5167 "permutation op not supported by target.\n"); 5168 return false; 5169 } 5170 } 5171 return true; 5172 } 5173 else 5174 { 5175 /* If length is not equal to 3 then only power of 2 is supported. */ 5176 gcc_assert (pow2p_hwi (count)); 5177 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5178 5179 /* The encoding has 2 interleaved stepped patterns. 
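For example, with eight elements per vector the encoded selector below expands to {0, 8, 1, 9, 2, 10, 3, 11}; adding nelt / 2 = 4 to each encoded element yields the second selector, {4, 12, 5, 13, 6, 14, 7, 15}. Both permutations must be supported for the group to be handled.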
*/ 5180 vec_perm_builder sel (nelt, 2, 3); 5181 sel.quick_grow (6); 5182 for (i = 0; i < 3; i++) 5183 { 5184 sel[i * 2] = i; 5185 sel[i * 2 + 1] = i + nelt; 5186 } 5187 vec_perm_indices indices (sel, 2, nelt); 5188 if (can_vec_perm_const_p (mode, indices)) 5189 { 5190 for (i = 0; i < 6; i++) 5191 sel[i] += exact_div (nelt, 2); 5192 indices.new_vector (sel, 2, nelt); 5193 if (can_vec_perm_const_p (mode, indices)) 5194 return true; 5195 } 5196 } 5197 } 5198 5199 if (dump_enabled_p ()) 5200 dump_printf (MSG_MISSED_OPTIMIZATION, 5201 "permutation op not supported by target.\n"); 5202 return false; 5203} 5204 5205 5206/* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of 5207 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5208 5209bool 5210vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5211 bool masked_p) 5212{ 5213 if (masked_p) 5214 return vect_lanes_optab_supported_p ("vec_mask_store_lanes", 5215 vec_mask_store_lanes_optab, 5216 vectype, count); 5217 else 5218 return vect_lanes_optab_supported_p ("vec_store_lanes", 5219 vec_store_lanes_optab, 5220 vectype, count); 5221} 5222 5223 5224/* Function vect_permute_store_chain. 5225 5226 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be 5227 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder 5228 the data correctly for the stores. Return the final references for stores 5229 in RESULT_CHAIN. 5230 5231 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5232 The input is 4 vectors each containing 8 elements. We assign a number to 5233 each element, the input sequence is: 5234 5235 1st vec: 0 1 2 3 4 5 6 7 5236 2nd vec: 8 9 10 11 12 13 14 15 5237 3rd vec: 16 17 18 19 20 21 22 23 5238 4th vec: 24 25 26 27 28 29 30 31 5239 5240 The output sequence should be: 5241 5242 1st vec: 0 8 16 24 1 9 17 25 5243 2nd vec: 2 10 18 26 3 11 19 27 5244 3rd vec: 4 12 20 28 5 13 21 29 5245 4th vec: 6 14 22 30 7 15 23 31 5246 5247 i.e., we interleave the contents of the four vectors in their order. 5248 5249 We use interleave_high/low instructions to create such output. The input of 5250 each interleave_high/low operation is two vectors: 5251 1st vec 2nd vec 5252 0 1 2 3 4 5 6 7 5253 the even elements of the result vector are obtained left-to-right from the 5254 high/low elements of the first vector. The odd elements of the result are 5255 obtained left-to-right from the high/low elements of the second vector. 5256 The output of interleave_high will be: 0 4 1 5 5257 and of interleave_low: 2 6 3 7 5258 5259 5260 The permutation is done in log LENGTH stages. In each stage interleave_high 5261 and interleave_low stmts are created for each pair of vectors in DR_CHAIN, 5262 where the first argument is taken from the first half of DR_CHAIN and the 5263 second argument from its second half. 5264 In our example, 5265 5266 I1: interleave_high (1st vec, 3rd vec) 5267 I2: interleave_low (1st vec, 3rd vec) 5268 I3: interleave_high (2nd vec, 4th vec) 5269 I4: interleave_low (2nd vec, 4th vec) 5270 5271 The output for the first stage is: 5272 5273 I1: 0 16 1 17 2 18 3 19 5274 I2: 4 20 5 21 6 22 7 23 5275 I3: 8 24 9 25 10 26 11 27 5276 I4: 12 28 13 29 14 30 15 31 5277 5278 The output of the second stage, i.e. the final result is: 5279 5280 I1: 0 8 16 24 1 9 17 25 5281 I2: 2 10 18 26 3 11 19 27 5282 I3: 4 12 20 28 5 13 21 29 5283 I4: 6 14 22 30 7 15 23 31.
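As a concrete sketch (an illustrative reading of the code below, assuming 4-element vectors): perm_mask_high is { 0, 4, 1, 5 } and perm_mask_low is { 2, 6, 3, 7 }, so one stage over a pair of vectors is emitted roughly as

     vect_inter_high = VEC_PERM_EXPR <vect1, vect2, { 0, 4, 1, 5 }>;
     vect_inter_low = VEC_PERM_EXPR <vect1, vect2, { 2, 6, 3, 7 }>;

   which is exactly the interleave_high/interleave_low pair described above.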
*/ 5284 5285void 5286vect_permute_store_chain (vec<tree> dr_chain, 5287 unsigned int length, 5288 stmt_vec_info stmt_info, 5289 gimple_stmt_iterator *gsi, 5290 vec<tree> *result_chain) 5291{ 5292 tree vect1, vect2, high, low; 5293 gimple *perm_stmt; 5294 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5295 tree perm_mask_low, perm_mask_high; 5296 tree data_ref; 5297 tree perm3_mask_low, perm3_mask_high; 5298 unsigned int i, j, n, log_length = exact_log2 (length); 5299 5300 result_chain->quick_grow (length); 5301 memcpy (result_chain->address (), dr_chain.address (), 5302 length * sizeof (tree)); 5303 5304 if (length == 3) 5305 { 5306 /* vect_grouped_store_supported ensures that this is constant. */ 5307 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5308 unsigned int j0 = 0, j1 = 0, j2 = 0; 5309 5310 vec_perm_builder sel (nelt, nelt, 1); 5311 sel.quick_grow (nelt); 5312 vec_perm_indices indices; 5313 for (j = 0; j < 3; j++) 5314 { 5315 int nelt0 = ((3 - j) * nelt) % 3; 5316 int nelt1 = ((3 - j) * nelt + 1) % 3; 5317 int nelt2 = ((3 - j) * nelt + 2) % 3; 5318 5319 for (i = 0; i < nelt; i++) 5320 { 5321 if (3 * i + nelt0 < nelt) 5322 sel[3 * i + nelt0] = j0++; 5323 if (3 * i + nelt1 < nelt) 5324 sel[3 * i + nelt1] = nelt + j1++; 5325 if (3 * i + nelt2 < nelt) 5326 sel[3 * i + nelt2] = 0; 5327 } 5328 indices.new_vector (sel, 2, nelt); 5329 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5330 5331 for (i = 0; i < nelt; i++) 5332 { 5333 if (3 * i + nelt0 < nelt) 5334 sel[3 * i + nelt0] = 3 * i + nelt0; 5335 if (3 * i + nelt1 < nelt) 5336 sel[3 * i + nelt1] = 3 * i + nelt1; 5337 if (3 * i + nelt2 < nelt) 5338 sel[3 * i + nelt2] = nelt + j2++; 5339 } 5340 indices.new_vector (sel, 2, nelt); 5341 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5342 5343 vect1 = dr_chain[0]; 5344 vect2 = dr_chain[1]; 5345 5346 /* Create interleaving stmt: 5347 low = VEC_PERM_EXPR <vect1, vect2, 5348 {j, nelt, *, j + 1, nelt + j + 1, *, 5349 j + 2, nelt + j + 2, *, ...}> */ 5350 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5351 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5352 vect2, perm3_mask_low); 5353 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5354 5355 vect1 = data_ref; 5356 vect2 = dr_chain[2]; 5357 /* Create interleaving stmt: 5358 low = VEC_PERM_EXPR <vect1, vect2, 5359 {0, 1, nelt + j, 3, 4, nelt + j + 1, 5360 6, 7, nelt + j + 2, ...}> */ 5361 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5362 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1, 5363 vect2, perm3_mask_high); 5364 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5365 (*result_chain)[j] = data_ref; 5366 } 5367 } 5368 else 5369 { 5370 /* If length is not equal to 3 then only power of 2 is supported. */ 5371 gcc_assert (pow2p_hwi (length)); 5372 5373 /* The encoding has 2 interleaved stepped patterns. 
*/ 5374 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5375 vec_perm_builder sel (nelt, 2, 3); 5376 sel.quick_grow (6); 5377 for (i = 0; i < 3; i++) 5378 { 5379 sel[i * 2] = i; 5380 sel[i * 2 + 1] = i + nelt; 5381 } 5382 vec_perm_indices indices (sel, 2, nelt); 5383 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5384 5385 for (i = 0; i < 6; i++) 5386 sel[i] += exact_div (nelt, 2); 5387 indices.new_vector (sel, 2, nelt); 5388 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5389 5390 for (i = 0, n = log_length; i < n; i++) 5391 { 5392 for (j = 0; j < length/2; j++) 5393 { 5394 vect1 = dr_chain[j]; 5395 vect2 = dr_chain[j+length/2]; 5396 5397 /* Create interleaving stmt: 5398 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, 5399 ...}> */ 5400 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high"); 5401 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1, 5402 vect2, perm_mask_high); 5403 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5404 (*result_chain)[2*j] = high; 5405 5406 /* Create interleaving stmt: 5407 low = VEC_PERM_EXPR <vect1, vect2, 5408 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1, 5409 ...}> */ 5410 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low"); 5411 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1, 5412 vect2, perm_mask_low); 5413 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5414 (*result_chain)[2*j+1] = low; 5415 } 5416 memcpy (dr_chain.address (), result_chain->address (), 5417 length * sizeof (tree)); 5418 } 5419 } 5420} 5421 5422/* Function vect_setup_realignment 5423 5424 This function is called when vectorizing an unaligned load using 5425 the dr_explicit_realign[_optimized] scheme. 5426 This function generates the following code at the loop prolog: 5427 5428 p = initial_addr; 5429 x msq_init = *(floor(p)); # prolog load 5430 realignment_token = call target_builtin; 5431 loop: 5432 x msq = phi (msq_init, ---) 5433 5434 The stmts marked with x are generated only for the case of 5435 dr_explicit_realign_optimized. 5436 5437 The code above sets up a new (vector) pointer, pointing to the first 5438 location accessed by STMT_INFO, and a "floor-aligned" load using that 5439 pointer. It also generates code to compute the "realignment-token" 5440 (if the relevant target hook was defined), and creates a phi-node at the 5441 loop-header bb whose arguments are the result of the prolog-load (created 5442 by this function) and the result of a load that takes place in the loop 5443 (to be created by the caller to this function). 5444 5445 For the case of dr_explicit_realign_optimized: 5446 The caller to this function uses the phi-result (msq) to create the 5447 realignment code inside the loop, and sets up the missing phi argument, 5448 as follows: 5449 loop: 5450 msq = phi (msq_init, lsq) 5451 lsq = *(floor(p')); # load in loop 5452 result = realign_load (msq, lsq, realignment_token); 5453 5454 For the case of dr_explicit_realign: 5455 loop: 5456 msq = *(floor(p)); # load in loop 5457 p' = p + (VS-1); 5458 lsq = *(floor(p')); # load in loop 5459 result = realign_load (msq, lsq, realignment_token); 5460 5461 Input: 5462 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses 5463 a memory location that may be unaligned. 5464 BSI - place where new code is to be inserted. 5465 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes 5466 is used. 
5467 5468 Output: 5469 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load 5470 target hook, if defined. 5471 Return value - the result of the loop-header phi node. */ 5472 5473tree 5474vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, 5475 tree *realignment_token, 5476 enum dr_alignment_support alignment_support_scheme, 5477 tree init_addr, 5478 class loop **at_loop) 5479{ 5480 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5481 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 5482 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info); 5483 struct data_reference *dr = dr_info->dr; 5484 class loop *loop = NULL; 5485 edge pe = NULL; 5486 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt); 5487 tree vec_dest; 5488 gimple *inc; 5489 tree ptr; 5490 tree data_ref; 5491 basic_block new_bb; 5492 tree msq_init = NULL_TREE; 5493 tree new_temp; 5494 gphi *phi_stmt; 5495 tree msq = NULL_TREE; 5496 gimple_seq stmts = NULL; 5497 bool compute_in_loop = false; 5498 bool nested_in_vect_loop = false; 5499 class loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father; 5500 class loop *loop_for_initial_load = NULL; 5501 5502 if (loop_vinfo) 5503 { 5504 loop = LOOP_VINFO_LOOP (loop_vinfo); 5505 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info); 5506 } 5507 5508 gcc_assert (alignment_support_scheme == dr_explicit_realign 5509 || alignment_support_scheme == dr_explicit_realign_optimized); 5510 5511 /* We need to generate three things: 5512 1. the misalignment computation 5513 2. the extra vector load (for the optimized realignment scheme). 5514 3. the phi node for the two vectors from which the realignment is 5515 done (for the optimized realignment scheme). */ 5516 5517 /* 1. Determine where to generate the misalignment computation. 5518 5519 If INIT_ADDR is NULL_TREE, this indicates that the misalignment 5520 calculation will be generated by this function, outside the loop (in the 5521 preheader). Otherwise, INIT_ADDR had already been computed for us by the 5522 caller, inside the loop. 5523 5524 Background: If the misalignment remains fixed throughout the iterations of 5525 the loop, then both realignment schemes are applicable, and also the 5526 misalignment computation can be done outside LOOP. This is because we are 5527 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that 5528 are a multiple of VS (the Vector Size), and therefore the misalignment in 5529 different vectorized LOOP iterations is always the same. 5530 The problem arises only if the memory access is in an inner-loop nested 5531 inside LOOP, which is now being vectorized using outer-loop vectorization. 5532 This is the only case when the misalignment of the memory access may not 5533 remain fixed throughout the iterations of the inner-loop (as explained in 5534 detail in vect_supportable_dr_alignment). In this case, not only is the 5535 optimized realignment scheme not applicable, but also the misalignment 5536 computation (and generation of the realignment token that is passed to 5537 REALIGN_LOAD) have to be done inside the loop. 5538 5539 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode 5540 or not, which in turn determines if the misalignment is computed inside 5541 the inner-loop, or outside LOOP. */ 5542 5543 if (init_addr != NULL_TREE || !loop_vinfo) 5544 { 5545 compute_in_loop = true; 5546 gcc_assert (alignment_support_scheme == dr_explicit_realign); 5547 } 5548 5549 5550 /* 2. 
Determine where to generate the extra vector load. 5551 5552 For the optimized realignment scheme, instead of generating two vector 5553 loads in each iteration, we generate a single extra vector load in the 5554 preheader of the loop, and in each iteration reuse the result of the 5555 vector load from the previous iteration. In case the memory access is in 5556 an inner-loop nested inside LOOP, which is now being vectorized using 5557 outer-loop vectorization, we need to determine whether this initial vector 5558 load should be generated at the preheader of the inner-loop, or can be 5559 generated at the preheader of LOOP. If the memory access has no evolution 5560 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has 5561 to be generated inside LOOP (in the preheader of the inner-loop). */ 5562 5563 if (nested_in_vect_loop) 5564 { 5565 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); 5566 bool invariant_in_outerloop = 5567 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); 5568 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); 5569 } 5570 else 5571 loop_for_initial_load = loop; 5572 if (at_loop) 5573 *at_loop = loop_for_initial_load; 5574 5575 if (loop_for_initial_load) 5576 pe = loop_preheader_edge (loop_for_initial_load); 5577 5578 /* 3. For the case of the optimized realignment, create the first vector 5579 load at the loop preheader. */ 5580 5581 if (alignment_support_scheme == dr_explicit_realign_optimized) 5582 { 5583 /* Create msq_init = *(floor(p1)) in the loop preheader */ 5584 gassign *new_stmt; 5585 5586 gcc_assert (!compute_in_loop); 5587 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5588 ptr = vect_create_data_ref_ptr (stmt_info, vectype, 5589 loop_for_initial_load, NULL_TREE, 5590 &init_addr, NULL, &inc, true); 5591 if (TREE_CODE (ptr) == SSA_NAME) 5592 new_temp = copy_ssa_name (ptr); 5593 else 5594 new_temp = make_ssa_name (TREE_TYPE (ptr)); 5595 poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info); 5596 tree type = TREE_TYPE (ptr); 5597 new_stmt = gimple_build_assign 5598 (new_temp, BIT_AND_EXPR, ptr, 5599 fold_build2 (MINUS_EXPR, type, 5600 build_int_cst (type, 0), 5601 build_int_cst (type, align))); 5602 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5603 gcc_assert (!new_bb); 5604 data_ref 5605 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp, 5606 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0)); 5607 vect_copy_ref_info (data_ref, DR_REF (dr)); 5608 new_stmt = gimple_build_assign (vec_dest, data_ref); 5609 new_temp = make_ssa_name (vec_dest, new_stmt); 5610 gimple_assign_set_lhs (new_stmt, new_temp); 5611 if (pe) 5612 { 5613 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5614 gcc_assert (!new_bb); 5615 } 5616 else 5617 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5618 5619 msq_init = gimple_assign_lhs (new_stmt); 5620 } 5621 5622 /* 4. Create realignment token using a target builtin, if available. 5623 It is done either inside the containing loop, or before LOOP (as 5624 determined above). */ 5625 5626 if (targetm.vectorize.builtin_mask_for_load) 5627 { 5628 gcall *new_stmt; 5629 tree builtin_decl; 5630 5631 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */ 5632 if (!init_addr) 5633 { 5634 /* Generate the INIT_ADDR computation outside LOOP. 
*/ 5635 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts, 5636 NULL_TREE); 5637 if (loop) 5638 { 5639 pe = loop_preheader_edge (loop); 5640 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); 5641 gcc_assert (!new_bb); 5642 } 5643 else 5644 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT); 5645 } 5646 5647 builtin_decl = targetm.vectorize.builtin_mask_for_load (); 5648 new_stmt = gimple_build_call (builtin_decl, 1, init_addr); 5649 vec_dest = 5650 vect_create_destination_var (scalar_dest, 5651 gimple_call_return_type (new_stmt)); 5652 new_temp = make_ssa_name (vec_dest, new_stmt); 5653 gimple_call_set_lhs (new_stmt, new_temp); 5654 5655 if (compute_in_loop) 5656 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT); 5657 else 5658 { 5659 /* Generate the misalignment computation outside LOOP. */ 5660 pe = loop_preheader_edge (loop); 5661 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt); 5662 gcc_assert (!new_bb); 5663 } 5664 5665 *realignment_token = gimple_call_lhs (new_stmt); 5666 5667 /* The result of the CALL_EXPR to this builtin is determined from 5668 the value of the parameter and no global variables are touched 5669 which makes the builtin a "const" function. Requiring the 5670 builtin to have the "const" attribute makes it unnecessary 5671 to call mark_call_clobbered. */ 5672 gcc_assert (TREE_READONLY (builtin_decl)); 5673 } 5674 5675 if (alignment_support_scheme == dr_explicit_realign) 5676 return msq; 5677 5678 gcc_assert (!compute_in_loop); 5679 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); 5680 5681 5682 /* 5. Create msq = phi <msq_init, lsq> in loop */ 5683 5684 pe = loop_preheader_edge (containing_loop); 5685 vec_dest = vect_create_destination_var (scalar_dest, vectype); 5686 msq = make_ssa_name (vec_dest); 5687 phi_stmt = create_phi_node (msq, containing_loop->header); 5688 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION); 5689 5690 return msq; 5691} 5692 5693 5694/* Function vect_grouped_load_supported. 5695 5696 COUNT is the size of the load group (the number of statements plus the 5697 number of gaps). SINGLE_ELEMENT_P is true if there is actually 5698 only one statement, with a gap of COUNT - 1. 5699 5700 Returns true if a suitable permute exists. */ 5701 5702bool 5703vect_grouped_load_supported (tree vectype, bool single_element_p, 5704 unsigned HOST_WIDE_INT count) 5705{ 5706 machine_mode mode = TYPE_MODE (vectype); 5707 5708 /* If this is single-element interleaving with an element distance 5709 that leaves unused vector loads around punt - we at least create 5710 very sub-optimal code in that case (and blow up memory, 5711 see PR65518). */ 5712 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype))) 5713 { 5714 if (dump_enabled_p ()) 5715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5716 "single-element interleaving not supported " 5717 "for not adjacent vector loads\n"); 5718 return false; 5719 } 5720 5721 /* vect_permute_load_chain requires the group size to be equal to 3 or 5722 be a power of two. */ 5723 if (count != 3 && exact_log2 (count) == -1) 5724 { 5725 if (dump_enabled_p ()) 5726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5727 "the size of the group of accesses" 5728 " is not a power of 2 or not equal to 3\n"); 5729 return false; 5730 } 5731 5732 /* Check that the permutation is supported. 
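For illustration only (a reading of the code below, assuming 4-element vectors): for k == 0 the two masks tried are { 0, 3, 6, 0 } and then { 0, 1, 2, 5 }; applied back to back to the three input vectors they gather every third element starting at element 0, which is the shape of permutation the group-of-3 load scheme needs.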
*/ 5733 if (VECTOR_MODE_P (mode)) 5734 { 5735 unsigned int i, j; 5736 if (count == 3) 5737 { 5738 unsigned int nelt; 5739 if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) 5740 { 5741 if (dump_enabled_p ()) 5742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5743 "cannot handle groups of 3 loads for" 5744 " variable-length vectors\n"); 5745 return false; 5746 } 5747 5748 vec_perm_builder sel (nelt, nelt, 1); 5749 sel.quick_grow (nelt); 5750 vec_perm_indices indices; 5751 unsigned int k; 5752 for (k = 0; k < 3; k++) 5753 { 5754 for (i = 0; i < nelt; i++) 5755 if (3 * i + k < 2 * nelt) 5756 sel[i] = 3 * i + k; 5757 else 5758 sel[i] = 0; 5759 indices.new_vector (sel, 2, nelt); 5760 if (!can_vec_perm_const_p (mode, indices)) 5761 { 5762 if (dump_enabled_p ()) 5763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5764 "shuffle of 3 loads is not supported by" 5765 " target\n"); 5766 return false; 5767 } 5768 for (i = 0, j = 0; i < nelt; i++) 5769 if (3 * i + k < 2 * nelt) 5770 sel[i] = i; 5771 else 5772 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5773 indices.new_vector (sel, 2, nelt); 5774 if (!can_vec_perm_const_p (mode, indices)) 5775 { 5776 if (dump_enabled_p ()) 5777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5778 "shuffle of 3 loads is not supported by" 5779 " target\n"); 5780 return false; 5781 } 5782 } 5783 return true; 5784 } 5785 else 5786 { 5787 /* If length is not equal to 3 then only power of 2 is supported. */ 5788 gcc_assert (pow2p_hwi (count)); 5789 poly_uint64 nelt = GET_MODE_NUNITS (mode); 5790 5791 /* The encoding has a single stepped pattern. */ 5792 vec_perm_builder sel (nelt, 1, 3); 5793 sel.quick_grow (3); 5794 for (i = 0; i < 3; i++) 5795 sel[i] = i * 2; 5796 vec_perm_indices indices (sel, 2, nelt); 5797 if (can_vec_perm_const_p (mode, indices)) 5798 { 5799 for (i = 0; i < 3; i++) 5800 sel[i] = i * 2 + 1; 5801 indices.new_vector (sel, 2, nelt); 5802 if (can_vec_perm_const_p (mode, indices)) 5803 return true; 5804 } 5805 } 5806 } 5807 5808 if (dump_enabled_p ()) 5809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 5810 "extract even/odd not supported by target\n"); 5811 return false; 5812} 5813 5814/* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of 5815 type VECTYPE. MASKED_P says whether the masked form is needed. */ 5816 5817bool 5818vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count, 5819 bool masked_p) 5820{ 5821 if (masked_p) 5822 return vect_lanes_optab_supported_p ("vec_mask_load_lanes", 5823 vec_mask_load_lanes_optab, 5824 vectype, count); 5825 else 5826 return vect_lanes_optab_supported_p ("vec_load_lanes", 5827 vec_load_lanes_optab, 5828 vectype, count); 5829} 5830 5831/* Function vect_permute_load_chain. 5832 5833 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be 5834 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder 5835 the input data correctly. Return the final references for loads in 5836 RESULT_CHAIN. 5837 5838 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. 5839 The input is 4 vectors each containing 8 elements. 
We assign a number to each 5840 element, the input sequence is: 5841 5842 1st vec: 0 1 2 3 4 5 6 7 5843 2nd vec: 8 9 10 11 12 13 14 15 5844 3rd vec: 16 17 18 19 20 21 22 23 5845 4th vec: 24 25 26 27 28 29 30 31 5846 5847 The output sequence should be: 5848 5849 1st vec: 0 4 8 12 16 20 24 28 5850 2nd vec: 1 5 9 13 17 21 25 29 5851 3rd vec: 2 6 10 14 18 22 26 30 5852 4th vec: 3 7 11 15 19 23 27 31 5853 5854 i.e., the first output vector should contain the first elements of each 5855 interleaving group, etc. 5856 5857 We use extract_even/odd instructions to create such output. The input of 5858 each extract_even/odd operation is two vectors 5859 1st vec 2nd vec 5860 0 1 2 3 4 5 6 7 5861 5862 and the output is the vector of extracted even/odd elements. The output of 5863 extract_even will be: 0 2 4 6 5864 and of extract_odd: 1 3 5 7 5865 5866 5867 The permutation is done in log LENGTH stages. In each stage extract_even 5868 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in 5869 their order. In our example, 5870 5871 E1: extract_even (1st vec, 2nd vec) 5872 E2: extract_odd (1st vec, 2nd vec) 5873 E3: extract_even (3rd vec, 4th vec) 5874 E4: extract_odd (3rd vec, 4th vec) 5875 5876 The output for the first stage will be: 5877 5878 E1: 0 2 4 6 8 10 12 14 5879 E2: 1 3 5 7 9 11 13 15 5880 E3: 16 18 20 22 24 26 28 30 5881 E4: 17 19 21 23 25 27 29 31 5882 5883 In order to proceed and create the correct sequence for the next stage (or 5884 for the correct output, if the second stage is the last one, as in our 5885 example), we first put the output of extract_even operation and then the 5886 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). 5887 The input for the second stage is: 5888 5889 1st vec (E1): 0 2 4 6 8 10 12 14 5890 2nd vec (E3): 16 18 20 22 24 26 28 30 5891 3rd vec (E2): 1 3 5 7 9 11 13 15 5892 4th vec (E4): 17 19 21 23 25 27 29 31 5893 5894 The output of the second stage: 5895 5896 E1: 0 4 8 12 16 20 24 28 5897 E2: 2 6 10 14 18 22 26 30 5898 E3: 1 5 9 13 17 21 25 29 5899 E4: 3 7 11 15 19 23 27 31 5900 5901 And RESULT_CHAIN after reordering: 5902 5903 1st vec (E1): 0 4 8 12 16 20 24 28 5904 2nd vec (E3): 1 5 9 13 17 21 25 29 5905 3rd vec (E2): 2 6 10 14 18 22 26 30 5906 4th vec (E4): 3 7 11 15 19 23 27 31. */ 5907 5908static void 5909vect_permute_load_chain (vec<tree> dr_chain, 5910 unsigned int length, 5911 stmt_vec_info stmt_info, 5912 gimple_stmt_iterator *gsi, 5913 vec<tree> *result_chain) 5914{ 5915 tree data_ref, first_vect, second_vect; 5916 tree perm_mask_even, perm_mask_odd; 5917 tree perm3_mask_low, perm3_mask_high; 5918 gimple *perm_stmt; 5919 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 5920 unsigned int i, j, log_length = exact_log2 (length); 5921 5922 result_chain->quick_grow (length); 5923 memcpy (result_chain->address (), dr_chain.address (), 5924 length * sizeof (tree)); 5925 5926 if (length == 3) 5927 { 5928 /* vect_grouped_load_supported ensures that this is constant. 
*/ 5929 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant (); 5930 unsigned int k; 5931 5932 vec_perm_builder sel (nelt, nelt, 1); 5933 sel.quick_grow (nelt); 5934 vec_perm_indices indices; 5935 for (k = 0; k < 3; k++) 5936 { 5937 for (i = 0; i < nelt; i++) 5938 if (3 * i + k < 2 * nelt) 5939 sel[i] = 3 * i + k; 5940 else 5941 sel[i] = 0; 5942 indices.new_vector (sel, 2, nelt); 5943 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices); 5944 5945 for (i = 0, j = 0; i < nelt; i++) 5946 if (3 * i + k < 2 * nelt) 5947 sel[i] = i; 5948 else 5949 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++); 5950 indices.new_vector (sel, 2, nelt); 5951 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices); 5952 5953 first_vect = dr_chain[0]; 5954 second_vect = dr_chain[1]; 5955 5956 /* Create interleaving stmt (low part of): 5957 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5958 ...}> */ 5959 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low"); 5960 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5961 second_vect, perm3_mask_low); 5962 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5963 5964 /* Create interleaving stmt (high part of): 5965 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k, 5966 ...}> */ 5967 first_vect = data_ref; 5968 second_vect = dr_chain[2]; 5969 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high"); 5970 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect, 5971 second_vect, perm3_mask_high); 5972 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 5973 (*result_chain)[k] = data_ref; 5974 } 5975 } 5976 else 5977 { 5978 /* If length is not equal to 3 then only power of 2 is supported. */ 5979 gcc_assert (pow2p_hwi (length)); 5980 5981 /* The encoding has a single stepped pattern. */ 5982 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype); 5983 vec_perm_builder sel (nelt, 1, 3); 5984 sel.quick_grow (3); 5985 for (i = 0; i < 3; ++i) 5986 sel[i] = i * 2; 5987 vec_perm_indices indices (sel, 2, nelt); 5988 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices); 5989 5990 for (i = 0; i < 3; ++i) 5991 sel[i] = i * 2 + 1; 5992 indices.new_vector (sel, 2, nelt); 5993 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices); 5994 5995 for (i = 0; i < log_length; i++) 5996 { 5997 for (j = 0; j < length; j += 2) 5998 { 5999 first_vect = dr_chain[j]; 6000 second_vect = dr_chain[j+1]; 6001 6002 /* data_ref = permute_even (first_data_ref, second_data_ref); */ 6003 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even"); 6004 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6005 first_vect, second_vect, 6006 perm_mask_even); 6007 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6008 (*result_chain)[j/2] = data_ref; 6009 6010 /* data_ref = permute_odd (first_data_ref, second_data_ref); */ 6011 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd"); 6012 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6013 first_vect, second_vect, 6014 perm_mask_odd); 6015 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6016 (*result_chain)[j/2+length/2] = data_ref; 6017 } 6018 memcpy (dr_chain.address (), result_chain->address (), 6019 length * sizeof (tree)); 6020 } 6021 } 6022} 6023 6024/* Function vect_shift_permute_load_chain. 6025 6026 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate 6027 sequence of stmts to reorder the input data accordingly. 
6028 Return the final references for loads in RESULT_CHAIN. 6029 Return true if successful, false otherwise. 6030 6031 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8. 6032 The input is 3 vectors each containing 8 elements. We assign a 6033 number to each element, the input sequence is: 6034 6035 1st vec: 0 1 2 3 4 5 6 7 6036 2nd vec: 8 9 10 11 12 13 14 15 6037 3rd vec: 16 17 18 19 20 21 22 23 6038 6039 The output sequence should be: 6040 6041 1st vec: 0 3 6 9 12 15 18 21 6042 2nd vec: 1 4 7 10 13 16 19 22 6043 3rd vec: 2 5 8 11 14 17 20 23 6044 6045 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output. 6046 6047 First we shuffle all 3 vectors to get the correct element order: 6048 6049 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5) 6050 2nd vec: ( 8 11 14) ( 9 12 15) (10 13) 6051 3rd vec: (16 19 22) (17 20 23) (18 21) 6052 6053 Next we unite and shift the vectors 3 times: 6054 6055 1st step: 6056 shift right by 6 the concatenation of: 6057 "1st vec" and "2nd vec" 6058 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13) 6059 "2nd vec" and "3rd vec" 6060 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21) 6061 "3rd vec" and "1st vec" 6062 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5) 6063 | New vectors | 6064 6065 So that now new vectors are: 6066 6067 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15) 6068 2nd vec: (10 13) (16 19 22) (17 20 23) 6069 3rd vec: (18 21) ( 0 3 6) ( 1 4 7) 6070 6071 2nd step: 6072 shift right by 5 the concatenation of: 6073 "1st vec" and "3rd vec" 6074 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7) 6075 "2nd vec" and "1st vec" 6076 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15) 6077 "3rd vec" and "2nd vec" 6078 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23) 6079 | New vectors | 6080 6081 So that now new vectors are: 6082 6083 1st vec: ( 9 12 15) (18 21) ( 0 3 6) 6084 2nd vec: (17 20 23) ( 2 5) ( 8 11 14) 6085 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY 6086 6087 3rd step: 6088 shift right by 5 the concatenation of: 6089 "1st vec" and "1st vec" 6090 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6) 6091 shift right by 3 the concatenation of: 6092 "2nd vec" and "2nd vec" 6093 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14) 6094 | New vectors | 6095 6096 So that now all vectors are READY: 6097 1st vec: ( 0 3 6) ( 9 12 15) (18 21) 6098 2nd vec: ( 2 5) ( 8 11 14) (17 20 23) 6099 3rd vec: ( 1 4 7) (10 13) (16 19 22) 6100 6101 This algorithm is faster than the one in vect_permute_load_chain if: 6102 1. "shift of a concatenation" is faster than general permutation. 6103 This is usually so. 6104 2. The TARGET machine can't execute vector instructions in parallel. 6105 This is because each step of the algorithm depends on the previous one. 6106 The algorithm in vect_permute_load_chain is much more parallel. 6107 6108 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
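For the power-of-two path handled below, a similar sketch (an illustrative reading of the code that follows, assuming 8-element vectors and a chain of length 2): the two shuffle masks are { 0 2 4 6 1 3 5 7 } and { 1 3 5 7 0 2 4 6 }, the shift mask is { 4 5 6 7 8 9 10 11 } and the select mask is { 0 1 2 3 12 13 14 15 }; combined, they leave the even-numbered elements of the pair in the first output vector and the odd-numbered elements in the second, i.e. the same result the extract_even/extract_odd scheme of vect_permute_load_chain produces.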
6109*/ 6110 6111static bool 6112vect_shift_permute_load_chain (vec<tree> dr_chain, 6113 unsigned int length, 6114 stmt_vec_info stmt_info, 6115 gimple_stmt_iterator *gsi, 6116 vec<tree> *result_chain) 6117{ 6118 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect; 6119 tree perm2_mask1, perm2_mask2, perm3_mask; 6120 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask; 6121 gimple *perm_stmt; 6122 6123 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6124 unsigned int i; 6125 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6126 6127 unsigned HOST_WIDE_INT nelt, vf; 6128 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt) 6129 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf)) 6130 /* Not supported for variable-length vectors. */ 6131 return false; 6132 6133 vec_perm_builder sel (nelt, nelt, 1); 6134 sel.quick_grow (nelt); 6135 6136 result_chain->quick_grow (length); 6137 memcpy (result_chain->address (), dr_chain.address (), 6138 length * sizeof (tree)); 6139 6140 if (pow2p_hwi (length) && vf > 4) 6141 { 6142 unsigned int j, log_length = exact_log2 (length); 6143 for (i = 0; i < nelt / 2; ++i) 6144 sel[i] = i * 2; 6145 for (i = 0; i < nelt / 2; ++i) 6146 sel[nelt / 2 + i] = i * 2 + 1; 6147 vec_perm_indices indices (sel, 2, nelt); 6148 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6149 { 6150 if (dump_enabled_p ()) 6151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6152 "shuffle of 2 fields structure is not \ 6153 supported by target\n"); 6154 return false; 6155 } 6156 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices); 6157 6158 for (i = 0; i < nelt / 2; ++i) 6159 sel[i] = i * 2 + 1; 6160 for (i = 0; i < nelt / 2; ++i) 6161 sel[nelt / 2 + i] = i * 2; 6162 indices.new_vector (sel, 2, nelt); 6163 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6164 { 6165 if (dump_enabled_p ()) 6166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6167 "shuffle of 2 fields structure is not \ 6168 supported by target\n"); 6169 return false; 6170 } 6171 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices); 6172 6173 /* Generating permutation constant to shift all elements. 6174 For vector length 8 it is {4 5 6 7 8 9 10 11}. */ 6175 for (i = 0; i < nelt; i++) 6176 sel[i] = nelt / 2 + i; 6177 indices.new_vector (sel, 2, nelt); 6178 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6179 { 6180 if (dump_enabled_p ()) 6181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6182 "shift permutation is not supported by target\n"); 6183 return false; 6184 } 6185 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6186 6187 /* Generating permutation constant to select vector from 2. 6188 For vector length 8 it is {0 1 2 3 12 13 14 15}. 
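That is, take the low half of the first operand and the high half of the second.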
*/ 6189 for (i = 0; i < nelt / 2; i++) 6190 sel[i] = i; 6191 for (i = nelt / 2; i < nelt; i++) 6192 sel[i] = nelt + i; 6193 indices.new_vector (sel, 2, nelt); 6194 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6195 { 6196 if (dump_enabled_p ()) 6197 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6198 "select is not supported by target\n"); 6199 return false; 6200 } 6201 select_mask = vect_gen_perm_mask_checked (vectype, indices); 6202 6203 for (i = 0; i < log_length; i++) 6204 { 6205 for (j = 0; j < length; j += 2) 6206 { 6207 first_vect = dr_chain[j]; 6208 second_vect = dr_chain[j + 1]; 6209 6210 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 6211 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6212 first_vect, first_vect, 6213 perm2_mask1); 6214 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6215 vect[0] = data_ref; 6216 6217 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); 6218 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6219 second_vect, second_vect, 6220 perm2_mask2); 6221 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6222 vect[1] = data_ref; 6223 6224 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); 6225 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6226 vect[0], vect[1], shift1_mask); 6227 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6228 (*result_chain)[j/2 + length/2] = data_ref; 6229 6230 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); 6231 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6232 vect[0], vect[1], select_mask); 6233 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6234 (*result_chain)[j/2] = data_ref; 6235 } 6236 memcpy (dr_chain.address (), result_chain->address (), 6237 length * sizeof (tree)); 6238 } 6239 return true; 6240 } 6241 if (length == 3 && vf > 2) 6242 { 6243 unsigned int k = 0, l = 0; 6244 6245 /* Generating permutation constant to get all elements in right order. 6246 For vector length 8 it is {0 3 6 1 4 7 2 5}. */ 6247 for (i = 0; i < nelt; i++) 6248 { 6249 if (3 * k + (l % 3) >= nelt) 6250 { 6251 k = 0; 6252 l += (3 - (nelt % 3)); 6253 } 6254 sel[i] = 3 * k + (l % 3); 6255 k++; 6256 } 6257 vec_perm_indices indices (sel, 2, nelt); 6258 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6259 { 6260 if (dump_enabled_p ()) 6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6262 "shuffle of 3 fields structure is not \ 6263 supported by target\n"); 6264 return false; 6265 } 6266 perm3_mask = vect_gen_perm_mask_checked (vectype, indices); 6267 6268 /* Generating permutation constant to shift all elements. 6269 For vector length 8 it is {6 7 8 9 10 11 12 13}. */ 6270 for (i = 0; i < nelt; i++) 6271 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i; 6272 indices.new_vector (sel, 2, nelt); 6273 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6274 { 6275 if (dump_enabled_p ()) 6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6277 "shift permutation is not supported by target\n"); 6278 return false; 6279 } 6280 shift1_mask = vect_gen_perm_mask_checked (vectype, indices); 6281 6282 /* Generating permutation constant to shift all elements. 6283 For vector length 8 it is {5 6 7 8 9 10 11 12}.
*/ 6284 for (i = 0; i < nelt; i++) 6285 sel[i] = 2 * (nelt / 3) + 1 + i; 6286 indices.new_vector (sel, 2, nelt); 6287 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6288 { 6289 if (dump_enabled_p ()) 6290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6291 "shift permutation is not supported by target\n"); 6292 return false; 6293 } 6294 shift2_mask = vect_gen_perm_mask_checked (vectype, indices); 6295 6296 /* Generating permutation constant to shift all elements. 6297 For vector length 8 it is {3 4 5 6 7 8 9 10}. */ 6298 for (i = 0; i < nelt; i++) 6299 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i; 6300 indices.new_vector (sel, 2, nelt); 6301 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6302 { 6303 if (dump_enabled_p ()) 6304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6305 "shift permutation is not supported by target\n"); 6306 return false; 6307 } 6308 shift3_mask = vect_gen_perm_mask_checked (vectype, indices); 6309 6310 /* Generating permutation constant to shift all elements. 6311 For vector length 8 it is {5 6 7 8 9 10 11 12}. */ 6312 for (i = 0; i < nelt; i++) 6313 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i; 6314 indices.new_vector (sel, 2, nelt); 6315 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices)) 6316 { 6317 if (dump_enabled_p ()) 6318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 6319 "shift permutation is not supported by target\n"); 6320 return false; 6321 } 6322 shift4_mask = vect_gen_perm_mask_checked (vectype, indices); 6323 6324 for (k = 0; k < 3; k++) 6325 { 6326 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3"); 6327 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6328 dr_chain[k], dr_chain[k], 6329 perm3_mask); 6330 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6331 vect[k] = data_ref; 6332 } 6333 6334 for (k = 0; k < 3; k++) 6335 { 6336 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1"); 6337 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6338 vect[k % 3], vect[(k + 1) % 3], 6339 shift1_mask); 6340 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6341 vect_shift[k] = data_ref; 6342 } 6343 6344 for (k = 0; k < 3; k++) 6345 { 6346 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2"); 6347 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, 6348 vect_shift[(4 - k) % 3], 6349 vect_shift[(3 - k) % 3], 6350 shift2_mask); 6351 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6352 vect[k] = data_ref; 6353 } 6354 6355 (*result_chain)[3 - (nelt % 3)] = vect[2]; 6356 6357 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3"); 6358 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0], 6359 vect[0], shift3_mask); 6360 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6361 (*result_chain)[nelt % 3] = data_ref; 6362 6363 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4"); 6364 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1], 6365 vect[1], shift4_mask); 6366 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi); 6367 (*result_chain)[0] = data_ref; 6368 return true; 6369 } 6370 return false; 6371} 6372 6373/* Function vect_transform_grouped_load. 6374 6375 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements 6376 to perform their permutation and ascribe the result vectorized statements to 6377 the scalar statements. 
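Which permutation routine is used is decided in the body below (this note just paraphrases that code): when the target's reassociation width for VEC_PERM_EXPR is greater than 1, or SIZE is a power of two, the extract_even/odd chain (vect_permute_load_chain) is used; otherwise vect_shift_permute_load_chain is tried first, with the extract_even/odd chain as the fallback.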
6378*/ 6379 6380 void 6381 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain, 6382 int size, gimple_stmt_iterator *gsi) 6383 { 6384 machine_mode mode; 6385 vec<tree> result_chain = vNULL; 6386 6387 /* DR_CHAIN contains input data-refs that are a part of the interleaving. 6388 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted 6389 vectors that are ready for vector computation. */ 6390 result_chain.create (size); 6391 6392 /* If the reassociation width for the vector type is 2 or greater, the target 6393 machine can execute 2 or more vector instructions in parallel. Otherwise 6394 try to get the chain for the load group using vect_shift_permute_load_chain. */ 6395 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info)); 6396 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1 6397 || pow2p_hwi (size) 6398 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info, 6399 gsi, &result_chain)) 6400 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain); 6401 vect_record_grouped_load_vectors (stmt_info, result_chain); 6402 result_chain.release (); 6403} 6404 6405/* RESULT_CHAIN contains the output of a group of grouped loads that were 6406 generated as part of the vectorization of STMT_INFO. Assign the statement 6407 for each vector to the associated scalar statement. */ 6408 6409void 6410vect_record_grouped_load_vectors (stmt_vec_info stmt_info, 6411 vec<tree> result_chain) 6412{ 6413 vec_info *vinfo = stmt_info->vinfo; 6414 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); 6415 unsigned int i, gap_count; 6416 tree tmp_data_ref; 6417 6418 /* Put a permuted data-ref in the VECTORIZED_STMT field. 6419 Since we scan the chain starting from its first node, their order 6420 corresponds to the order of data-refs in RESULT_CHAIN. */ 6421 stmt_vec_info next_stmt_info = first_stmt_info; 6422 gap_count = 1; 6423 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref) 6424 { 6425 if (!next_stmt_info) 6426 break; 6427 6428 /* Skip the gaps. Loads created for the gaps will be removed by dead 6429 code elimination pass later. No need to check for the first stmt in 6430 the group, since it always exists. 6431 DR_GROUP_GAP is the number of steps in elements from the previous 6432 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that 6433 correspond to the gaps. */ 6434 if (next_stmt_info != first_stmt_info 6435 && gap_count < DR_GROUP_GAP (next_stmt_info)) 6436 { 6437 gap_count++; 6438 continue; 6439 } 6440 6441 /* ??? The following needs cleanup after the removal of 6442 DR_GROUP_SAME_DR_STMT. */ 6443 if (next_stmt_info) 6444 { 6445 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref); 6446 /* We assume that if VEC_STMT is not NULL, this is a case of multiple 6447 copies, and we put the new vector statement in the first available 6448 RELATED_STMT. */ 6449 if (!STMT_VINFO_VEC_STMT (next_stmt_info)) 6450 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info; 6451 else 6452 { 6453 stmt_vec_info prev_stmt_info 6454 = STMT_VINFO_VEC_STMT (next_stmt_info); 6455 stmt_vec_info rel_stmt_info 6456 = STMT_VINFO_RELATED_STMT (prev_stmt_info); 6457 while (rel_stmt_info) 6458 { 6459 prev_stmt_info = rel_stmt_info; 6460 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info); 6461 } 6462 6463 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; 6464 } 6465 6466 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info); 6467 gap_count = 1; 6468 } 6469 } 6470} 6471 6472/* Function vect_can_force_dr_alignment_p.
6473 6474 Returns whether the alignment of a DECL can be forced to be aligned 6475 on ALIGNMENT bit boundary. */ 6476 6477bool 6478vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment) 6479{ 6480 if (!VAR_P (decl)) 6481 return false; 6482 6483 if (decl_in_symtab_p (decl) 6484 && !symtab_node::get (decl)->can_increase_alignment_p ()) 6485 return false; 6486 6487 if (TREE_STATIC (decl)) 6488 return (known_le (alignment, 6489 (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT)); 6490 else 6491 return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT)); 6492} 6493 6494 6495/* Return whether the data reference DR_INFO is supported with respect to its 6496 alignment. 6497 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even 6498 it is aligned, i.e., check if it is possible to vectorize it with different 6499 alignment. */ 6500 6501enum dr_alignment_support 6502vect_supportable_dr_alignment (dr_vec_info *dr_info, 6503 bool check_aligned_accesses) 6504{ 6505 data_reference *dr = dr_info->dr; 6506 stmt_vec_info stmt_info = dr_info->stmt; 6507 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6508 machine_mode mode = TYPE_MODE (vectype); 6509 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); 6510 class loop *vect_loop = NULL; 6511 bool nested_in_vect_loop = false; 6512 6513 if (aligned_access_p (dr_info) && !check_aligned_accesses) 6514 return dr_aligned; 6515 6516 /* For now assume all conditional loads/stores support unaligned 6517 access without any special code. */ 6518 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) 6519 if (gimple_call_internal_p (stmt) 6520 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD 6521 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)) 6522 return dr_unaligned_supported; 6523 6524 if (loop_vinfo) 6525 { 6526 vect_loop = LOOP_VINFO_LOOP (loop_vinfo); 6527 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info); 6528 } 6529 6530 /* Possibly unaligned access. */ 6531 6532 /* We can choose between using the implicit realignment scheme (generating 6533 a misaligned_move stmt) and the explicit realignment scheme (generating 6534 aligned loads with a REALIGN_LOAD). There are two variants to the 6535 explicit realignment scheme: optimized, and unoptimized. 6536 We can optimize the realignment only if the step between consecutive 6537 vector loads is equal to the vector size. Since the vector memory 6538 accesses advance in steps of VS (Vector Size) in the vectorized loop, it 6539 is guaranteed that the misalignment amount remains the same throughout the 6540 execution of the vectorized loop. Therefore, we can create the 6541 "realignment token" (the permutation mask that is passed to REALIGN_LOAD) 6542 at the loop preheader. 6543 6544 However, in the case of outer-loop vectorization, when vectorizing a 6545 memory access in the inner-loop nested within the LOOP that is now being 6546 vectorized, while it is guaranteed that the misalignment of the 6547 vectorized memory access will remain the same in different outer-loop 6548 iterations, it is *not* guaranteed that is will remain the same throughout 6549 the execution of the inner-loop. This is because the inner-loop advances 6550 with the original scalar step (and not in steps of VS). If the inner-loop 6551 step happens to be a multiple of VS, then the misalignment remains fixed 6552 and we can use the optimized realignment scheme. 
For example: 6553 6554 for (i=0; i<N; i++) 6555 for (j=0; j<M; j++) 6556 s += a[i+j]; 6557 6558 When vectorizing the i-loop in the above example, the step between 6559 consecutive vector loads is 1, and so the misalignment does not remain 6560 fixed across the execution of the inner-loop, and the realignment cannot 6561 be optimized (as illustrated in the following pseudo vectorized loop): 6562 6563 for (i=0; i<N; i+=4) 6564 for (j=0; j<M; j++){ 6565 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...} 6566 // when j is {0,1,2,3,4,5,6,7,...} respectively. 6567 // (assuming that we start from an aligned address). 6568 } 6569 6570 We therefore have to use the unoptimized realignment scheme: 6571 6572 for (i=0; i<N; i+=4) 6573 for (j=k; j<M; j+=4) 6574 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming 6575 // that the misalignment of the initial address is 6576 // 0). 6577 6578 The loop can then be vectorized as follows: 6579 6580 for (k=0; k<4; k++){ 6581 rt = get_realignment_token (&vp[k]); 6582 for (i=0; i<N; i+=4){ 6583 v1 = vp[i+k]; 6584 for (j=k; j<M; j+=4){ 6585 v2 = vp[i+j+VS-1]; 6586 va = REALIGN_LOAD <v1,v2,rt>; 6587 vs += va; 6588 v1 = v2; 6589 } 6590 } 6591 } */ 6592 6593 if (DR_IS_READ (dr)) 6594 { 6595 bool is_packed = false; 6596 tree type = (TREE_TYPE (DR_REF (dr))); 6597 6598 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing 6599 && (!targetm.vectorize.builtin_mask_for_load 6600 || targetm.vectorize.builtin_mask_for_load ())) 6601 { 6602 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 6603 6604 /* If we are doing SLP then the accesses need not have the 6605 same alignment, instead it depends on the SLP group size. */ 6606 if (loop_vinfo 6607 && STMT_SLP_TYPE (stmt_info) 6608 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 6609 * (DR_GROUP_SIZE 6610 (DR_GROUP_FIRST_ELEMENT (stmt_info))), 6611 TYPE_VECTOR_SUBPARTS (vectype))) 6612 ; 6613 else if (!loop_vinfo 6614 || (nested_in_vect_loop 6615 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)), 6616 GET_MODE_SIZE (TYPE_MODE (vectype))))) 6617 return dr_explicit_realign; 6618 else 6619 return dr_explicit_realign_optimized; 6620 } 6621 if (!known_alignment_for_access_p (dr_info)) 6622 is_packed = not_size_aligned (DR_REF (dr)); 6623 6624 if (targetm.vectorize.support_vector_misalignment 6625 (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) 6626 /* Can't software pipeline the loads, but can at least do them. */ 6627 return dr_unaligned_supported; 6628 } 6629 else 6630 { 6631 bool is_packed = false; 6632 tree type = (TREE_TYPE (DR_REF (dr))); 6633 6634 if (!known_alignment_for_access_p (dr_info)) 6635 is_packed = not_size_aligned (DR_REF (dr)); 6636 6637 if (targetm.vectorize.support_vector_misalignment 6638 (mode, type, DR_MISALIGNMENT (dr_info), is_packed)) 6639 return dr_unaligned_supported; 6640 } 6641 6642 /* Unsupported. */ 6643 return dr_unaligned_unsupported; 6644} 6645