1/* Vectorizer Specific Loop Manipulations 2 Copyright (C) 2003-2020 Free Software Foundation, Inc. 3 Contributed by Dorit Naishlos <dorit@il.ibm.com> 4 and Ira Rosen <irar@il.ibm.com> 5 6This file is part of GCC. 7 8GCC is free software; you can redistribute it and/or modify it under 9the terms of the GNU General Public License as published by the Free 10Software Foundation; either version 3, or (at your option) any later 11version. 12 13GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14WARRANTY; without even the implied warranty of MERCHANTABILITY or 15FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16for more details. 17 18You should have received a copy of the GNU General Public License 19along with GCC; see the file COPYING3. If not see 20<http://www.gnu.org/licenses/>. */ 21 22#include "config.h" 23#include "system.h" 24#include "coretypes.h" 25#include "backend.h" 26#include "tree.h" 27#include "gimple.h" 28#include "cfghooks.h" 29#include "tree-pass.h" 30#include "ssa.h" 31#include "fold-const.h" 32#include "cfganal.h" 33#include "gimplify.h" 34#include "gimple-iterator.h" 35#include "gimplify-me.h" 36#include "tree-cfg.h" 37#include "tree-ssa-loop-manip.h" 38#include "tree-into-ssa.h" 39#include "tree-ssa.h" 40#include "cfgloop.h" 41#include "tree-scalar-evolution.h" 42#include "tree-vectorizer.h" 43#include "tree-ssa-loop-ivopts.h" 44#include "gimple-fold.h" 45#include "tree-ssa-loop-niter.h" 46#include "internal-fn.h" 47#include "stor-layout.h" 48#include "optabs-query.h" 49#include "vec-perm-indices.h" 50#include "insn-config.h" 51#include "rtl.h" 52#include "recog.h" 53 54/************************************************************************* 55 Simple Loop Peeling Utilities 56 57 Utilities to support loop peeling for vectorization purposes. 58 *************************************************************************/ 59 60 61/* Renames the use *OP_P. 
*/ 62 63static void 64rename_use_op (use_operand_p op_p) 65{ 66 tree new_name; 67 68 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME) 69 return; 70 71 new_name = get_current_def (USE_FROM_PTR (op_p)); 72 73 /* Something defined outside of the loop. */ 74 if (!new_name) 75 return; 76 77 /* An ordinary ssa name defined in the loop. */ 78 79 SET_USE (op_p, new_name); 80} 81 82 83/* Renames the variables in basic block BB. Allow renaming of PHI arguments 84 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is 85 true. */ 86 87static void 88rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop) 89{ 90 gimple *stmt; 91 use_operand_p use_p; 92 ssa_op_iter iter; 93 edge e; 94 edge_iterator ei; 95 class loop *loop = bb->loop_father; 96 class loop *outer_loop = NULL; 97 98 if (rename_from_outer_loop) 99 { 100 gcc_assert (loop); 101 outer_loop = loop_outer (loop); 102 } 103 104 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 105 gsi_next (&gsi)) 106 { 107 stmt = gsi_stmt (gsi); 108 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES) 109 rename_use_op (use_p); 110 } 111 112 FOR_EACH_EDGE (e, ei, bb->preds) 113 { 114 if (!flow_bb_inside_loop_p (loop, e->src)) 115 { 116 if (!rename_from_outer_loop) 117 continue; 118 if (e->src != outer_loop->header) 119 { 120 if (outer_loop->inner->next) 121 { 122 /* If outer_loop has 2 inner loops, allow there to 123 be an extra basic block which decides which of the 124 two loops to use using LOOP_VECTORIZED. */ 125 if (!single_pred_p (e->src) 126 || single_pred (e->src) != outer_loop->header) 127 continue; 128 } 129 } 130 } 131 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); 132 gsi_next (&gsi)) 133 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e)); 134 } 135} 136 137 138struct adjust_info 139{ 140 tree from, to; 141 basic_block bb; 142}; 143 144/* A stack of values to be adjusted in debug stmts. 
We have to 145 process them LIFO, so that the closest substitution applies. If we 146 processed them FIFO, without the stack, we might substitute uses 147 with a PHI DEF that would soon become non-dominant, and when we got 148 to the suitable one, it wouldn't have anything to substitute any 149 more. */ 150static vec<adjust_info, va_heap> adjust_vec; 151 152/* Adjust any debug stmts that referenced AI->from values to use the 153 loop-closed AI->to, if the references are dominated by AI->bb and 154 not by the definition of AI->from. */ 155 156static void 157adjust_debug_stmts_now (adjust_info *ai) 158{ 159 basic_block bbphi = ai->bb; 160 tree orig_def = ai->from; 161 tree new_def = ai->to; 162 imm_use_iterator imm_iter; 163 gimple *stmt; 164 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def)); 165 166 gcc_assert (dom_info_available_p (CDI_DOMINATORS)); 167 168 /* Adjust any debug stmts that held onto non-loop-closed 169 references. */ 170 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def) 171 { 172 use_operand_p use_p; 173 basic_block bbuse; 174 175 if (!is_gimple_debug (stmt)) 176 continue; 177 178 gcc_assert (gimple_debug_bind_p (stmt)); 179 180 bbuse = gimple_bb (stmt); 181 182 if ((bbuse == bbphi 183 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi)) 184 && !(bbuse == bbdef 185 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef))) 186 { 187 if (new_def) 188 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 189 SET_USE (use_p, new_def); 190 else 191 { 192 gimple_debug_bind_reset_value (stmt); 193 update_stmt (stmt); 194 } 195 } 196 } 197} 198 199/* Adjust debug stmts as scheduled before. 
*/ 200 201static void 202adjust_vec_debug_stmts (void) 203{ 204 if (!MAY_HAVE_DEBUG_BIND_STMTS) 205 return; 206 207 gcc_assert (adjust_vec.exists ()); 208 209 while (!adjust_vec.is_empty ()) 210 { 211 adjust_debug_stmts_now (&adjust_vec.last ()); 212 adjust_vec.pop (); 213 } 214} 215 216/* Adjust any debug stmts that referenced FROM values to use the 217 loop-closed TO, if the references are dominated by BB and not by 218 the definition of FROM. If adjust_vec is non-NULL, adjustments 219 will be postponed until adjust_vec_debug_stmts is called. */ 220 221static void 222adjust_debug_stmts (tree from, tree to, basic_block bb) 223{ 224 adjust_info ai; 225 226 if (MAY_HAVE_DEBUG_BIND_STMTS 227 && TREE_CODE (from) == SSA_NAME 228 && ! SSA_NAME_IS_DEFAULT_DEF (from) 229 && ! virtual_operand_p (from)) 230 { 231 ai.from = from; 232 ai.to = to; 233 ai.bb = bb; 234 235 if (adjust_vec.exists ()) 236 adjust_vec.safe_push (ai); 237 else 238 adjust_debug_stmts_now (&ai); 239 } 240} 241 242/* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information 243 to adjust any debug stmts that referenced the old phi arg, 244 presumably non-loop-closed references left over from other 245 transformations. */ 246 247static void 248adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def) 249{ 250 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e); 251 252 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def); 253 254 if (MAY_HAVE_DEBUG_BIND_STMTS) 255 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi), 256 gimple_bb (update_phi)); 257} 258 259/* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that 260 the mask should have during the first iteration and NEXT_MASK is the 261 value that it should have on subsequent iterations. 
*/ 262 263static void 264vect_set_loop_mask (class loop *loop, tree mask, tree init_mask, 265 tree next_mask) 266{ 267 gphi *phi = create_phi_node (mask, loop->header); 268 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION); 269 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION); 270} 271 272/* Add SEQ to the end of LOOP's preheader block. */ 273 274static void 275add_preheader_seq (class loop *loop, gimple_seq seq) 276{ 277 if (seq) 278 { 279 edge pe = loop_preheader_edge (loop); 280 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 281 gcc_assert (!new_bb); 282 } 283} 284 285/* Add SEQ to the beginning of LOOP's header block. */ 286 287static void 288add_header_seq (class loop *loop, gimple_seq seq) 289{ 290 if (seq) 291 { 292 gimple_stmt_iterator gsi = gsi_after_labels (loop->header); 293 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); 294 } 295} 296 297/* Return true if the target can interleave elements of two vectors. 298 OFFSET is 0 if the first half of the vectors should be interleaved 299 or 1 if the second half should. When returning true, store the 300 associated permutation in INDICES. */ 301 302static bool 303interleave_supported_p (vec_perm_indices *indices, tree vectype, 304 unsigned int offset) 305{ 306 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype); 307 poly_uint64 base = exact_div (nelts, 2) * offset; 308 vec_perm_builder sel (nelts, 2, 3); 309 for (unsigned int i = 0; i < 3; ++i) 310 { 311 sel.quick_push (base + i); 312 sel.quick_push (base + i + nelts); 313 } 314 indices->new_vector (sel, 2, nelts); 315 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices); 316} 317 318/* Try to use permutes to define the masks in DEST_RGM using the masks 319 in SRC_RGM, given that the former has twice as many masks as the 320 latter. Return true on success, adding any new statements to SEQ. 
*/

static bool
vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
			       rgroup_masks *src_rgm)
{
  tree src_masktype = src_rgm->mask_type;
  tree dest_masktype = dest_rgm->mask_type;
  machine_mode src_mode = TYPE_MODE (src_masktype);
  insn_code icode1, icode2;
  if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
      && (icode1 = optab_handler (vec_unpacku_hi_optab,
				  src_mode)) != CODE_FOR_nothing
      && (icode2 = optab_handler (vec_unpacku_lo_optab,
				  src_mode)) != CODE_FOR_nothing)
    {
      /* Unpacking the source masks gives at least as many mask bits as
	 we need.  We can then VIEW_CONVERT any excess bits away.  */
      machine_mode dest_mode = insn_data[icode1].operand[0].mode;
      gcc_assert (dest_mode == insn_data[icode2].operand[0].mode);
      tree unpack_masktype = vect_halve_mask_nunits (src_masktype, dest_mode);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
			    ? VEC_UNPACK_HI_EXPR
			    : VEC_UNPACK_LO_EXPR);
	  gassign *stmt;
	  if (dest_masktype == unpack_masktype)
	    stmt = gimple_build_assign (dest, code, src);
	  else
	    {
	      tree temp = make_ssa_name (unpack_masktype);
	      stmt = gimple_build_assign (temp, code, src);
	      gimple_seq_add_stmt (seq, stmt);
	      stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
					  build1 (VIEW_CONVERT_EXPR,
						  dest_masktype, temp));
	    }
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  vec_perm_indices indices[2];
  if (dest_masktype == src_masktype
      && interleave_supported_p (&indices[0], src_masktype, 0)
      && interleave_supported_p (&indices[1], src_masktype, 1))
    {
      /* The destination requires twice as many mask bits as the source, so
	 we can use interleaving permutes to double up the number of bits.  */
      tree masks[2];
      for (unsigned int i = 0; i < 2; ++i)
	masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
      for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
	{
	  tree src = src_rgm->masks[i / 2];
	  tree dest = dest_rgm->masks[i];
	  gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
					      src, src, masks[i & 1]);
	  gimple_seq_add_stmt (seq, stmt);
	}
      return true;
    }
  return false;
}

/* Helper for vect_set_loop_condition_masked.  Generate definitions for
   all the masks in RGM and return a mask that is nonzero when the loop
   needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
   Use LOOP_COND_GSI to insert code before the exit gcond.

   RGM belongs to loop LOOP.  The loop originally iterated NITERS
   times and has been vectorized according to LOOP_VINFO.

   If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
   starts with NITERS_SKIP dummy iterations of the scalar loop before
   the real work starts.  The mask elements for these dummy iterations
   must be 0, to ensure that the extra iterations do not have an effect.

   It is known that:

     NITERS * RGM->max_nscalars_per_iter

   does not overflow.  However, MIGHT_WRAP_P says whether an induction
   variable that starts at 0 and has step:

     VF * RGM->max_nscalars_per_iter

   might overflow before hitting a value above:

     (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter

   This means that we cannot guarantee that such an induction variable
   would ever hit a value that produces a set of all-false masks for RGM.
*/ 415 416static tree 417vect_set_loop_masks_directly (class loop *loop, loop_vec_info loop_vinfo, 418 gimple_seq *preheader_seq, 419 gimple_stmt_iterator loop_cond_gsi, 420 rgroup_masks *rgm, tree niters, tree niters_skip, 421 bool might_wrap_p) 422{ 423 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); 424 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo); 425 tree mask_type = rgm->mask_type; 426 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter; 427 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type); 428 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 429 430 /* Calculate the maximum number of scalar values that the rgroup 431 handles in total, the number that it handles for each iteration 432 of the vector loop, and the number that it should skip during the 433 first iteration of the vector loop. */ 434 tree nscalars_total = niters; 435 tree nscalars_step = build_int_cst (iv_type, vf); 436 tree nscalars_skip = niters_skip; 437 if (nscalars_per_iter != 1) 438 { 439 /* We checked before choosing to use a fully-masked loop that these 440 multiplications don't overflow. */ 441 tree compare_factor = build_int_cst (compare_type, nscalars_per_iter); 442 tree iv_factor = build_int_cst (iv_type, nscalars_per_iter); 443 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type, 444 nscalars_total, compare_factor); 445 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, iv_type, 446 nscalars_step, iv_factor); 447 if (nscalars_skip) 448 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type, 449 nscalars_skip, compare_factor); 450 } 451 452 /* Create an induction variable that counts the number of scalars 453 processed. 
*/ 454 tree index_before_incr, index_after_incr; 455 gimple_stmt_iterator incr_gsi; 456 bool insert_after; 457 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 458 create_iv (build_int_cst (iv_type, 0), nscalars_step, NULL_TREE, loop, 459 &incr_gsi, insert_after, &index_before_incr, &index_after_incr); 460 461 tree zero_index = build_int_cst (compare_type, 0); 462 tree test_index, test_limit, first_limit; 463 gimple_stmt_iterator *test_gsi; 464 if (might_wrap_p) 465 { 466 /* In principle the loop should stop iterating once the incremented 467 IV reaches a value greater than or equal to: 468 469 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP 470 471 However, there's no guarantee that this addition doesn't overflow 472 the comparison type, or that the IV hits a value above it before 473 wrapping around. We therefore adjust the limit down by one 474 IV step: 475 476 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) 477 -[infinite-prec] NSCALARS_STEP 478 479 and compare the IV against this limit _before_ incrementing it. 480 Since the comparison type is unsigned, we actually want the 481 subtraction to saturate at zero: 482 483 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP) 484 -[sat] NSCALARS_STEP 485 486 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as: 487 488 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP) 489 490 where the rightmost subtraction can be done directly in 491 COMPARE_TYPE. */ 492 test_index = index_before_incr; 493 tree adjust = gimple_convert (preheader_seq, compare_type, 494 nscalars_step); 495 if (nscalars_skip) 496 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type, 497 adjust, nscalars_skip); 498 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type, 499 nscalars_total, adjust); 500 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type, 501 test_limit, adjust); 502 test_gsi = &incr_gsi; 503 504 /* Get a safe limit for the first iteration. 
*/ 505 if (nscalars_skip) 506 { 507 /* The first vector iteration can handle at most NSCALARS_STEP 508 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding 509 NSCALARS_SKIP to that cannot overflow. */ 510 tree const_limit = build_int_cst (compare_type, 511 LOOP_VINFO_VECT_FACTOR (loop_vinfo) 512 * nscalars_per_iter); 513 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type, 514 nscalars_total, const_limit); 515 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, 516 first_limit, nscalars_skip); 517 } 518 else 519 /* For the first iteration it doesn't matter whether the IV hits 520 a value above NSCALARS_TOTAL. That only matters for the latch 521 condition. */ 522 first_limit = nscalars_total; 523 } 524 else 525 { 526 /* Test the incremented IV, which will always hit a value above 527 the bound before wrapping. */ 528 test_index = index_after_incr; 529 test_limit = nscalars_total; 530 if (nscalars_skip) 531 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type, 532 test_limit, nscalars_skip); 533 test_gsi = &loop_cond_gsi; 534 535 first_limit = test_limit; 536 } 537 538 /* Convert the IV value to the comparison type (either a no-op or 539 a demotion). */ 540 gimple_seq test_seq = NULL; 541 test_index = gimple_convert (&test_seq, compare_type, test_index); 542 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT); 543 544 /* Provide a definition of each mask in the group. */ 545 tree next_mask = NULL_TREE; 546 tree mask; 547 unsigned int i; 548 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask) 549 { 550 /* Previous masks will cover BIAS scalars. This mask covers the 551 next batch. */ 552 poly_uint64 bias = nscalars_per_mask * i; 553 tree bias_tree = build_int_cst (compare_type, bias); 554 gimple *tmp_stmt; 555 556 /* See whether the first iteration of the vector loop is known 557 to have a full mask. 
*/ 558 poly_uint64 const_limit; 559 bool first_iteration_full 560 = (poly_int_tree_p (first_limit, &const_limit) 561 && known_ge (const_limit, (i + 1) * nscalars_per_mask)); 562 563 /* Rather than have a new IV that starts at BIAS and goes up to 564 TEST_LIMIT, prefer to use the same 0-based IV for each mask 565 and adjust the bound down by BIAS. */ 566 tree this_test_limit = test_limit; 567 if (i != 0) 568 { 569 this_test_limit = gimple_build (preheader_seq, MAX_EXPR, 570 compare_type, this_test_limit, 571 bias_tree); 572 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR, 573 compare_type, this_test_limit, 574 bias_tree); 575 } 576 577 /* Create the initial mask. First include all scalars that 578 are within the loop limit. */ 579 tree init_mask = NULL_TREE; 580 if (!first_iteration_full) 581 { 582 tree start, end; 583 if (first_limit == test_limit) 584 { 585 /* Use a natural test between zero (the initial IV value) 586 and the loop limit. The "else" block would be valid too, 587 but this choice can avoid the need to load BIAS_TREE into 588 a register. */ 589 start = zero_index; 590 end = this_test_limit; 591 } 592 else 593 { 594 /* FIRST_LIMIT is the maximum number of scalars handled by the 595 first iteration of the vector loop. Test the portion 596 associated with this mask. */ 597 start = bias_tree; 598 end = first_limit; 599 } 600 601 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask"); 602 tmp_stmt = vect_gen_while (init_mask, start, end); 603 gimple_seq_add_stmt (preheader_seq, tmp_stmt); 604 } 605 606 /* Now AND out the bits that are within the number of skipped 607 scalars. 
*/ 608 poly_uint64 const_skip; 609 if (nscalars_skip 610 && !(poly_int_tree_p (nscalars_skip, &const_skip) 611 && known_le (const_skip, bias))) 612 { 613 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type, 614 bias_tree, nscalars_skip); 615 if (init_mask) 616 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type, 617 init_mask, unskipped_mask); 618 else 619 init_mask = unskipped_mask; 620 } 621 622 if (!init_mask) 623 /* First iteration is full. */ 624 init_mask = build_minus_one_cst (mask_type); 625 626 /* Get the mask value for the next iteration of the loop. */ 627 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask"); 628 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit); 629 gsi_insert_before (test_gsi, call, GSI_SAME_STMT); 630 631 vect_set_loop_mask (loop, mask, init_mask, next_mask); 632 } 633 return next_mask; 634} 635 636/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls. 637 LOOP_VINFO describes the vectorization of LOOP. NITERS is the 638 number of iterations of the original scalar loop that should be 639 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are 640 as for vect_set_loop_condition. 641 642 Insert the branch-back condition before LOOP_COND_GSI and return the 643 final gcond. */ 644 645static gcond * 646vect_set_loop_condition_masked (class loop *loop, loop_vec_info loop_vinfo, 647 tree niters, tree final_iv, 648 bool niters_maybe_zero, 649 gimple_stmt_iterator loop_cond_gsi) 650{ 651 gimple_seq preheader_seq = NULL; 652 gimple_seq header_seq = NULL; 653 654 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); 655 unsigned int compare_precision = TYPE_PRECISION (compare_type); 656 tree orig_niters = niters; 657 658 /* Type of the initial value of NITERS. 
*/ 659 tree ni_actual_type = TREE_TYPE (niters); 660 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type); 661 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); 662 663 /* Convert NITERS to the same size as the compare. */ 664 if (compare_precision > ni_actual_precision 665 && niters_maybe_zero) 666 { 667 /* We know that there is always at least one iteration, so if the 668 count is zero then it must have wrapped. Cope with this by 669 subtracting 1 before the conversion and adding 1 to the result. */ 670 gcc_assert (TYPE_UNSIGNED (ni_actual_type)); 671 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type, 672 niters, build_minus_one_cst (ni_actual_type)); 673 niters = gimple_convert (&preheader_seq, compare_type, niters); 674 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type, 675 niters, build_one_cst (compare_type)); 676 } 677 else 678 niters = gimple_convert (&preheader_seq, compare_type, niters); 679 680 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo); 681 682 /* Iterate over all the rgroups and fill in their masks. We could use 683 the first mask from any rgroup for the loop condition; here we 684 arbitrarily pick the last. */ 685 tree test_mask = NULL_TREE; 686 rgroup_masks *rgm; 687 unsigned int i; 688 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo); 689 FOR_EACH_VEC_ELT (*masks, i, rgm) 690 if (!rgm->masks.is_empty ()) 691 { 692 /* First try using permutes. This adds a single vector 693 instruction to the loop for each mask, but needs no extra 694 loop invariants or IVs. */ 695 unsigned int nmasks = i + 1; 696 if ((nmasks & 1) == 0) 697 { 698 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1]; 699 if (!half_rgm->masks.is_empty () 700 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm)) 701 continue; 702 } 703 704 /* See whether zero-based IV would ever generate all-false masks 705 before wrapping around. 
*/ 706 bool might_wrap_p 707 = (iv_limit == -1 708 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter, 709 UNSIGNED) 710 > compare_precision)); 711 712 /* Set up all masks for this group. */ 713 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo, 714 &preheader_seq, 715 loop_cond_gsi, rgm, 716 niters, niters_skip, 717 might_wrap_p); 718 } 719 720 /* Emit all accumulated statements. */ 721 add_preheader_seq (loop, preheader_seq); 722 add_header_seq (loop, header_seq); 723 724 /* Get a boolean result that tells us whether to iterate. */ 725 edge exit_edge = single_exit (loop); 726 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR; 727 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask)); 728 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask, 729 NULL_TREE, NULL_TREE); 730 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); 731 732 /* The loop iterates (NITERS - 1) / VF + 1 times. 733 Subtract one from this to get the latch count. */ 734 tree step = build_int_cst (compare_type, 735 LOOP_VINFO_VECT_FACTOR (loop_vinfo)); 736 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters, 737 build_minus_one_cst (compare_type)); 738 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type, 739 niters_minus_one, step); 740 741 if (final_iv) 742 { 743 gassign *assign = gimple_build_assign (final_iv, orig_niters); 744 gsi_insert_on_edge_immediate (single_exit (loop), assign); 745 } 746 747 return cond_stmt; 748} 749 750/* Like vect_set_loop_condition, but handle the case in which there 751 are no loop masks. 
*/ 752 753static gcond * 754vect_set_loop_condition_unmasked (class loop *loop, tree niters, 755 tree step, tree final_iv, 756 bool niters_maybe_zero, 757 gimple_stmt_iterator loop_cond_gsi) 758{ 759 tree indx_before_incr, indx_after_incr; 760 gcond *cond_stmt; 761 gcond *orig_cond; 762 edge pe = loop_preheader_edge (loop); 763 edge exit_edge = single_exit (loop); 764 gimple_stmt_iterator incr_gsi; 765 bool insert_after; 766 enum tree_code code; 767 tree niters_type = TREE_TYPE (niters); 768 769 orig_cond = get_loop_exit_condition (loop); 770 gcc_assert (orig_cond); 771 loop_cond_gsi = gsi_for_stmt (orig_cond); 772 773 tree init, limit; 774 if (!niters_maybe_zero && integer_onep (step)) 775 { 776 /* In this case we can use a simple 0-based IV: 777 778 A: 779 x = 0; 780 do 781 { 782 ... 783 x += 1; 784 } 785 while (x < NITERS); */ 786 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; 787 init = build_zero_cst (niters_type); 788 limit = niters; 789 } 790 else 791 { 792 /* The following works for all values of NITERS except 0: 793 794 B: 795 x = 0; 796 do 797 { 798 ... 799 x += STEP; 800 } 801 while (x <= NITERS - STEP); 802 803 so that the loop continues to iterate if x + STEP - 1 < NITERS 804 but stops if x + STEP - 1 >= NITERS. 805 806 However, if NITERS is zero, x never hits a value above NITERS - STEP 807 before wrapping around. There are two obvious ways of dealing with 808 this: 809 810 - start at STEP - 1 and compare x before incrementing it 811 - start at -1 and compare x after incrementing it 812 813 The latter is simpler and is what we use. The loop in this case 814 looks like: 815 816 C: 817 x = -1; 818 do 819 { 820 ... 821 x += STEP; 822 } 823 while (x < NITERS - STEP); 824 825 In both cases the loop limit is NITERS - STEP. 
*/ 826 gimple_seq seq = NULL; 827 limit = force_gimple_operand (niters, &seq, true, NULL_TREE); 828 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step); 829 if (seq) 830 { 831 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); 832 gcc_assert (!new_bb); 833 } 834 if (niters_maybe_zero) 835 { 836 /* Case C. */ 837 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR; 838 init = build_all_ones_cst (niters_type); 839 } 840 else 841 { 842 /* Case B. */ 843 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR; 844 init = build_zero_cst (niters_type); 845 } 846 } 847 848 standard_iv_increment_position (loop, &incr_gsi, &insert_after); 849 create_iv (init, step, NULL_TREE, loop, 850 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr); 851 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr, 852 true, NULL_TREE, true, 853 GSI_SAME_STMT); 854 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE, 855 true, GSI_SAME_STMT); 856 857 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE, 858 NULL_TREE); 859 860 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT); 861 862 /* Record the number of latch iterations. */ 863 if (limit == niters) 864 /* Case A: the loop iterates NITERS times. Subtract one to get the 865 latch count. */ 866 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters, 867 build_int_cst (niters_type, 1)); 868 else 869 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times. 870 Subtract one from this to get the latch count. 
*/ 871 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type, 872 limit, step); 873 874 if (final_iv) 875 { 876 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR, 877 indx_after_incr, init); 878 gsi_insert_on_edge_immediate (single_exit (loop), assign); 879 } 880 881 return cond_stmt; 882} 883 884/* If we're using fully-masked loops, make LOOP iterate: 885 886 N == (NITERS - 1) / STEP + 1 887 888 times. When NITERS is zero, this is equivalent to making the loop 889 execute (1 << M) / STEP times, where M is the precision of NITERS. 890 NITERS_MAYBE_ZERO is true if this last case might occur. 891 892 If we're not using fully-masked loops, make LOOP iterate: 893 894 N == (NITERS - STEP) / STEP + 1 895 896 times, where NITERS is known to be outside the range [1, STEP - 1]. 897 This is equivalent to making the loop execute NITERS / STEP times 898 when NITERS is nonzero and (1 << M) / STEP times otherwise. 899 NITERS_MAYBE_ZERO again indicates whether this last case might occur. 900 901 If FINAL_IV is nonnull, it is an SSA name that should be set to 902 N * STEP on exit from the loop. 903 904 Assumption: the exit-condition of LOOP is the last stmt in the loop. */ 905 906void 907vect_set_loop_condition (class loop *loop, loop_vec_info loop_vinfo, 908 tree niters, tree step, tree final_iv, 909 bool niters_maybe_zero) 910{ 911 gcond *cond_stmt; 912 gcond *orig_cond = get_loop_exit_condition (loop); 913 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond); 914 915 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) 916 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters, 917 final_iv, niters_maybe_zero, 918 loop_cond_gsi); 919 else 920 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step, 921 final_iv, niters_maybe_zero, 922 loop_cond_gsi); 923 924 /* Remove old loop exit test. 
*/ 925 stmt_vec_info orig_cond_info; 926 if (loop_vinfo 927 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond))) 928 loop_vinfo->remove_stmt (orig_cond_info); 929 else 930 gsi_remove (&loop_cond_gsi, true); 931 932 if (dump_enabled_p ()) 933 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G", 934 cond_stmt); 935} 936 937/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg. 938 For all PHI arguments in FROM->dest and TO->dest from those 939 edges ensure that TO->dest PHI arguments have current_def 940 to that in from. */ 941 942static void 943slpeel_duplicate_current_defs_from_edges (edge from, edge to) 944{ 945 gimple_stmt_iterator gsi_from, gsi_to; 946 947 for (gsi_from = gsi_start_phis (from->dest), 948 gsi_to = gsi_start_phis (to->dest); 949 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);) 950 { 951 gimple *from_phi = gsi_stmt (gsi_from); 952 gimple *to_phi = gsi_stmt (gsi_to); 953 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from); 954 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to); 955 if (virtual_operand_p (from_arg)) 956 { 957 gsi_next (&gsi_from); 958 continue; 959 } 960 if (virtual_operand_p (to_arg)) 961 { 962 gsi_next (&gsi_to); 963 continue; 964 } 965 if (TREE_CODE (from_arg) != SSA_NAME) 966 gcc_assert (operand_equal_p (from_arg, to_arg, 0)); 967 else if (TREE_CODE (to_arg) == SSA_NAME 968 && from_arg != to_arg) 969 { 970 if (get_current_def (to_arg) == NULL_TREE) 971 { 972 gcc_assert (types_compatible_p (TREE_TYPE (to_arg), 973 TREE_TYPE (get_current_def 974 (from_arg)))); 975 set_current_def (to_arg, get_current_def (from_arg)); 976 } 977 } 978 gsi_next (&gsi_from); 979 gsi_next (&gsi_to); 980 } 981 982 gphi *from_phi = get_virtual_phi (from->dest); 983 gphi *to_phi = get_virtual_phi (to->dest); 984 if (from_phi) 985 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to), 986 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from))); 987} 988 989 990/* Given LOOP this function generates a new copy of it and puts it 
991 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is 992 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the 993 basic blocks from SCALAR_LOOP instead of LOOP, but to either the 994 entry or exit of LOOP. */ 995 996class loop * 997slpeel_tree_duplicate_loop_to_edge_cfg (class loop *loop, 998 class loop *scalar_loop, edge e) 999{ 1000 class loop *new_loop; 1001 basic_block *new_bbs, *bbs, *pbbs; 1002 bool at_exit; 1003 bool was_imm_dom; 1004 basic_block exit_dest; 1005 edge exit, new_exit; 1006 bool duplicate_outer_loop = false; 1007 1008 exit = single_exit (loop); 1009 at_exit = (e == exit); 1010 if (!at_exit && e != loop_preheader_edge (loop)) 1011 return NULL; 1012 1013 if (scalar_loop == NULL) 1014 scalar_loop = loop; 1015 1016 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); 1017 pbbs = bbs + 1; 1018 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes); 1019 /* Allow duplication of outer loops. */ 1020 if (scalar_loop->inner) 1021 duplicate_outer_loop = true; 1022 /* Check whether duplication is possible. */ 1023 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes)) 1024 { 1025 free (bbs); 1026 return NULL; 1027 } 1028 1029 /* Generate new loop structure. */ 1030 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop)); 1031 duplicate_subloops (scalar_loop, new_loop); 1032 1033 exit_dest = exit->dest; 1034 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS, 1035 exit_dest) == loop->header ? 1036 true : false); 1037 1038 /* Also copy the pre-header, this avoids jumping through hoops to 1039 duplicate the loop entry PHI arguments. Create an empty 1040 pre-header unconditionally for this. 
*/ 1041 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop)); 1042 edge entry_e = single_pred_edge (preheader); 1043 bbs[0] = preheader; 1044 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1); 1045 1046 exit = single_exit (scalar_loop); 1047 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs, 1048 &exit, 1, &new_exit, NULL, 1049 at_exit ? loop->latch : e->src, true); 1050 exit = single_exit (loop); 1051 basic_block new_preheader = new_bbs[0]; 1052 1053 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL); 1054 1055 /* Skip new preheader since it's deleted if copy loop is added at entry. */ 1056 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++) 1057 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop); 1058 1059 if (scalar_loop != loop) 1060 { 1061 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from 1062 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop, 1063 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects 1064 the LOOP SSA_NAMEs (on the exit edge and edge from latch to 1065 header) to have current_def set, so copy them over. */ 1066 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop), 1067 exit); 1068 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch, 1069 0), 1070 EDGE_SUCC (loop->latch, 0)); 1071 } 1072 1073 if (at_exit) /* Add the loop copy at exit. 
*/ 1074 { 1075 if (scalar_loop != loop) 1076 { 1077 gphi_iterator gsi; 1078 new_exit = redirect_edge_and_branch (new_exit, exit_dest); 1079 1080 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi); 1081 gsi_next (&gsi)) 1082 { 1083 gphi *phi = gsi.phi (); 1084 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e); 1085 location_t orig_locus 1086 = gimple_phi_arg_location_from_edge (phi, e); 1087 1088 add_phi_arg (phi, orig_arg, new_exit, orig_locus); 1089 } 1090 } 1091 redirect_edge_and_branch_force (e, new_preheader); 1092 flush_pending_stmts (e); 1093 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src); 1094 if (was_imm_dom || duplicate_outer_loop) 1095 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src); 1096 1097 /* And remove the non-necessary forwarder again. Keep the other 1098 one so we have a proper pre-header for the loop at the exit edge. */ 1099 redirect_edge_pred (single_succ_edge (preheader), 1100 single_pred (preheader)); 1101 delete_basic_block (preheader); 1102 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, 1103 loop_preheader_edge (scalar_loop)->src); 1104 } 1105 else /* Add the copy at entry. */ 1106 { 1107 if (scalar_loop != loop) 1108 { 1109 /* Remove the non-necessary forwarder of scalar_loop again. 
*/ 1110 redirect_edge_pred (single_succ_edge (preheader), 1111 single_pred (preheader)); 1112 delete_basic_block (preheader); 1113 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header, 1114 loop_preheader_edge (scalar_loop)->src); 1115 preheader = split_edge (loop_preheader_edge (loop)); 1116 entry_e = single_pred_edge (preheader); 1117 } 1118 1119 redirect_edge_and_branch_force (entry_e, new_preheader); 1120 flush_pending_stmts (entry_e); 1121 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src); 1122 1123 redirect_edge_and_branch_force (new_exit, preheader); 1124 flush_pending_stmts (new_exit); 1125 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src); 1126 1127 /* And remove the non-necessary forwarder again. Keep the other 1128 one so we have a proper pre-header for the loop at the exit edge. */ 1129 redirect_edge_pred (single_succ_edge (new_preheader), 1130 single_pred (new_preheader)); 1131 delete_basic_block (new_preheader); 1132 set_immediate_dominator (CDI_DOMINATORS, new_loop->header, 1133 loop_preheader_edge (new_loop)->src); 1134 } 1135 1136 if (scalar_loop != loop) 1137 { 1138 /* Update new_loop->header PHIs, so that on the preheader 1139 edge they are the ones from loop rather than scalar_loop. 
*/ 1140 gphi_iterator gsi_orig, gsi_new; 1141 edge orig_e = loop_preheader_edge (loop); 1142 edge new_e = loop_preheader_edge (new_loop); 1143 1144 for (gsi_orig = gsi_start_phis (loop->header), 1145 gsi_new = gsi_start_phis (new_loop->header); 1146 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new); 1147 gsi_next (&gsi_orig), gsi_next (&gsi_new)) 1148 { 1149 gphi *orig_phi = gsi_orig.phi (); 1150 gphi *new_phi = gsi_new.phi (); 1151 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e); 1152 location_t orig_locus 1153 = gimple_phi_arg_location_from_edge (orig_phi, orig_e); 1154 1155 add_phi_arg (new_phi, orig_arg, new_e, orig_locus); 1156 } 1157 } 1158 1159 free (new_bbs); 1160 free (bbs); 1161 1162 checking_verify_dominators (CDI_DOMINATORS); 1163 1164 return new_loop; 1165} 1166 1167 1168/* Given the condition expression COND, put it as the last statement of 1169 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to 1170 DOM_BB; return the skip edge. GUARD_TO is the target basic block to 1171 skip the loop. PROBABILITY is the skip edge's probability. Mark the 1172 new edge as irreducible if IRREDUCIBLE_P is true. 
*/ 1173 1174static edge 1175slpeel_add_loop_guard (basic_block guard_bb, tree cond, 1176 basic_block guard_to, basic_block dom_bb, 1177 profile_probability probability, bool irreducible_p) 1178{ 1179 gimple_stmt_iterator gsi; 1180 edge new_e, enter_e; 1181 gcond *cond_stmt; 1182 gimple_seq gimplify_stmt_list = NULL; 1183 1184 enter_e = EDGE_SUCC (guard_bb, 0); 1185 enter_e->flags &= ~EDGE_FALLTHRU; 1186 enter_e->flags |= EDGE_FALSE_VALUE; 1187 gsi = gsi_last_bb (guard_bb); 1188 1189 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr, 1190 NULL_TREE); 1191 if (gimplify_stmt_list) 1192 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT); 1193 1194 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE); 1195 gsi = gsi_last_bb (guard_bb); 1196 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT); 1197 1198 /* Add new edge to connect guard block to the merge/loop-exit block. */ 1199 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE); 1200 1201 new_e->probability = probability; 1202 if (irreducible_p) 1203 new_e->flags |= EDGE_IRREDUCIBLE_LOOP; 1204 1205 enter_e->probability = probability.invert (); 1206 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb); 1207 1208 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */ 1209 if (enter_e->dest->loop_father->header == enter_e->dest) 1210 split_edge (enter_e); 1211 1212 return new_e; 1213} 1214 1215 1216/* This function verifies that the following restrictions apply to LOOP: 1217 (1) it consists of exactly 2 basic blocks - header, and an empty latch 1218 for innermost loop and 5 basic blocks for outer-loop. 1219 (2) it is single entry, single exit 1220 (3) its exit condition is the last stmt in the header 1221 (4) E is the entry/exit edge of LOOP. 
1222 */ 1223 1224bool 1225slpeel_can_duplicate_loop_p (const class loop *loop, const_edge e) 1226{ 1227 edge exit_e = single_exit (loop); 1228 edge entry_e = loop_preheader_edge (loop); 1229 gcond *orig_cond = get_loop_exit_condition (loop); 1230 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src); 1231 unsigned int num_bb = loop->inner? 5 : 2; 1232 1233 /* All loops have an outer scope; the only case loop->outer is NULL is for 1234 the function itself. */ 1235 if (!loop_outer (loop) 1236 || loop->num_nodes != num_bb 1237 || !empty_block_p (loop->latch) 1238 || !single_exit (loop) 1239 /* Verify that new loop exit condition can be trivially modified. */ 1240 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi)) 1241 || (e != exit_e && e != entry_e)) 1242 return false; 1243 1244 return true; 1245} 1246 1247/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI 1248 in the exit bb and rename all the uses after the loop. This simplifies 1249 the *guard[12] routines, which assume loop closed SSA form for all PHIs 1250 (but normally loop closed SSA form doesn't require virtual PHIs to be 1251 in the same form). Doing this early simplifies the checking what 1252 uses should be renamed. 1253 1254 If we create a new phi after the loop, return the definition that 1255 applies on entry to the loop, otherwise return null. 
*/ 1256 1257static tree 1258create_lcssa_for_virtual_phi (class loop *loop) 1259{ 1260 gphi_iterator gsi; 1261 edge exit_e = single_exit (loop); 1262 1263 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 1264 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi)))) 1265 { 1266 gphi *phi = gsi.phi (); 1267 for (gsi = gsi_start_phis (exit_e->dest); 1268 !gsi_end_p (gsi); gsi_next (&gsi)) 1269 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi)))) 1270 break; 1271 if (gsi_end_p (gsi)) 1272 { 1273 tree new_vop = copy_ssa_name (PHI_RESULT (phi)); 1274 gphi *new_phi = create_phi_node (new_vop, exit_e->dest); 1275 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0)); 1276 imm_use_iterator imm_iter; 1277 gimple *stmt; 1278 use_operand_p use_p; 1279 1280 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop) 1281 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop); 1282 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION); 1283 gimple_phi_set_result (new_phi, new_vop); 1284 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop) 1285 if (stmt != new_phi 1286 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt))) 1287 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 1288 SET_USE (use_p, new_vop); 1289 1290 return PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); 1291 } 1292 break; 1293 } 1294 return NULL_TREE; 1295} 1296 1297/* Function vect_get_loop_location. 1298 1299 Extract the location of the loop in the source code. 1300 If the loop is not well formed for vectorization, an estimated 1301 location is calculated. 1302 Return the loop location if succeed and NULL if not. 
*/ 1303 1304dump_user_location_t 1305find_loop_location (class loop *loop) 1306{ 1307 gimple *stmt = NULL; 1308 basic_block bb; 1309 gimple_stmt_iterator si; 1310 1311 if (!loop) 1312 return dump_user_location_t (); 1313 1314 stmt = get_loop_exit_condition (loop); 1315 1316 if (stmt 1317 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) 1318 return stmt; 1319 1320 /* If we got here the loop is probably not "well formed", 1321 try to estimate the loop location */ 1322 1323 if (!loop->header) 1324 return dump_user_location_t (); 1325 1326 bb = loop->header; 1327 1328 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) 1329 { 1330 stmt = gsi_stmt (si); 1331 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION) 1332 return stmt; 1333 } 1334 1335 return dump_user_location_t (); 1336} 1337 1338/* Return true if the phi described by STMT_INFO defines an IV of the 1339 loop to be vectorized. */ 1340 1341static bool 1342iv_phi_p (stmt_vec_info stmt_info) 1343{ 1344 gphi *phi = as_a <gphi *> (stmt_info->stmt); 1345 if (virtual_operand_p (PHI_RESULT (phi))) 1346 return false; 1347 1348 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def 1349 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def) 1350 return false; 1351 1352 return true; 1353} 1354 1355/* Function vect_can_advance_ivs_p 1356 1357 In case the number of iterations that LOOP iterates is unknown at compile 1358 time, an epilog loop will be generated, and the loop induction variables 1359 (IVs) will be "advanced" to the value they are supposed to take just before 1360 the epilog loop. Here we check that the access function of the loop IVs 1361 and the expression that represents the loop bound are simple enough. 1362 These restrictions will be relaxed in the future. 
*/ 1363 1364bool 1365vect_can_advance_ivs_p (loop_vec_info loop_vinfo) 1366{ 1367 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1368 basic_block bb = loop->header; 1369 gphi_iterator gsi; 1370 1371 /* Analyze phi functions of the loop header. */ 1372 1373 if (dump_enabled_p ()) 1374 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n"); 1375 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 1376 { 1377 tree evolution_part; 1378 1379 gphi *phi = gsi.phi (); 1380 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); 1381 if (dump_enabled_p ()) 1382 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", 1383 phi_info->stmt); 1384 1385 /* Skip virtual phi's. The data dependences that are associated with 1386 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. 1387 1388 Skip reduction phis. */ 1389 if (!iv_phi_p (phi_info)) 1390 { 1391 if (dump_enabled_p ()) 1392 dump_printf_loc (MSG_NOTE, vect_location, 1393 "reduc or virtual phi. skip.\n"); 1394 continue; 1395 } 1396 1397 /* Analyze the evolution function. */ 1398 1399 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); 1400 if (evolution_part == NULL_TREE) 1401 { 1402 if (dump_enabled_p ()) 1403 dump_printf (MSG_MISSED_OPTIMIZATION, 1404 "No access function or evolution.\n"); 1405 return false; 1406 } 1407 1408 /* FORNOW: We do not transform initial conditions of IVs 1409 which evolution functions are not invariants in the loop. */ 1410 1411 if (!expr_invariant_in_loop_p (loop, evolution_part)) 1412 { 1413 if (dump_enabled_p ()) 1414 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1415 "evolution not invariant in loop.\n"); 1416 return false; 1417 } 1418 1419 /* FORNOW: We do not transform initial conditions of IVs 1420 which evolution functions are a polynomial of degree >= 2. 
*/ 1421 1422 if (tree_is_chrec (evolution_part)) 1423 { 1424 if (dump_enabled_p ()) 1425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 1426 "evolution is chrec.\n"); 1427 return false; 1428 } 1429 } 1430 1431 return true; 1432} 1433 1434 1435/* Function vect_update_ivs_after_vectorizer. 1436 1437 "Advance" the induction variables of LOOP to the value they should take 1438 after the execution of LOOP. This is currently necessary because the 1439 vectorizer does not handle induction variables that are used after the 1440 loop. Such a situation occurs when the last iterations of LOOP are 1441 peeled, because: 1442 1. We introduced new uses after LOOP for IVs that were not originally used 1443 after LOOP: the IVs of LOOP are now used by an epilog loop. 1444 2. LOOP is going to be vectorized; this means that it will iterate N/VF 1445 times, whereas the loop IVs should be bumped N times. 1446 1447 Input: 1448 - LOOP - a loop that is going to be vectorized. The last few iterations 1449 of LOOP were peeled. 1450 - NITERS - the number of iterations that LOOP executes (before it is 1451 vectorized). i.e, the number of times the ivs should be bumped. 1452 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path 1453 coming out from LOOP on which there are uses of the LOOP ivs 1454 (this is the path from LOOP->exit to epilog_loop->preheader). 1455 1456 The new definitions of the ivs are placed in LOOP->exit. 1457 The phi args associated with the edge UPDATE_E in the bb 1458 UPDATE_E->dest are updated accordingly. 1459 1460 Assumption 1: Like the rest of the vectorizer, this function assumes 1461 a single loop exit that has a single predecessor. 1462 1463 Assumption 2: The phi nodes in the LOOP header and in update_bb are 1464 organized in the same order. 1465 1466 Assumption 3: The access function of the ivs is simple enough (see 1467 vect_can_advance_ivs_p). This assumption will be relaxed in the future. 
1468 1469 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path 1470 coming out of LOOP on which the ivs of LOOP are used (this is the path 1471 that leads to the epilog loop; other paths skip the epilog loop). This 1472 path starts with the edge UPDATE_E, and its destination (denoted update_bb) 1473 needs to have its phis updated. 1474 */ 1475 1476static void 1477vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, 1478 tree niters, edge update_e) 1479{ 1480 gphi_iterator gsi, gsi1; 1481 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 1482 basic_block update_bb = update_e->dest; 1483 basic_block exit_bb = single_exit (loop)->dest; 1484 1485 /* Make sure there exists a single-predecessor exit bb: */ 1486 gcc_assert (single_pred_p (exit_bb)); 1487 gcc_assert (single_succ_edge (exit_bb) == update_e); 1488 1489 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb); 1490 !gsi_end_p (gsi) && !gsi_end_p (gsi1); 1491 gsi_next (&gsi), gsi_next (&gsi1)) 1492 { 1493 tree init_expr; 1494 tree step_expr, off; 1495 tree type; 1496 tree var, ni, ni_name; 1497 gimple_stmt_iterator last_gsi; 1498 1499 gphi *phi = gsi.phi (); 1500 gphi *phi1 = gsi1.phi (); 1501 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi); 1502 if (dump_enabled_p ()) 1503 dump_printf_loc (MSG_NOTE, vect_location, 1504 "vect_update_ivs_after_vectorizer: phi: %G", phi); 1505 1506 /* Skip reduction and virtual phis. */ 1507 if (!iv_phi_p (phi_info)) 1508 { 1509 if (dump_enabled_p ()) 1510 dump_printf_loc (MSG_NOTE, vect_location, 1511 "reduc or virtual phi. skip.\n"); 1512 continue; 1513 } 1514 1515 type = TREE_TYPE (gimple_phi_result (phi)); 1516 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info); 1517 step_expr = unshare_expr (step_expr); 1518 1519 /* FORNOW: We do not support IVs whose evolution function is a polynomial 1520 of degree >= 2 or exponential. 
*/ 1521 gcc_assert (!tree_is_chrec (step_expr)); 1522 1523 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop)); 1524 1525 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), 1526 fold_convert (TREE_TYPE (step_expr), niters), 1527 step_expr); 1528 if (POINTER_TYPE_P (type)) 1529 ni = fold_build_pointer_plus (init_expr, off); 1530 else 1531 ni = fold_build2 (PLUS_EXPR, type, 1532 init_expr, fold_convert (type, off)); 1533 1534 var = create_tmp_var (type, "tmp"); 1535 1536 last_gsi = gsi_last_bb (exit_bb); 1537 gimple_seq new_stmts = NULL; 1538 ni_name = force_gimple_operand (ni, &new_stmts, false, var); 1539 /* Exit_bb shouldn't be empty. */ 1540 if (!gsi_end_p (last_gsi)) 1541 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT); 1542 else 1543 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT); 1544 1545 /* Fix phi expressions in the successor bb. */ 1546 adjust_phi_and_debug_stmts (phi1, update_e, ni_name); 1547 } 1548} 1549 1550/* Return a gimple value containing the misalignment (measured in vector 1551 elements) for the loop described by LOOP_VINFO, i.e. how many elements 1552 it is away from a perfectly aligned address. Add any new statements 1553 to SEQ. */ 1554 1555static tree 1556get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo) 1557{ 1558 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 1559 stmt_vec_info stmt_info = dr_info->stmt; 1560 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1561 1562 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info); 1563 unsigned HOST_WIDE_INT target_align_c; 1564 tree target_align_minus_1; 1565 1566 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), 1567 size_zero_node) < 0; 1568 tree offset = (negative 1569 ? 
size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) 1570 : size_zero_node); 1571 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq, 1572 offset); 1573 tree type = unsigned_type_for (TREE_TYPE (start_addr)); 1574 if (target_align.is_constant (&target_align_c)) 1575 target_align_minus_1 = build_int_cst (type, target_align_c - 1); 1576 else 1577 { 1578 tree vla = build_int_cst (type, target_align); 1579 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla, 1580 fold_build2 (MINUS_EXPR, type, 1581 build_int_cst (type, 0), vla)); 1582 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align, 1583 build_int_cst (type, 1)); 1584 } 1585 1586 HOST_WIDE_INT elem_size 1587 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1588 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size)); 1589 1590 /* Create: misalign_in_bytes = addr & (target_align - 1). */ 1591 tree int_start_addr = fold_convert (type, start_addr); 1592 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr, 1593 target_align_minus_1); 1594 1595 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */ 1596 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, 1597 elem_size_log); 1598 1599 return misalign_in_elems; 1600} 1601 1602/* Function vect_gen_prolog_loop_niters 1603 1604 Generate the number of iterations which should be peeled as prolog for the 1605 loop represented by LOOP_VINFO. It is calculated as the misalignment of 1606 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). 1607 As a result, after the execution of this loop, the data reference DR will 1608 refer to an aligned location. 
The following computation is generated: 1609 1610 If the misalignment of DR is known at compile time: 1611 addr_mis = int mis = DR_MISALIGNMENT (dr); 1612 Else, compute address misalignment in bytes: 1613 addr_mis = addr & (target_align - 1) 1614 1615 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step 1616 1617 (elem_size = element type size; an element is the scalar element whose type 1618 is the inner type of the vectype) 1619 1620 The computations will be emitted at the end of BB. We also compute and 1621 store upper bound (included) of the result in BOUND. 1622 1623 When the step of the data-ref in the loop is not 1 (as in interleaved data 1624 and SLP), the number of iterations of the prolog must be divided by the step 1625 (which is equal to the size of interleaved group). 1626 1627 The above formulas assume that VF == number of elements in the vector. This 1628 may not hold when there are multiple-types in the loop. 1629 In this case, for some data-references in the loop the VF does not represent 1630 the number of elements that fit in the vector. Therefore, instead of VF we 1631 use TYPE_VECTOR_SUBPARTS. 
*/ 1632 1633static tree 1634vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo, 1635 basic_block bb, int *bound) 1636{ 1637 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 1638 tree var; 1639 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)); 1640 gimple_seq stmts = NULL, new_stmts = NULL; 1641 tree iters, iters_name; 1642 stmt_vec_info stmt_info = dr_info->stmt; 1643 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 1644 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info); 1645 1646 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) 1647 { 1648 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 1649 1650 if (dump_enabled_p ()) 1651 dump_printf_loc (MSG_NOTE, vect_location, 1652 "known peeling = %d.\n", npeel); 1653 1654 iters = build_int_cst (niters_type, npeel); 1655 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 1656 } 1657 else 1658 { 1659 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo); 1660 tree type = TREE_TYPE (misalign_in_elems); 1661 HOST_WIDE_INT elem_size 1662 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype))); 1663 /* We only do prolog peeling if the target alignment is known at compile 1664 time. */ 1665 poly_uint64 align_in_elems = 1666 exact_div (target_align, elem_size); 1667 tree align_in_elems_minus_1 = 1668 build_int_cst (type, align_in_elems - 1); 1669 tree align_in_elems_tree = build_int_cst (type, align_in_elems); 1670 1671 /* Create: (niters_type) ((align_in_elems - misalign_in_elems) 1672 & (align_in_elems - 1)). 
*/ 1673 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr), 1674 size_zero_node) < 0; 1675 if (negative) 1676 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems, 1677 align_in_elems_tree); 1678 else 1679 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree, 1680 misalign_in_elems); 1681 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1); 1682 iters = fold_convert (niters_type, iters); 1683 unsigned HOST_WIDE_INT align_in_elems_c; 1684 if (align_in_elems.is_constant (&align_in_elems_c)) 1685 *bound = align_in_elems_c - 1; 1686 else 1687 *bound = -1; 1688 } 1689 1690 if (dump_enabled_p ()) 1691 dump_printf_loc (MSG_NOTE, vect_location, 1692 "niters for prolog loop: %T\n", iters); 1693 1694 var = create_tmp_var (niters_type, "prolog_loop_niters"); 1695 iters_name = force_gimple_operand (iters, &new_stmts, false, var); 1696 1697 if (new_stmts) 1698 gimple_seq_add_seq (&stmts, new_stmts); 1699 if (stmts) 1700 { 1701 gcc_assert (single_succ_p (bb)); 1702 gimple_stmt_iterator gsi = gsi_last_bb (bb); 1703 if (gsi_end_p (gsi)) 1704 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT); 1705 else 1706 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT); 1707 } 1708 return iters_name; 1709} 1710 1711 1712/* Function vect_update_init_of_dr 1713 1714 If CODE is PLUS, the vector loop starts NITERS iterations after the 1715 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS 1716 iterations before the scalar one (using masking to skip inactive 1717 elements). This function updates the information recorded in DR to 1718 account for the difference. Specifically, it updates the OFFSET 1719 field of DR_INFO. 
*/ 1720 1721static void 1722vect_update_init_of_dr (dr_vec_info *dr_info, tree niters, tree_code code) 1723{ 1724 struct data_reference *dr = dr_info->dr; 1725 tree offset = dr_info->offset; 1726 if (!offset) 1727 offset = build_zero_cst (sizetype); 1728 1729 niters = fold_build2 (MULT_EXPR, sizetype, 1730 fold_convert (sizetype, niters), 1731 fold_convert (sizetype, DR_STEP (dr))); 1732 offset = fold_build2 (code, sizetype, 1733 fold_convert (sizetype, offset), niters); 1734 dr_info->offset = offset; 1735} 1736 1737 1738/* Function vect_update_inits_of_drs 1739 1740 Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. 1741 CODE and NITERS are as for vect_update_inits_of_dr. */ 1742 1743void 1744vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, 1745 tree_code code) 1746{ 1747 unsigned int i; 1748 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); 1749 struct data_reference *dr; 1750 1751 DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); 1752 1753 /* Adjust niters to sizetype. We used to insert the stmts on loop preheader 1754 here, but since we might use these niters to update the epilogues niters 1755 and data references we can't insert them here as this definition might not 1756 always dominate its uses. */ 1757 if (!types_compatible_p (sizetype, TREE_TYPE (niters))) 1758 niters = fold_convert (sizetype, niters); 1759 1760 FOR_EACH_VEC_ELT (datarefs, i, dr) 1761 { 1762 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr); 1763 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt) 1764 && !STMT_VINFO_SIMD_LANE_ACCESS_P (dr_info->stmt)) 1765 vect_update_init_of_dr (dr_info, niters, code); 1766 } 1767} 1768 1769/* For the information recorded in LOOP_VINFO prepare the loop for peeling 1770 by masking. This involves calculating the number of iterations to 1771 be peeled and then aligning all memory references appropriately. 
*/ 1772 1773void 1774vect_prepare_for_masked_peels (loop_vec_info loop_vinfo) 1775{ 1776 tree misalign_in_elems; 1777 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo); 1778 1779 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo)); 1780 1781 /* From the information recorded in LOOP_VINFO get the number of iterations 1782 that need to be skipped via masking. */ 1783 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) 1784 { 1785 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo) 1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); 1787 misalign_in_elems = build_int_cst (type, misalign); 1788 } 1789 else 1790 { 1791 gimple_seq seq1 = NULL, seq2 = NULL; 1792 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo); 1793 misalign_in_elems = fold_convert (type, misalign_in_elems); 1794 misalign_in_elems = force_gimple_operand (misalign_in_elems, 1795 &seq2, true, NULL_TREE); 1796 gimple_seq_add_seq (&seq1, seq2); 1797 if (seq1) 1798 { 1799 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); 1800 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1); 1801 gcc_assert (!new_bb); 1802 } 1803 } 1804 1805 if (dump_enabled_p ()) 1806 dump_printf_loc (MSG_NOTE, vect_location, 1807 "misalignment for fully-masked loop: %T\n", 1808 misalign_in_elems); 1809 1810 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems; 1811 1812 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR); 1813} 1814 1815/* This function builds ni_name = number of iterations. Statements 1816 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set 1817 it to TRUE if new ssa_var is generated. 
*/

tree
vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
{
  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
  /* A constant iteration count needs no code on the preheader edge.  */
  if (TREE_CODE (ni) == INTEGER_CST)
    return ni;
  else
    {
      tree ni_name, var;
      gimple_seq stmts = NULL;
      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));

      var = create_tmp_var (TREE_TYPE (ni), "niters");
      ni_name = force_gimple_operand (ni, &stmts, false, var);
      /* Only report a new variable if gimplification actually emitted
	 statements (and hence defined a fresh SSA name).  */
      if (stmts)
	{
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	  if (new_var_p != NULL)
	    *new_var_p = true;
	}

      return ni_name;
    }
}

/* Calculate the number of iterations above which the vectorized loop will
   be preferred to the scalar loop.  NITERS_PROLOG is the number of
   iterations of the prolog loop.  If it's an integer const, the integer
   number is also passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper
   bound (inclusive) of the number of iterations of the prolog loop.
   BOUND_EPILOG is the corresponding value for the epilog loop.  If
   CHECK_PROFITABILITY is true, TH is the threshold below which the scalar
   (rather than vectorized) loop will be executed.  This function stores
   the upper bound (inclusive) of the result in BOUND_SCALAR.  */

static tree
vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
			     int bound_prolog, poly_int64 bound_epilog, int th,
			     poly_uint64 *bound_scalar,
			     bool check_profitability)
{
  tree type = TREE_TYPE (niters_prolog);
  tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
			     build_int_cst (type, bound_epilog));

  *bound_scalar = bound_prolog + bound_epilog;
  if (check_profitability)
    {
      /* TH indicates the minimum niters of vectorized loop, while we
	 compute the maximum niters of scalar loop.  */
      th--;
      /* Peeling for constant times.  */
      if (int_niters_prolog >= 0)
	{
	  *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
	  return build_int_cst (type, *bound_scalar);
	}
      /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
	 and BOUND_EPILOG are inclusive upper bounds.  */
      if (known_ge (th, bound_prolog + bound_epilog))
	{
	  *bound_scalar = th;
	  return build_int_cst (type, th);
	}
      /* Need to do runtime comparison.  */
      else if (maybe_gt (th, bound_epilog))
	{
	  *bound_scalar = upper_bound (*bound_scalar, th);
	  return fold_build2 (MAX_EXPR, type,
			      build_int_cst (type, th), niters);
	}
    }
  return niters;
}

/* NITERS is the number of times that the original scalar loop executes
   after peeling.  Work out the maximum number of iterations N that can
   be handled by the vectorized form of the loop and then either:

   a) set *STEP_VECTOR_PTR to the vectorization factor and generate:

	niters_vector = N

   b) set *STEP_VECTOR_PTR to one and generate:

	niters_vector = N / vf

   In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
   any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
   is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */

void
vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
			     tree *niters_vector_ptr, tree *step_vector_ptr,
			     bool niters_no_overflow)
{
  tree ni_minus_gap, var;
  tree niters_vector, step_vector, type = TREE_TYPE (niters);
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  tree log_vf = NULL_TREE;

  /* If epilogue loop is required because of data accesses with gaps, we
     subtract one iteration from the total number of iterations here for
     correct calculation of RATIO.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
    {
      ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
				  build_one_cst (type));
      if (!is_gimple_val (ni_minus_gap))
	{
	  var = create_tmp_var (type, "ni_gap");
	  gimple *stmts = NULL;
	  ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
					       true, var);
	  gsi_insert_seq_on_edge_immediate (pe, stmts);
	}
    }
  else
    ni_minus_gap = niters;

  /* LOG_VF stays NULL_TREE unless this branch is taken; the range-info
     code below keys off that.  */
  unsigned HOST_WIDE_INT const_vf;
  if (vf.is_constant (&const_vf)
      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Create: niters >> log2(vf) */
      /* If it's known that niters == number of latch executions + 1 doesn't
	 overflow, we can generate niters >> log2(vf); otherwise we generate
	 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
	 will be at least one.  */
      log_vf = build_int_cst (type, exact_log2 (const_vf));
      if (niters_no_overflow)
	niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
      else
	niters_vector
	  = fold_build2 (PLUS_EXPR, type,
			 fold_build2 (RSHIFT_EXPR, type,
				      fold_build2 (MINUS_EXPR, type,
						   ni_minus_gap,
						   build_int_cst (type, vf)),
				      log_vf),
			 build_int_cst (type, 1));
      step_vector = build_one_cst (type);
    }
  else
    {
      /* Variable VF or fully-masked loop: count scalar iterations and
	 step by VF instead of dividing up front.  */
      niters_vector = ni_minus_gap;
      step_vector = build_int_cst (type, vf);
    }

  if (!is_gimple_val (niters_vector))
    {
      var = create_tmp_var (type, "bnd");
      gimple_seq stmts = NULL;
      niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
      /* Peeling algorithm guarantees that vector loop bound is at least ONE,
	 we set range information to make niters analyzer's life easier.
	 Note the number of latch iteration value can be TYPE_MAX_VALUE so
	 we have to represent the vector niter TYPE_MAX_VALUE + 1 >> log_vf.  */
      if (stmts != NULL && log_vf)
	{
	  if (niters_no_overflow)
	    set_range_info (niters_vector, VR_RANGE,
			    wi::one (TYPE_PRECISION (type)),
			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
						       TYPE_SIGN (type)),
					exact_log2 (const_vf),
					TYPE_SIGN (type)));
	  /* For VF == 1 the vector IV might also overflow so we cannot
	     assert a minimum value of 1.  */
	  else if (const_vf > 1)
	    set_range_info (niters_vector, VR_RANGE,
			    wi::one (TYPE_PRECISION (type)),
			    wi::rshift (wi::max_value (TYPE_PRECISION (type),
						       TYPE_SIGN (type))
					- (const_vf - 1),
					exact_log2 (const_vf), TYPE_SIGN (type))
			    + 1);
	}
    }
  *niters_vector_ptr = niters_vector;
  *step_vector_ptr = step_vector;

  return;
}

/* Given NITERS_VECTOR which is the number of iterations for vectorized
   loop specified by LOOP_VINFO after vectorization, compute the number
   of iterations before vectorization (niters_vector * vf) and store it
   to NITERS_VECTOR_MULT_VF_PTR.  */

static void
vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
				     tree niters_vector,
				     tree *niters_vector_mult_vf_ptr)
{
  /* We should be using a step_vector of VF if VF is variable.  */
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree type = TREE_TYPE (niters_vector);
  /* VF is a power of two here, so the multiplication becomes a shift.  */
  tree log_vf = build_int_cst (type, exact_log2 (vf));
  basic_block exit_bb = single_exit (loop)->dest;

  gcc_assert (niters_vector_mult_vf_ptr != NULL);
  tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
					    niters_vector, log_vf);
  if (!is_gimple_val (niters_vector_mult_vf))
    {
      tree var = create_tmp_var (type, "niters_vector_mult_vf");
      gimple_seq stmts = NULL;
      niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
						    &stmts, true, var);
      /* Emit the computation at the start of the loop's exit block.  */
      gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
      gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
    }
  *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
}

/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
   this function searches for the corresponding lcssa phi node in exit
   bb of LOOP.  If it is found, return the phi result; otherwise return
   NULL.  */

static tree
find_guard_arg (class loop *loop, class loop *epilog ATTRIBUTE_UNUSED,
		gphi *lcssa_phi)
{
  gphi_iterator gsi;
  edge e = single_exit (loop);

  /* LCSSA PHIs in a single-predecessor exit block have exactly one
     argument, so comparing argument 0 identifies matching PHIs.  */
  gcc_assert (single_pred_p (e->dest));
  for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      if (operand_equal_p (PHI_ARG_DEF (phi, 0),
			   PHI_ARG_DEF (lcssa_phi, 0), 0))
	return PHI_RESULT (phi);
    }
  return NULL_TREE;
}

/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
   from SECOND/FIRST and puts it at the original loop's preheader/exit
   edge, the two loops are arranged as below:

       preheader_a:
     first_loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto between_bb;
       latch_a:
	 goto header_a;

     between_bb:
       ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,

     second_loop:
       header_b:
	 i_3 = PHI<i_0, i_4>;  ;; Use of i_0 to be replaced with i_x,
				 or with i_2 if no LCSSA phi is created
				 under condition of CREATE_LCSSA_FOR_IV_PHIS.
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto exit_bb;
       latch_b:
	 goto header_b;

     exit_bb:

   This function creates loop closed SSA for the first loop; update the
   second loop's PHI nodes by replacing argument on incoming edge with the
   result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
   is false, Loop closed ssa phis will only be created for non-iv phis for
   the first loop.

   This function assumes exit bb of the first loop is preheader bb of the
   second loop, i.e, between_bb in the example code.  With PHIs updated,
   the second loop will execute rest iterations of the first.  */

static void
slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
				   class loop *first, class loop *second,
				   bool create_lcssa_for_iv_phis)
{
  gphi_iterator gsi_update, gsi_orig;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  edge first_latch_e = EDGE_SUCC (first->latch, 0);
  edge second_preheader_e = loop_preheader_edge (second);
  basic_block between_bb = single_exit (first)->dest;

  gcc_assert (between_bb == second_preheader_e->src);
  gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
  /* Either the first loop or the second is the loop to be vectorized.  */
  gcc_assert (loop == first || loop == second);

  /* The second loop is a copy of the first, so their header PHIs pair
     up; walk both PHI sequences in lockstep.  */
  for (gsi_orig = gsi_start_phis (first->header),
       gsi_update = gsi_start_phis (second->header);
       !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
       gsi_next (&gsi_orig), gsi_next (&gsi_update))
    {
      gphi *orig_phi = gsi_orig.phi ();
      gphi *update_phi = gsi_update.phi ();

      tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
      /* Generate lcssa PHI node for the first loop.  */
      gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
      stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
      if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
	{
	  tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	  gphi *lcssa_phi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
	  arg = new_res;
	}

      /* Update PHI node in the second loop by replacing arg on the loop's
	 incoming edge.  */
      adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
    }

  /* For epilogue peeling we have to make sure to copy all LC PHIs
     for correct vectorization of live stmts.  */
  if (loop == first)
    {
      basic_block orig_exit = single_exit (second)->dest;
      for (gsi_orig = gsi_start_phis (orig_exit);
	   !gsi_end_p (gsi_orig); gsi_next (&gsi_orig))
	{
	  gphi *orig_phi = gsi_orig.phi ();
	  tree orig_arg = PHI_ARG_DEF (orig_phi, 0);
	  /* Constants and virtual operands need no LC PHI copy.  */
	  if (TREE_CODE (orig_arg) != SSA_NAME || virtual_operand_p (orig_arg))
	    continue;

	  /* Already created in the above loop.  */
	  if (find_guard_arg (first, second, orig_phi))
	    continue;

	  tree new_res = copy_ssa_name (orig_arg);
	  gphi *lcphi = create_phi_node (new_res, between_bb);
	  add_phi_arg (lcphi, orig_arg, single_exit (first), UNKNOWN_LOCATION);
	}
    }
}

/* Function slpeel_add_loop_guard adds guard skipping from the beginning
   of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
   are two pred edges of the merge point before UPDATE_LOOP.  The two loops
   appear like below:

     guard_bb:
       if (cond)
	 goto merge_bb;
       else
	 goto skip_loop;

     skip_loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto exit_a;
       latch_a:
	 goto header_a;

     exit_a:
       i_5 = PHI<i_2>;

     merge_bb:
       ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.

     update_loop:
       header_b:
	 i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto exit_bb;
       latch_b:
	 goto header_b;

     exit_bb:

   This function creates PHI nodes at merge_bb and replaces the use of i_5
   in the update_loop's PHI node with the result of new PHI result.
*/

static void
slpeel_update_phi_nodes_for_guard1 (class loop *skip_loop,
				    class loop *update_loop,
				    edge guard_edge, edge merge_edge)
{
  edge skip_preheader_e = loop_preheader_edge (skip_loop);
  edge update_preheader_e = loop_preheader_edge (update_loop);

  gphi_iterator it_skip = gsi_start_phis (skip_loop->header);
  gphi_iterator it_update = gsi_start_phis (update_loop->header);

  /* The two loops are copies of each other, so their header PHIs pair
     up; walk both sequences in lockstep.  */
  while (!gsi_end_p (it_skip) && !gsi_end_p (it_update))
    {
      gphi *skip_phi = it_skip.phi ();
      gphi *update_phi = it_update.phi ();

      /* Materialize the merge-point PHI in the guard's destination.  */
      tree merged_res = copy_ssa_name (PHI_RESULT (skip_phi));
      gphi *merged_phi = create_phi_node (merged_res, guard_edge->dest);

      /* The merge block is reached either via MERGE_EDGE (after the skip
	 loop has run) or via GUARD_EDGE (the skip loop was bypassed);
	 wire up the corresponding values with their source locations.  */
      tree val_from_loop = PHI_ARG_DEF_FROM_EDGE (update_phi,
						  update_preheader_e);
      location_t loc_from_loop
	= gimple_phi_arg_location_from_edge (update_phi, update_preheader_e);
      add_phi_arg (merged_phi, val_from_loop, merge_edge, loc_from_loop);

      tree val_from_guard = PHI_ARG_DEF_FROM_EDGE (skip_phi,
						   skip_preheader_e);
      location_t loc_from_guard
	= gimple_phi_arg_location_from_edge (skip_phi, skip_preheader_e);
      add_phi_arg (merged_phi, val_from_guard, guard_edge, loc_from_guard);

      /* Let UPDATE_LOOP's header PHI consume the merged value instead.  */
      adjust_phi_and_debug_stmts (update_phi, update_preheader_e, merged_res);

      gsi_next (&it_skip);
      gsi_next (&it_update);
    }
}

/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
   from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
   point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
   and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
   The CFG looks like:

     loop:
       header_a:
	 i_1 = PHI<i_0, i_2>;
	 ...
	 i_2 = i_1 + 1;
	 if (cond_a)
	   goto latch_a;
	 else
	   goto exit_a;
       latch_a:
	 goto header_a;

     exit_a:

     guard_bb:
       if (cond)
	 goto merge_bb;
       else
	 goto epilog_loop;

     ;; fall_through_bb

     epilog_loop:
       header_b:
	 i_3 = PHI<i_2, i_4>;
	 ...
	 i_4 = i_3 + 1;
	 if (cond_b)
	   goto latch_b;
	 else
	   goto merge_bb;
       latch_b:
	 goto header_b;

     merge_bb:
       ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.

     exit_bb:
       i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.

   For each name used outside EPILOG (i.e. - for each name that has a lcssa
   phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
   args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
   the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
   by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
   in exit_bb will also be updated.  */

static void
slpeel_update_phi_nodes_for_guard2 (class loop *loop, class loop *epilog,
				    edge guard_edge, edge merge_edge)
{
  gphi_iterator gsi;
  basic_block merge_bb = guard_edge->dest;

  gcc_assert (single_succ_p (merge_bb));
  edge e = single_succ_edge (merge_bb);
  basic_block exit_bb = e->dest;
  gcc_assert (single_pred_p (exit_bb));
  gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);

  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *update_phi = gsi.phi ();
      tree old_arg = PHI_ARG_DEF (update_phi, 0);

      tree merge_arg = NULL_TREE;

      /* If the old argument is a SSA_NAME use its current_def.  */
      if (TREE_CODE (old_arg) == SSA_NAME)
	merge_arg = get_current_def (old_arg);
      /* If it's a constant or doesn't have a current_def, just use the old
	 argument.  */
      if (!merge_arg)
	merge_arg = old_arg;

      tree guard_arg = find_guard_arg (loop, epilog, update_phi);
      /* If the var is live after loop but not a reduction, we simply
	 use the old arg.  */
      if (!guard_arg)
	guard_arg = old_arg;

      /* Create new phi node in MERGE_BB:  */
      tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
      gphi *merge_phi = create_phi_node (new_res, merge_bb);

      /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
	 the two PHI args in merge_phi for these edges.  */
      add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
      add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);

      /* Update the original phi in exit_bb.  */
      adjust_phi_and_debug_stmts (update_phi, e, new_res);
    }
}

/* EPILOG loop is duplicated from the original loop for vectorizing,
   the arg of its loop closed ssa PHI needs to be updated.  */

static void
slpeel_update_phi_nodes_for_lcssa (class loop *epilog)
{
  gphi_iterator gsi;
  basic_block exit_bb = single_exit (epilog)->dest;

  /* EXIT_BB has a single predecessor, so each LCSSA PHI here has exactly
     one argument; rename each to its current definition.  */
  gcc_assert (single_pred_p (exit_bb));
  edge e = EDGE_PRED (exit_bb, 0);
  for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
    rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
}

/* Function vect_do_peeling.

   Input:
   - LOOP_VINFO: Represent a loop to be vectorized, which looks like:

       preheader:
     LOOP:
       header_bb:
	 loop_body
	 if (exit_loop_cond) goto exit_bb
	 else                goto header_bb
       exit_bb:

   - NITERS: The number of iterations of the loop.
   - NITERSM1: The number of iterations of the loop's latch.
   - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
   - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
     CHECK_PROFITABILITY is true.
2389 Output: 2390 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should 2391 iterate after vectorization; see vect_set_loop_condition for details. 2392 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that 2393 should be set to the number of scalar iterations handled by the 2394 vector loop. The SSA name is only used on exit from the loop. 2395 2396 This function peels prolog and epilog from the loop, adds guards skipping 2397 PROLOG and EPILOG for various conditions. As a result, the changed CFG 2398 would look like: 2399 2400 guard_bb_1: 2401 if (prefer_scalar_loop) goto merge_bb_1 2402 else goto guard_bb_2 2403 2404 guard_bb_2: 2405 if (skip_prolog) goto merge_bb_2 2406 else goto prolog_preheader 2407 2408 prolog_preheader: 2409 PROLOG: 2410 prolog_header_bb: 2411 prolog_body 2412 if (exit_prolog_cond) goto prolog_exit_bb 2413 else goto prolog_header_bb 2414 prolog_exit_bb: 2415 2416 merge_bb_2: 2417 2418 vector_preheader: 2419 VECTOR LOOP: 2420 vector_header_bb: 2421 vector_body 2422 if (exit_vector_cond) goto vector_exit_bb 2423 else goto vector_header_bb 2424 vector_exit_bb: 2425 2426 guard_bb_3: 2427 if (skip_epilog) goto merge_bb_3 2428 else goto epilog_preheader 2429 2430 merge_bb_1: 2431 2432 epilog_preheader: 2433 EPILOG: 2434 epilog_header_bb: 2435 epilog_body 2436 if (exit_epilog_cond) goto merge_bb_3 2437 else goto epilog_header_bb 2438 2439 merge_bb_3: 2440 2441 Note this function peels prolog and epilog only if it's necessary, 2442 as well as guards. 2443 This function returns the epilogue loop if a decision was made to vectorize 2444 it, otherwise NULL. 2445 2446 The analysis resulting in this epilogue loop's loop_vec_info was performed 2447 in the same vect_analyze_loop call as the main loop's. At that time 2448 vect_analyze_loop constructs a list of accepted loop_vec_info's for lower 2449 vectorization factors than the main loop. This list is stored in the main 2450 loop's loop_vec_info in the 'epilogue_vinfos' member. 
Everytime we decide to 2451 vectorize the epilogue loop for a lower vectorization factor, the 2452 loop_vec_info sitting at the top of the epilogue_vinfos list is removed, 2453 updated and linked to the epilogue loop. This is later used to vectorize 2454 the epilogue. The reason the loop_vec_info needs updating is that it was 2455 constructed based on the original main loop, and the epilogue loop is a 2456 copy of this loop, so all links pointing to statements in the original loop 2457 need updating. Furthermore, these loop_vec_infos share the 2458 data_reference's records, which will also need to be updated. 2459 2460 TODO: Guard for prefer_scalar_loop should be emitted along with 2461 versioning conditions if loop versioning is needed. */ 2462 2463 2464class loop * 2465vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, 2466 tree *niters_vector, tree *step_vector, 2467 tree *niters_vector_mult_vf_var, int th, 2468 bool check_profitability, bool niters_no_overflow, 2469 tree *advance) 2470{ 2471 edge e, guard_e; 2472 tree type = TREE_TYPE (niters), guard_cond; 2473 basic_block guard_bb, guard_to; 2474 profile_probability prob_prolog, prob_vector, prob_epilog; 2475 int estimated_vf; 2476 int prolog_peeling = 0; 2477 bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; 2478 /* We currently do not support prolog peeling if the target alignment is not 2479 known at compile time. 'vect_gen_prolog_loop_niters' depends on the 2480 target alignment being constant. 
*/ 2481 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); 2482 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ()) 2483 return NULL; 2484 2485 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo)) 2486 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); 2487 2488 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); 2489 poly_uint64 bound_epilog = 0; 2490 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) 2491 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)) 2492 bound_epilog += vf - 1; 2493 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2494 bound_epilog += 1; 2495 bool epilog_peeling = maybe_ne (bound_epilog, 0U); 2496 poly_uint64 bound_scalar = bound_epilog; 2497 2498 if (!prolog_peeling && !epilog_peeling) 2499 return NULL; 2500 2501 /* Before doing any peeling make sure to reset debug binds outside of 2502 the loop refering to defs not in LC SSA. */ 2503 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo); 2504 for (unsigned i = 0; i < loop->num_nodes; ++i) 2505 { 2506 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i]; 2507 imm_use_iterator ui; 2508 gimple *use_stmt; 2509 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi); 2510 gsi_next (&gsi)) 2511 { 2512 FOR_EACH_IMM_USE_STMT (use_stmt, ui, gimple_phi_result (gsi.phi ())) 2513 if (gimple_debug_bind_p (use_stmt) 2514 && loop != gimple_bb (use_stmt)->loop_father 2515 && !flow_loop_nested_p (loop, 2516 gimple_bb (use_stmt)->loop_father)) 2517 { 2518 gimple_debug_bind_reset_value (use_stmt); 2519 update_stmt (use_stmt); 2520 } 2521 } 2522 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi); 2523 gsi_next (&gsi)) 2524 { 2525 ssa_op_iter op_iter; 2526 def_operand_p def_p; 2527 FOR_EACH_SSA_DEF_OPERAND (def_p, gsi_stmt (gsi), op_iter, SSA_OP_DEF) 2528 FOR_EACH_IMM_USE_STMT (use_stmt, ui, DEF_FROM_PTR (def_p)) 2529 if (gimple_debug_bind_p (use_stmt) 2530 && loop != gimple_bb (use_stmt)->loop_father 2531 && !flow_loop_nested_p (loop, 2532 gimple_bb (use_stmt)->loop_father)) 
2533 { 2534 gimple_debug_bind_reset_value (use_stmt); 2535 update_stmt (use_stmt); 2536 } 2537 } 2538 } 2539 2540 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10); 2541 estimated_vf = vect_vf_for_cost (loop_vinfo); 2542 if (estimated_vf == 2) 2543 estimated_vf = 3; 2544 prob_prolog = prob_epilog = profile_probability::guessed_always () 2545 .apply_scale (estimated_vf - 1, estimated_vf); 2546 2547 class loop *prolog, *epilog = NULL; 2548 class loop *first_loop = loop; 2549 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP; 2550 2551 /* We might have a queued need to update virtual SSA form. As we 2552 delete the update SSA machinery below after doing a regular 2553 incremental SSA update during loop copying make sure we don't 2554 lose that fact. 2555 ??? Needing to update virtual SSA form by renaming is unfortunate 2556 but not all of the vectorizer code inserting new loads / stores 2557 properly assigns virtual operands to those statements. */ 2558 update_ssa (TODO_update_ssa_only_virtuals); 2559 2560 create_lcssa_for_virtual_phi (loop); 2561 2562 /* If we're vectorizing an epilogue loop, the update_ssa above will 2563 have ensured that the virtual operand is in SSA form throughout the 2564 vectorized main loop. Normally it is possible to trace the updated 2565 vector-stmt vdefs back to scalar-stmt vdefs and vector-stmt vuses 2566 back to scalar-stmt vuses, meaning that the effect of the SSA update 2567 remains local to the main loop. However, there are rare cases in 2568 which the vectorized loop has vdefs even when the original scalar 2569 loop didn't. For example, vectorizing a load with IFN_LOAD_LANES 2570 introduces clobbers of the temporary vector array, which in turn 2571 needs new vdefs. If the scalar loop doesn't write to memory, these 2572 new vdefs will be the only ones in the vector loop. 
2573 2574 In that case, update_ssa will have added a new virtual phi to the 2575 main loop, which previously didn't need one. Ensure that we (locally) 2576 maintain LCSSA form for the virtual operand, just as we would have 2577 done if the virtual phi had existed from the outset. This makes it 2578 easier to duplicate the scalar epilogue loop below. */ 2579 tree vop_to_rename = NULL_TREE; 2580 if (loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) 2581 { 2582 class loop *orig_loop = LOOP_VINFO_LOOP (orig_loop_vinfo); 2583 vop_to_rename = create_lcssa_for_virtual_phi (orig_loop); 2584 } 2585 2586 if (MAY_HAVE_DEBUG_BIND_STMTS) 2587 { 2588 gcc_assert (!adjust_vec.exists ()); 2589 adjust_vec.create (32); 2590 } 2591 initialize_original_copy_tables (); 2592 2593 /* Record the anchor bb at which the guard should be placed if the scalar 2594 loop might be preferred. */ 2595 basic_block anchor = loop_preheader_edge (loop)->src; 2596 2597 /* Generate the number of iterations for the prolog loop. We do this here 2598 so that we can also get the upper bound on the number of iterations. */ 2599 tree niters_prolog; 2600 int bound_prolog = 0; 2601 if (prolog_peeling) 2602 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, 2603 &bound_prolog); 2604 else 2605 niters_prolog = build_int_cst (type, 0); 2606 2607 loop_vec_info epilogue_vinfo = NULL; 2608 if (vect_epilogues) 2609 { 2610 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; 2611 loop_vinfo->epilogue_vinfos.ordered_remove (0); 2612 } 2613 2614 tree niters_vector_mult_vf = NULL_TREE; 2615 /* Saving NITERs before the loop, as this may be changed by prologue. 
*/ 2616 tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); 2617 edge update_e = NULL, skip_e = NULL; 2618 unsigned int lowest_vf = constant_lower_bound (vf); 2619 /* If we know the number of scalar iterations for the main loop we should 2620 check whether after the main loop there are enough iterations left over 2621 for the epilogue. */ 2622 if (vect_epilogues 2623 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2624 && prolog_peeling >= 0 2625 && known_eq (vf, lowest_vf)) 2626 { 2627 unsigned HOST_WIDE_INT eiters 2628 = (LOOP_VINFO_INT_NITERS (loop_vinfo) 2629 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); 2630 2631 eiters -= prolog_peeling; 2632 eiters 2633 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); 2634 2635 unsigned int ratio; 2636 unsigned int epilogue_gaps 2637 = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); 2638 while (!(constant_multiple_p 2639 (GET_MODE_SIZE (loop_vinfo->vector_mode), 2640 GET_MODE_SIZE (epilogue_vinfo->vector_mode), &ratio) 2641 && eiters >= lowest_vf / ratio + epilogue_gaps)) 2642 { 2643 delete epilogue_vinfo; 2644 epilogue_vinfo = NULL; 2645 if (loop_vinfo->epilogue_vinfos.length () == 0) 2646 { 2647 vect_epilogues = false; 2648 break; 2649 } 2650 epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; 2651 loop_vinfo->epilogue_vinfos.ordered_remove (0); 2652 epilogue_gaps = LOOP_VINFO_PEELING_FOR_GAPS (epilogue_vinfo); 2653 } 2654 } 2655 /* Prolog loop may be skipped. */ 2656 bool skip_prolog = (prolog_peeling != 0); 2657 /* Skip this loop to epilog when there are not enough iterations to enter this 2658 vectorized loop. If true we should perform runtime checks on the NITERS 2659 to check whether we should skip the current vectorized loop. 
If we know 2660 the number of scalar iterations we may choose to add a runtime check if 2661 this number "maybe" smaller than the number of iterations required 2662 when we know the number of scalar iterations may potentially 2663 be smaller than the number of iterations required to enter this loop, for 2664 this we use the upper bounds on the prolog and epilog peeling. When we 2665 don't know the number of iterations and don't require versioning it is 2666 because we have asserted that there are enough scalar iterations to enter 2667 the main loop, so this skip is not necessary. When we are versioning then 2668 we only add such a skip if we have chosen to vectorize the epilogue. */ 2669 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2670 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), 2671 bound_prolog + bound_epilog) 2672 : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) 2673 || vect_epilogues)); 2674 /* Epilog loop must be executed if the number of iterations for epilog 2675 loop is known at compile time, otherwise we need to add a check at 2676 the end of vector loop and skip to the end of epilog loop. */ 2677 bool skip_epilog = (prolog_peeling < 0 2678 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) 2679 || !vf.is_constant ()); 2680 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */ 2681 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)) 2682 skip_epilog = false; 2683 2684 if (skip_vector) 2685 { 2686 split_edge (loop_preheader_edge (loop)); 2687 2688 /* Due to the order in which we peel prolog and epilog, we first 2689 propagate probability to the whole loop. The purpose is to 2690 avoid adjusting probabilities of both prolog and vector loops 2691 separately. Note in this case, the probability of epilog loop 2692 needs to be scaled back later. 
*/ 2693 basic_block bb_before_loop = loop_preheader_edge (loop)->src; 2694 if (prob_vector.initialized_p ()) 2695 { 2696 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector); 2697 scale_loop_profile (loop, prob_vector, 0); 2698 } 2699 } 2700 2701 dump_user_location_t loop_loc = find_loop_location (loop); 2702 class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); 2703 if (vect_epilogues) 2704 /* Make sure to set the epilogue's epilogue scalar loop, such that we can 2705 use the original scalar loop as remaining epilogue if necessary. */ 2706 LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) 2707 = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); 2708 2709 if (prolog_peeling) 2710 { 2711 e = loop_preheader_edge (loop); 2712 if (!slpeel_can_duplicate_loop_p (loop, e)) 2713 { 2714 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, 2715 "loop can't be duplicated to preheader edge.\n"); 2716 gcc_unreachable (); 2717 } 2718 /* Peel prolog and put it on preheader edge of loop. */ 2719 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); 2720 if (!prolog) 2721 { 2722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, 2723 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); 2724 gcc_unreachable (); 2725 } 2726 prolog->force_vectorize = false; 2727 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true); 2728 first_loop = prolog; 2729 reset_original_copy_tables (); 2730 2731 /* Update the number of iterations for prolog loop. */ 2732 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog)); 2733 vect_set_loop_condition (prolog, NULL, niters_prolog, 2734 step_prolog, NULL_TREE, false); 2735 2736 /* Skip the prolog loop. 
*/ 2737 if (skip_prolog) 2738 { 2739 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, 2740 niters_prolog, build_int_cst (type, 0)); 2741 guard_bb = loop_preheader_edge (prolog)->src; 2742 basic_block bb_after_prolog = loop_preheader_edge (loop)->src; 2743 guard_to = split_edge (loop_preheader_edge (loop)); 2744 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, 2745 guard_to, guard_bb, 2746 prob_prolog.invert (), 2747 irred_flag); 2748 e = EDGE_PRED (guard_to, 0); 2749 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); 2750 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e); 2751 2752 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); 2753 scale_loop_profile (prolog, prob_prolog, bound_prolog); 2754 } 2755 2756 /* Update init address of DRs. */ 2757 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); 2758 /* Update niters for vector loop. */ 2759 LOOP_VINFO_NITERS (loop_vinfo) 2760 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog); 2761 LOOP_VINFO_NITERSM1 (loop_vinfo) 2762 = fold_build2 (MINUS_EXPR, type, 2763 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog); 2764 bool new_var_p = false; 2765 niters = vect_build_loop_niters (loop_vinfo, &new_var_p); 2766 /* It's guaranteed that vector loop bound before vectorization is at 2767 least VF, so set range information for newly generated var. */ 2768 if (new_var_p) 2769 set_range_info (niters, VR_RANGE, 2770 wi::to_wide (build_int_cst (type, vf)), 2771 wi::to_wide (TYPE_MAX_VALUE (type))); 2772 2773 /* Prolog iterates at most bound_prolog times, latch iterates at 2774 most bound_prolog - 1 times. 
*/ 2775 record_niter_bound (prolog, bound_prolog - 1, false, true); 2776 delete_update_ssa (); 2777 adjust_vec_debug_stmts (); 2778 scev_reset (); 2779 } 2780 2781 if (epilog_peeling) 2782 { 2783 e = single_exit (loop); 2784 if (!slpeel_can_duplicate_loop_p (loop, e)) 2785 { 2786 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, 2787 "loop can't be duplicated to exit edge.\n"); 2788 gcc_unreachable (); 2789 } 2790 /* Peel epilog and put it on exit edge of loop. If we are vectorizing 2791 said epilog then we should use a copy of the main loop as a starting 2792 point. This loop may have already had some preliminary transformations 2793 to allow for more optimal vectorization, for example if-conversion. 2794 If we are not vectorizing the epilog then we should use the scalar loop 2795 as the transformations mentioned above make less or no sense when not 2796 vectorizing. */ 2797 epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; 2798 if (vop_to_rename) 2799 { 2800 /* Vectorizing the main loop can sometimes introduce a vdef to 2801 a loop that previously didn't have one; see the comment above 2802 the definition of VOP_TO_RENAME for details. The definition 2803 D that holds on E will then be different from the definition 2804 VOP_TO_RENAME that holds during SCALAR_LOOP, so we need to 2805 rename VOP_TO_RENAME to D when copying the loop. 2806 2807 The virtual operand is in LCSSA form for the main loop, 2808 and no stmt between the main loop and E needs a vdef, 2809 so we know that D is provided by a phi rather than by a 2810 vdef on a normal gimple stmt. 
*/ 2811 basic_block vdef_bb = e->src; 2812 gphi *vphi; 2813 while (!(vphi = get_virtual_phi (vdef_bb))) 2814 vdef_bb = get_immediate_dominator (CDI_DOMINATORS, vdef_bb); 2815 gcc_assert (vop_to_rename != gimple_phi_result (vphi)); 2816 set_current_def (vop_to_rename, gimple_phi_result (vphi)); 2817 } 2818 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); 2819 if (!epilog) 2820 { 2821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, 2822 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n"); 2823 gcc_unreachable (); 2824 } 2825 epilog->force_vectorize = false; 2826 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false); 2827 2828 /* Scalar version loop may be preferred. In this case, add guard 2829 and skip to epilog. Note this only happens when the number of 2830 iterations of loop is unknown at compile time, otherwise this 2831 won't be vectorized. */ 2832 if (skip_vector) 2833 { 2834 /* Additional epilogue iteration is peeled if gap exists. */ 2835 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling, 2836 bound_prolog, bound_epilog, 2837 th, &bound_scalar, 2838 check_profitability); 2839 /* Build guard against NITERSM1 since NITERS may overflow. */ 2840 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t); 2841 guard_bb = anchor; 2842 guard_to = split_edge (loop_preheader_edge (epilog)); 2843 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, 2844 guard_to, guard_bb, 2845 prob_vector.invert (), 2846 irred_flag); 2847 skip_e = guard_e; 2848 e = EDGE_PRED (guard_to, 0); 2849 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); 2850 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); 2851 2852 /* Simply propagate profile info from guard_bb to guard_to which is 2853 a merge point of control flow. */ 2854 guard_to->count = guard_bb->count; 2855 2856 /* Scale probability of epilog loop back. 2857 FIXME: We should avoid scaling down and back up. 
Profile may 2858 get lost if we scale down to 0. */ 2859 basic_block *bbs = get_loop_body (epilog); 2860 for (unsigned int i = 0; i < epilog->num_nodes; i++) 2861 bbs[i]->count = bbs[i]->count.apply_scale 2862 (bbs[i]->count, 2863 bbs[i]->count.apply_probability 2864 (prob_vector)); 2865 free (bbs); 2866 } 2867 2868 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; 2869 /* If loop is peeled for non-zero constant times, now niters refers to 2870 orig_niters - prolog_peeling, it won't overflow even the orig_niters 2871 overflows. */ 2872 niters_no_overflow |= (prolog_peeling > 0); 2873 vect_gen_vector_loop_niters (loop_vinfo, niters, 2874 niters_vector, step_vector, 2875 niters_no_overflow); 2876 if (!integer_onep (*step_vector)) 2877 { 2878 /* On exit from the loop we will have an easy way of calcalating 2879 NITERS_VECTOR / STEP * STEP. Install a dummy definition 2880 until then. */ 2881 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector)); 2882 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop (); 2883 *niters_vector_mult_vf_var = niters_vector_mult_vf; 2884 } 2885 else 2886 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector, 2887 &niters_vector_mult_vf); 2888 /* Update IVs of original loop as if they were advanced by 2889 niters_vector_mult_vf steps. */ 2890 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); 2891 update_e = skip_vector ? e : loop_preheader_edge (epilog); 2892 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, 2893 update_e); 2894 2895 if (skip_epilog) 2896 { 2897 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node, 2898 niters, niters_vector_mult_vf); 2899 guard_bb = single_exit (loop)->dest; 2900 guard_to = split_edge (single_exit (epilog)); 2901 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to, 2902 skip_vector ? 
anchor : guard_bb, 2903 prob_epilog.invert (), 2904 irred_flag); 2905 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e, 2906 single_exit (epilog)); 2907 /* Only need to handle basic block before epilog loop if it's not 2908 the guard_bb, which is the case when skip_vector is true. */ 2909 if (guard_bb != bb_before_epilog) 2910 { 2911 prob_epilog = prob_vector * prob_epilog + prob_vector.invert (); 2912 2913 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog); 2914 } 2915 scale_loop_profile (epilog, prob_epilog, 0); 2916 } 2917 else 2918 slpeel_update_phi_nodes_for_lcssa (epilog); 2919 2920 unsigned HOST_WIDE_INT bound; 2921 if (bound_scalar.is_constant (&bound)) 2922 { 2923 gcc_assert (bound != 0); 2924 /* -1 to convert loop iterations to latch iterations. */ 2925 record_niter_bound (epilog, bound - 1, false, true); 2926 } 2927 2928 delete_update_ssa (); 2929 adjust_vec_debug_stmts (); 2930 scev_reset (); 2931 } 2932 2933 if (vect_epilogues) 2934 { 2935 epilog->aux = epilogue_vinfo; 2936 LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; 2937 2938 loop_constraint_clear (epilog, LOOP_C_INFINITE); 2939 2940 /* We now must calculate the number of NITERS performed by the previous 2941 loop and EPILOGUE_NITERS to be performed by the epilogue. */ 2942 tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), 2943 niters_prolog, niters_vector_mult_vf); 2944 2945 /* If skip_vector we may skip the previous loop, we insert a phi-node to 2946 determine whether we are coming from the previous vectorized loop 2947 using the update_e edge or the skip_vector basic block using the 2948 skip_e edge. 
*/ 2949 if (skip_vector) 2950 { 2951 gcc_assert (update_e != NULL && skip_e != NULL); 2952 gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), 2953 update_e->dest); 2954 tree new_ssa = make_ssa_name (TREE_TYPE (niters)); 2955 gimple *stmt = gimple_build_assign (new_ssa, niters); 2956 gimple_stmt_iterator gsi; 2957 if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME 2958 && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) 2959 { 2960 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); 2961 gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); 2962 } 2963 else 2964 { 2965 gsi = gsi_last_bb (update_e->src); 2966 gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); 2967 } 2968 2969 niters = new_ssa; 2970 add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); 2971 add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, 2972 UNKNOWN_LOCATION); 2973 niters = PHI_RESULT (new_phi); 2974 } 2975 2976 /* Subtract the number of iterations performed by the vectorized loop 2977 from the number of total iterations. */ 2978 tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), 2979 before_loop_niters, 2980 niters); 2981 2982 LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; 2983 LOOP_VINFO_NITERSM1 (epilogue_vinfo) 2984 = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), 2985 epilogue_niters, 2986 build_one_cst (TREE_TYPE (epilogue_niters))); 2987 2988 /* Set ADVANCE to the number of iterations performed by the previous 2989 loop and its prologue. */ 2990 *advance = niters; 2991 2992 /* Redo the peeling for niter analysis as the NITERs and alignment 2993 may have been updated to take the main loop into account. */ 2994 determine_peel_for_niter (epilogue_vinfo); 2995 } 2996 2997 adjust_vec.release (); 2998 free_original_copy_tables (); 2999 3000 return vect_epilogues ? epilog : NULL; 3001} 3002 3003/* Function vect_create_cond_for_niters_checks. 
3004 3005 Create a conditional expression that represents the run-time checks for 3006 loop's niter. The loop is guaranteed to terminate if the run-time 3007 checks hold. 3008 3009 Input: 3010 COND_EXPR - input conditional expression. New conditions will be chained 3011 with logical AND operation. If it is NULL, then the function 3012 is used to return the number of alias checks. 3013 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs 3014 to be checked. 3015 3016 Output: 3017 COND_EXPR - conditional expression. 3018 3019 The returned COND_EXPR is the conditional expression to be used in the 3020 if statement that controls which version of the loop gets executed at 3021 runtime. */ 3022 3023static void 3024vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr) 3025{ 3026 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo); 3027 3028 if (*cond_expr) 3029 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 3030 *cond_expr, part_cond_expr); 3031 else 3032 *cond_expr = part_cond_expr; 3033} 3034 3035/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR 3036 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */ 3037 3038static void 3039chain_cond_expr (tree *cond_expr, tree part_cond_expr) 3040{ 3041 if (*cond_expr) 3042 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, 3043 *cond_expr, part_cond_expr); 3044 else 3045 *cond_expr = part_cond_expr; 3046} 3047 3048/* Function vect_create_cond_for_align_checks. 3049 3050 Create a conditional expression that represents the alignment checks for 3051 all of data references (array element references) whose alignment must be 3052 checked at runtime. 3053 3054 Input: 3055 COND_EXPR - input conditional expression. New conditions will be chained 3056 with logical AND operation. 3057 LOOP_VINFO - two fields of the loop information are used. 3058 LOOP_VINFO_PTR_MASK is the mask used to check the alignment. 
3059 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked. 3060 3061 Output: 3062 COND_EXPR_STMT_LIST - statements needed to construct the conditional 3063 expression. 3064 The returned value is the conditional expression to be used in the if 3065 statement that controls which version of the loop gets executed at runtime. 3066 3067 The algorithm makes two assumptions: 3068 1) The number of bytes "n" in a vector is a power of 2. 3069 2) An address "a" is aligned if a%n is zero and that this 3070 test can be done as a&(n-1) == 0. For example, for 16 3071 byte vectors the test is a&0xf == 0. */ 3072 3073static void 3074vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, 3075 tree *cond_expr, 3076 gimple_seq *cond_expr_stmt_list) 3077{ 3078 vec<stmt_vec_info> may_misalign_stmts 3079 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); 3080 stmt_vec_info stmt_info; 3081 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); 3082 tree mask_cst; 3083 unsigned int i; 3084 tree int_ptrsize_type; 3085 char tmp_name[20]; 3086 tree or_tmp_name = NULL_TREE; 3087 tree and_tmp_name; 3088 gimple *and_stmt; 3089 tree ptrsize_zero; 3090 tree part_cond_expr; 3091 3092 /* Check that mask is one less than a power of 2, i.e., mask is 3093 all zeros followed by all ones. */ 3094 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0)); 3095 3096 int_ptrsize_type = signed_type_for (ptr_type_node); 3097 3098 /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address 3099 of the first vector of the i'th data reference. */ 3100 3101 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info) 3102 { 3103 gimple_seq new_stmt_list = NULL; 3104 tree addr_base; 3105 tree addr_tmp_name; 3106 tree new_or_tmp_name; 3107 gimple *addr_stmt, *or_stmt; 3108 tree vectype = STMT_VINFO_VECTYPE (stmt_info); 3109 bool negative = tree_int_cst_compare 3110 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0; 3111 tree offset = negative 3112 ? 
size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node; 3113 3114 /* create: addr_tmp = (int)(address_of_first_vector) */ 3115 addr_base = 3116 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list, 3117 offset); 3118 if (new_stmt_list != NULL) 3119 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list); 3120 3121 sprintf (tmp_name, "addr2int%d", i); 3122 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name); 3123 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base); 3124 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt); 3125 3126 /* The addresses are OR together. */ 3127 3128 if (or_tmp_name != NULL_TREE) 3129 { 3130 /* create: or_tmp = or_tmp | addr_tmp */ 3131 sprintf (tmp_name, "orptrs%d", i); 3132 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name); 3133 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR, 3134 or_tmp_name, addr_tmp_name); 3135 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt); 3136 or_tmp_name = new_or_tmp_name; 3137 } 3138 else 3139 or_tmp_name = addr_tmp_name; 3140 3141 } /* end for i */ 3142 3143 mask_cst = build_int_cst (int_ptrsize_type, mask); 3144 3145 /* create: and_tmp = or_tmp & mask */ 3146 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask"); 3147 3148 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR, 3149 or_tmp_name, mask_cst); 3150 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt); 3151 3152 /* Make and_tmp the left operand of the conditional test against zero. 3153 if and_tmp has a nonzero bit then some address is unaligned. */ 3154 ptrsize_zero = build_int_cst (int_ptrsize_type, 0); 3155 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node, 3156 and_tmp_name, ptrsize_zero); 3157 chain_cond_expr (cond_expr, part_cond_expr); 3158} 3159 3160/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>, 3161 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn). 
3162 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR 3163 and this new condition are true. Treat a null *COND_EXPR as "true". */ 3164 3165static void 3166vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr) 3167{ 3168 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo); 3169 unsigned int i; 3170 vec_object_pair *pair; 3171 FOR_EACH_VEC_ELT (pairs, i, pair) 3172 { 3173 tree addr1 = build_fold_addr_expr (pair->first); 3174 tree addr2 = build_fold_addr_expr (pair->second); 3175 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node, 3176 addr1, addr2); 3177 chain_cond_expr (cond_expr, part_cond_expr); 3178 } 3179} 3180 3181/* Create an expression that is true when all lower-bound conditions for 3182 the vectorized loop are met. Chain this condition with *COND_EXPR. */ 3183 3184static void 3185vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr) 3186{ 3187 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo); 3188 for (unsigned int i = 0; i < lower_bounds.length (); ++i) 3189 { 3190 tree expr = lower_bounds[i].expr; 3191 tree type = unsigned_type_for (TREE_TYPE (expr)); 3192 expr = fold_convert (type, expr); 3193 poly_uint64 bound = lower_bounds[i].min_value; 3194 if (!lower_bounds[i].unsigned_p) 3195 { 3196 expr = fold_build2 (PLUS_EXPR, type, expr, 3197 build_int_cstu (type, bound - 1)); 3198 bound += bound - 1; 3199 } 3200 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr, 3201 build_int_cstu (type, bound)); 3202 chain_cond_expr (cond_expr, part_cond_expr); 3203 } 3204} 3205 3206/* Function vect_create_cond_for_alias_checks. 3207 3208 Create a conditional expression that represents the run-time checks for 3209 overlapping of address ranges represented by a list of data references 3210 relations passed as input. 3211 3212 Input: 3213 COND_EXPR - input conditional expression. 
New conditions will be chained
		with logical AND operation.  If it is NULL, then the function
		is used to return the number of alias checks.
   LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
		to be checked.

   Output:
   COND_EXPR - conditional expression.

   The returned COND_EXPR is the conditional expression to be used in the if
   statement that controls which version of the loop gets executed at runtime.
*/

void
vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
{
  vec<dr_with_seg_len_pair_t> comp_alias_ddrs
    = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
  unsigned int num_checks = comp_alias_ddrs.length ();

  /* No recorded DDR pairs means no run-time alias checks are needed;
     leave *COND_EXPR untouched.  */
  if (num_checks == 0)
    return;

  /* Build the overlap tests and AND them onto *COND_EXPR.  */
  create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
			       &comp_alias_ddrs, cond_expr);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "created %u versioning for alias checks.\n",
		     num_checks);
}


/* Function vect_loop_versioning.

   If the loop has data references that may or may not be aligned or/and
   has data reference relations whose independence was not proven then
   two versions of the loop need to be generated, one which is vectorized
   and one which isn't.  A test is then generated to control which of the
   loops is executed.  The test checks for the alignment of all of the
   data references that may or may not be aligned.  An additional
   sequence of runtime tests is generated for each pairs of DDRs whose
   independence was not proven.  The vectorized version of loop is
   executed only if both alias and alignment tests are passed.

   The test generated to check which version of loop is executed
   is modified to also check for profitability as indicated by the
   cost model threshold TH.

   The versioning precondition(s) are placed in *COND_EXPR and
   *COND_EXPR_STMT_LIST.
 */

class loop *
vect_loop_versioning (loop_vec_info loop_vinfo,
		      gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
  class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
  basic_block condition_bb;
  gphi_iterator gsi;
  gimple_stmt_iterator cond_exp_gsi;
  basic_block merge_bb;
  basic_block new_exit_bb;
  edge new_exit_e, e;
  gphi *orig_phi, *new_phi;
  tree cond_expr = NULL_TREE;
  gimple_seq cond_expr_stmt_list = NULL;
  tree arg;
  profile_probability prob = profile_probability::likely ();
  gimple_seq gimplify_stmt_list = NULL;
  /* SCALAR_LOOP_ITERS is NITERSM1 (iteration count minus one), which is
     why the threshold comparisons below use TH - 1.  */
  tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
  bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
  bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
  bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
  poly_uint64 versioning_threshold
    = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
  tree version_simd_if_cond
    = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

  /* Add the profitability check (NITERSM1 >= TH - 1, i.e. NITERS >= TH)
     only when TH and the versioning threshold are not ordered; otherwise
     the single versioning-threshold test below subsumes it.  */
  if (vect_apply_runtime_profitability_check_p (loop_vinfo)
      && !ordered_p (th, versioning_threshold))
    cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			     build_int_cst (TREE_TYPE (scalar_loop_iters),
					    th - 1));
  if (maybe_ne (versioning_threshold, 0U))
    {
      tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
			       build_int_cst (TREE_TYPE (scalar_loop_iters),
					      versioning_threshold - 1));
      if (cond_expr)
	cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
				 expr, cond_expr);
      else
	cond_expr = expr;
    }

  if (version_niter)
    vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);

  /* Gimplify what we have so far; the alignment/alias helpers below
     append their own statements to COND_EXPR_STMT_LIST.  */
  if (cond_expr)
    cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
					&cond_expr_stmt_list,
					is_gimple_condexpr, NULL_TREE);

  if (version_align)
    vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
				       &cond_expr_stmt_list);

  if (version_alias)
    {
      vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
      vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
      vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
    }

  if (version_simd_if_cond)
    {
      gcc_assert (dom_info_available_p (CDI_DOMINATORS));
      /* Sanity-check that the simd-if condition is defined outside both
	 loop headers (checking builds only).  */
      if (flag_checking)
	if (basic_block bb
	    = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
	  gcc_assert (bb != loop->header
		      && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
		      && (scalar_loop == NULL
			  || (bb != scalar_loop->header
			      && dominated_by_p (CDI_DOMINATORS,
						 scalar_loop->header, bb))));
      tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
      tree c = fold_build2 (NE_EXPR, boolean_type_node,
			    version_simd_if_cond, zero);
      if (cond_expr)
	cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
				 c, cond_expr);
      else
	cond_expr = c;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "created versioning for simd if condition check.\n");
    }

  cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
				      &gimplify_stmt_list,
				      is_gimple_condexpr, NULL_TREE);
  gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);

  /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
     invariant in.  */
  class loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
  for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
       !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      update_stmt (stmt);
      ssa_op_iter iter;
      use_operand_p use_p;
      basic_block def_bb;
      /* Any use defined inside OUTERMOST constrains how far out we can
	 hoist the versioning condition.  */
      FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
	if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
	    && flow_bb_inside_loop_p (outermost, def_bb))
	  outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
    }

  /* Search for the outermost loop we can version.  Avoid versioning of
     non-perfect nests but allow if-conversion versioned loops inside.  */
  class loop *loop_to_version = loop;
  if (flow_loop_nested_p (outermost, loop))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "trying to apply versioning to outer loop %d\n",
			 outermost->num);
      if (outermost->num == 0)
	outermost = superloop_at_depth (loop, 1);
      /* And avoid applying versioning on non-perfect nests.  */
      while (loop_to_version != outermost
	     && (e = single_exit (loop_outer (loop_to_version)))
	     && !(e->flags & EDGE_COMPLEX)
	     && (!loop_outer (loop_to_version)->inner->next
		 || vect_loop_vectorized_call (loop_to_version))
	     && (!loop_outer (loop_to_version)->inner->next
		 || !loop_outer (loop_to_version)->inner->next->next))
	loop_to_version = loop_outer (loop_to_version);
    }

  /* Apply versioning.  If there is already a scalar version created by
     if-conversion re-use that.  Note we cannot re-use the copy of
     an if-converted outer-loop when vectorizing the inner loop only.  */
  gcond *cond;
  if ((!loop_to_version->inner || loop == loop_to_version)
      && loop_vectorized_call)
    {
      gcc_assert (scalar_loop);
      condition_bb = gimple_bb (loop_vectorized_call);
      cond = as_a <gcond *> (last_stmt (condition_bb));
      gimple_cond_set_condition_from_tree (cond, cond_expr);
      update_stmt (cond);

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_for_stmt (loop_vectorized_call);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* if-conversion uses profile_probability::always () for both paths,
	 reset the paths probabilities appropriately.  */
      edge te, fe;
      extract_true_false_edges_from_block (condition_bb, &te, &fe);
      te->probability = prob;
      fe->probability = prob.invert ();
      /* We can scale loops counts immediately but have to postpone
	 scaling the scalar loop because we re-use it during peeling.  */
      scale_loop_frequencies (loop_to_version, te->probability);
      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo) = fe->probability;

      nloop = scalar_loop;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "reusing %sloop version created by if conversion\n",
			 loop_to_version != loop ? "outer " : "");
    }
  else
    {
      if (loop_to_version != loop
	  && dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "applying loop versioning to outer loop %d\n",
			 loop_to_version->num);

      initialize_original_copy_tables ();
      nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
			    prob, prob.invert (), prob, prob.invert (), true);
      gcc_assert (nloop);
      /* LOOP_TO_VERSION may be an outer loop; fetch the copy of LOOP
	 itself, which is the loop the caller cares about.  */
      nloop = get_loop_copy (loop);

      /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy, nobody will
	 reap those otherwise;  they also refer to the original
	 loops.  */
      class loop *l = loop;
      while (gimple *call = vect_loop_vectorized_call (l))
	{
	  call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
	  fold_loop_internal_call (call, boolean_false_node);
	  l = loop_outer (l);
	}
      free_original_copy_tables ();

      if (cond_expr_stmt_list)
	{
	  cond_exp_gsi = gsi_last_bb (condition_bb);
	  gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
				 GSI_SAME_STMT);
	}

      /* Loop versioning violates an assumption we try to maintain during
	 vectorization - that the loop exit block has a single predecessor.
	 After versioning, the exit block of both loop versions is the same
	 basic block (i.e. it has two predecessors).  Just in order to simplify
	 following transformations in the vectorizer, we fix this situation
	 here by adding a new (empty) block on the exit-edge of the loop,
	 with the proper loop-exit phis to maintain loop-closed-form.
	 If loop versioning wasn't done from loop, but scalar_loop instead,
	 merge_bb will have already just a single successor.  */

      merge_bb = single_exit (loop_to_version)->dest;
      if (EDGE_COUNT (merge_bb->preds) >= 2)
	{
	  /* NOTE(review): this assert is redundant — the guarding `if'
	     above tests exactly the same condition; likely a leftover.  */
	  gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
	  new_exit_bb = split_edge (single_exit (loop_to_version));
	  new_exit_e = single_exit (loop_to_version);
	  e = EDGE_SUCC (new_exit_bb, 0);

	  /* Re-home each exit phi onto the new exit block to keep the
	     loop in loop-closed SSA form.  */
	  for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
	       gsi_next (&gsi))
	    {
	      tree new_res;
	      orig_phi = gsi.phi ();
	      new_res = copy_ssa_name (PHI_RESULT (orig_phi));
	      new_phi = create_phi_node (new_res, new_exit_bb);
	      arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
	      add_phi_arg (new_phi, arg, new_exit_e,
			   gimple_phi_arg_location_from_edge (orig_phi, e));
	      adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
	    }
	}

      update_ssa (TODO_update_ssa);
    }

  if (version_niter)
    {
      /* The versioned loop could be infinite, we need to clear existing
	 niter information which is copied from the original loop.  */
      gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
      vect_free_loop_info_assumptions (nloop);
      /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
      loop_constraint_set (loop, LOOP_C_INFINITE);
    }

  if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
      && dump_enabled_p ())
    {
      if (version_alias)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization because of "
			 "possible aliasing\n");
      if (version_align)
	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
			 vect_location,
			 "loop versioned for vectorization to enhance "
			 "alignment\n");

    }

  return nloop;
}