1/* Subroutines used to expand string and block move, clear, 2 compare and other operations for PowerPC. 3 Copyright (C) 1991-2020 Free Software Foundation, Inc. 4 5 This file is part of GCC. 6 7 GCC is free software; you can redistribute it and/or modify it 8 under the terms of the GNU General Public License as published 9 by the Free Software Foundation; either version 3, or (at your 10 option) any later version. 11 12 GCC is distributed in the hope that it will be useful, but WITHOUT 13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 15 License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with GCC; see the file COPYING3. If not see 19 <http://www.gnu.org/licenses/>. */ 20 21#define IN_TARGET_CODE 1 22 23#include "config.h" 24#include "system.h" 25#include "coretypes.h" 26#include "backend.h" 27#include "rtl.h" 28#include "tree.h" 29#include "memmodel.h" 30#include "tm_p.h" 31#include "ira.h" 32#include "print-tree.h" 33#include "varasm.h" 34#include "explow.h" 35#include "expr.h" 36#include "output.h" 37#include "target.h" 38#include "profile-count.h" 39#include "predict.h" 40 41/* Expand a block clear operation, and return 1 if successful. Return 0 42 if we should let the compiler generate normal code. 43 44 operands[0] is the destination 45 operands[1] is the length 46 operands[3] is the alignment */ 47 48int 49expand_block_clear (rtx operands[]) 50{ 51 rtx orig_dest = operands[0]; 52 rtx bytes_rtx = operands[1]; 53 rtx align_rtx = operands[3]; 54 bool constp = CONST_INT_P (bytes_rtx); 55 HOST_WIDE_INT align; 56 HOST_WIDE_INT bytes; 57 int offset; 58 int clear_bytes; 59 int clear_step; 60 61 /* If this is not a fixed size move, just call memcpy */ 62 if (! 
constp) 63 return 0; 64 65 /* This must be a fixed size alignment */ 66 gcc_assert (CONST_INT_P (align_rtx)); 67 align = INTVAL (align_rtx) * BITS_PER_UNIT; 68 69 /* Anything to clear? */ 70 bytes = INTVAL (bytes_rtx); 71 if (bytes <= 0) 72 return 1; 73 74 /* Use the builtin memset after a point, to avoid huge code bloat. 75 When optimize_size, avoid any significant code bloat; calling 76 memset is about 4 instructions, so allow for one instruction to 77 load zero and three to do clearing. */ 78 if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX)) 79 clear_step = 16; 80 else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT)) 81 clear_step = 8; 82 else 83 clear_step = 4; 84 85 if (optimize_size && bytes > 3 * clear_step) 86 return 0; 87 if (! optimize_size && bytes > 8 * clear_step) 88 return 0; 89 90 bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX); 91 92 for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes) 93 { 94 machine_mode mode = BLKmode; 95 rtx dest; 96 97 if (TARGET_ALTIVEC 98 && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok))) 99 { 100 clear_bytes = 16; 101 mode = V4SImode; 102 } 103 else if (bytes >= 8 && TARGET_POWERPC64 104 && (align >= 64 || !STRICT_ALIGNMENT)) 105 { 106 clear_bytes = 8; 107 mode = DImode; 108 if (offset == 0 && align < 64) 109 { 110 rtx addr; 111 112 /* If the address form is reg+offset with offset not a 113 multiple of four, reload into reg indirect form here 114 rather than waiting for reload. This way we get one 115 reload, not one per store. 
*/ 116 addr = XEXP (orig_dest, 0); 117 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) 118 && CONST_INT_P (XEXP (addr, 1)) 119 && (INTVAL (XEXP (addr, 1)) & 3) != 0) 120 { 121 addr = copy_addr_to_reg (addr); 122 orig_dest = replace_equiv_address (orig_dest, addr); 123 } 124 } 125 } 126 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) 127 { /* move 4 bytes */ 128 clear_bytes = 4; 129 mode = SImode; 130 } 131 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) 132 { /* move 2 bytes */ 133 clear_bytes = 2; 134 mode = HImode; 135 } 136 else /* move 1 byte at a time */ 137 { 138 clear_bytes = 1; 139 mode = QImode; 140 } 141 142 dest = adjust_address (orig_dest, mode, offset); 143 144 emit_move_insn (dest, CONST0_RTX (mode)); 145 } 146 147 return 1; 148} 149 150/* Figure out the correct instructions to generate to load data for 151 block compare. MODE is used for the read from memory, and 152 data is zero extended if REG is wider than MODE. If LE code 153 is being generated, bswap loads are used. 154 155 REG is the destination register to move the data into. 156 MEM is the memory block being read. 157 MODE is the mode of memory to use for the read. 
*/ 158static void 159do_load_for_compare (rtx reg, rtx mem, machine_mode mode) 160{ 161 switch (GET_MODE (reg)) 162 { 163 case E_V16QImode: 164 switch (mode) 165 { 166 case E_V16QImode: 167 if (!BYTES_BIG_ENDIAN) 168 { 169 if (TARGET_P9_VECTOR) 170 emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem)); 171 else 172 { 173 rtx reg_v2di = simplify_gen_subreg (V2DImode, reg, 174 V16QImode, 0); 175 gcc_assert (MEM_P (mem)); 176 rtx addr = XEXP (mem, 0); 177 rtx mem_v2di = gen_rtx_MEM (V2DImode, addr); 178 MEM_COPY_ATTRIBUTES (mem_v2di, mem); 179 set_mem_size (mem, GET_MODE_SIZE (V2DImode)); 180 emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di)); 181 } 182 } 183 else 184 emit_insn (gen_vsx_movv2di_64bit (reg, mem)); 185 break; 186 default: 187 gcc_unreachable (); 188 } 189 break; 190 case E_DImode: 191 switch (mode) 192 { 193 case E_QImode: 194 emit_insn (gen_zero_extendqidi2 (reg, mem)); 195 break; 196 case E_HImode: 197 { 198 rtx src = mem; 199 if (!BYTES_BIG_ENDIAN) 200 { 201 src = gen_reg_rtx (HImode); 202 emit_insn (gen_bswaphi2 (src, mem)); 203 } 204 emit_insn (gen_zero_extendhidi2 (reg, src)); 205 break; 206 } 207 case E_SImode: 208 { 209 rtx src = mem; 210 if (!BYTES_BIG_ENDIAN) 211 { 212 src = gen_reg_rtx (SImode); 213 emit_insn (gen_bswapsi2 (src, mem)); 214 } 215 emit_insn (gen_zero_extendsidi2 (reg, src)); 216 } 217 break; 218 case E_DImode: 219 if (!BYTES_BIG_ENDIAN) 220 emit_insn (gen_bswapdi2 (reg, mem)); 221 else 222 emit_insn (gen_movdi (reg, mem)); 223 break; 224 default: 225 gcc_unreachable (); 226 } 227 break; 228 229 case E_SImode: 230 switch (mode) 231 { 232 case E_QImode: 233 emit_insn (gen_zero_extendqisi2 (reg, mem)); 234 break; 235 case E_HImode: 236 { 237 rtx src = mem; 238 if (!BYTES_BIG_ENDIAN) 239 { 240 src = gen_reg_rtx (HImode); 241 emit_insn (gen_bswaphi2 (src, mem)); 242 } 243 emit_insn (gen_zero_extendhisi2 (reg, src)); 244 break; 245 } 246 case E_SImode: 247 if (!BYTES_BIG_ENDIAN) 248 emit_insn (gen_bswapsi2 (reg, mem)); 249 
else 250 emit_insn (gen_movsi (reg, mem)); 251 break; 252 case E_DImode: 253 /* DImode is larger than the destination reg so is not expected. */ 254 gcc_unreachable (); 255 break; 256 default: 257 gcc_unreachable (); 258 } 259 break; 260 261 case E_QImode: 262 gcc_assert (mode == E_QImode); 263 emit_move_insn (reg, mem); 264 break; 265 266 default: 267 gcc_unreachable (); 268 break; 269 } 270} 271 272/* Select the mode to be used for reading the next chunk of bytes 273 in the compare. 274 275 OFFSET is the current read offset from the beginning of the block. 276 BYTES is the number of bytes remaining to be read. 277 ALIGN is the minimum alignment of the memory blocks being compared in bytes. */ 278static machine_mode 279select_block_compare_mode (unsigned HOST_WIDE_INT offset, 280 unsigned HOST_WIDE_INT bytes, 281 unsigned HOST_WIDE_INT align) 282{ 283 /* First see if we can do a whole load unit 284 as that will be more efficient than a larger load + shift. */ 285 286 /* If big, use biggest chunk. 287 If exactly chunk size, use that size. 288 If remainder can be done in one piece with shifting, do that. 289 Do largest chunk possible without violating alignment rules. */ 290 291 /* The most we can read without potential page crossing. */ 292 unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align); 293 294 /* If we have an LE target without ldbrx and word_mode is DImode, 295 then we must avoid using word_mode. 
*/ 296 int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX 297 && word_mode == DImode); 298 299 if (word_mode_ok && bytes >= UNITS_PER_WORD) 300 return word_mode; 301 else if (bytes == GET_MODE_SIZE (SImode)) 302 return SImode; 303 else if (bytes == GET_MODE_SIZE (HImode)) 304 return HImode; 305 else if (bytes == GET_MODE_SIZE (QImode)) 306 return QImode; 307 else if (bytes < GET_MODE_SIZE (SImode) 308 && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED 309 && offset >= GET_MODE_SIZE (SImode) - bytes) 310 /* This matches the case were we have SImode and 3 bytes 311 and offset >= 1 and permits us to move back one and overlap 312 with the previous read, thus avoiding having to shift 313 unwanted bytes off of the input. */ 314 return SImode; 315 else if (word_mode_ok && bytes < UNITS_PER_WORD 316 && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED 317 && offset >= UNITS_PER_WORD-bytes) 318 /* Similarly, if we can use DImode it will get matched here and 319 can do an overlapping read that ends at the end of the block. */ 320 return word_mode; 321 else if (word_mode_ok && maxread >= UNITS_PER_WORD) 322 /* It is safe to do all remaining in one load of largest size, 323 possibly with a shift to get rid of unwanted bytes. */ 324 return word_mode; 325 else if (maxread >= GET_MODE_SIZE (SImode)) 326 /* It is safe to do all remaining in one SImode load, 327 possibly with a shift to get rid of unwanted bytes. */ 328 return SImode; 329 else if (bytes > GET_MODE_SIZE (SImode)) 330 return SImode; 331 else if (bytes > GET_MODE_SIZE (HImode)) 332 return HImode; 333 334 /* final fallback is do one byte */ 335 return QImode; 336} 337 338/* Compute the alignment of pointer+OFFSET where the original alignment 339 of pointer was BASE_ALIGN. 
*/ 340static unsigned HOST_WIDE_INT 341compute_current_alignment (unsigned HOST_WIDE_INT base_align, 342 unsigned HOST_WIDE_INT offset) 343{ 344 if (offset == 0) 345 return base_align; 346 return MIN (base_align, offset & -offset); 347} 348 349/* Prepare address and then do a load. 350 351 MODE is the mode to use for the load. 352 DEST is the destination register for the data. 353 ADDR is the address to be loaded. 354 ORIG_ADDR is the original address expression. */ 355static void 356do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr, 357 rtx orig_addr) 358{ 359 rtx mem = gen_rtx_MEM (mode, addr); 360 MEM_COPY_ATTRIBUTES (mem, orig_addr); 361 set_mem_size (mem, GET_MODE_SIZE (mode)); 362 do_load_for_compare (dest, mem, mode); 363 return; 364} 365 366/* Do a branch for an if/else decision. 367 368 CMPMODE is the mode to use for the comparison. 369 COMPARISON is the rtx code for the compare needed. 370 A is the first thing to be compared. 371 B is the second thing to be compared. 372 CR is the condition code reg input, or NULL_RTX. 373 TRUE_LABEL is the label to branch to if the condition is true. 374 P is the estimated branch probability for the branch. 375 376 The return value is the CR used for the comparison. 377 If CR is null_rtx, then a new register of CMPMODE is generated. 378 If A and B are both null_rtx, then CR must not be null, and the 379 compare is not generated so you can use this with a dot form insn. 
*/ 380 381static void 382do_ifelse (machine_mode cmpmode, rtx_code comparison, 383 rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob) 384{ 385 gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX) 386 || (a != NULL_RTX && b != NULL_RTX)); 387 388 if (cr != NULL_RTX) 389 gcc_assert (GET_MODE (cr) == cmpmode); 390 else 391 cr = gen_reg_rtx (cmpmode); 392 393 rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label); 394 395 if (a != NULL_RTX) 396 emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b)); 397 398 rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx); 399 400 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx); 401 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); 402 add_reg_br_prob_note (j, br_prob); 403 JUMP_LABEL (j) = true_label; 404 LABEL_NUSES (true_label) += 1; 405} 406 407/* Emit an isel of the proper mode for DEST. 408 409 DEST is the isel destination register. 410 SRC1 is the isel source if CR is true. 411 SRC2 is the isel source if CR is false. 412 CR is the condition for the isel. */ 413static void 414do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr) 415{ 416 if (GET_MODE (dest) == DImode) 417 emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr)); 418 else 419 emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr)); 420} 421 422/* Emit a subtract of the proper mode for DEST. 423 424 DEST is the destination register for the subtract. 425 SRC1 is the first subtract input. 426 SRC2 is the second subtract input. 427 428 Computes DEST = SRC1-SRC2. */ 429static void 430do_sub3 (rtx dest, rtx src1, rtx src2) 431{ 432 if (GET_MODE (dest) == DImode) 433 emit_insn (gen_subdi3 (dest, src1, src2)); 434 else 435 emit_insn (gen_subsi3 (dest, src1, src2)); 436} 437 438/* Emit an add of the proper mode for DEST. 439 440 DEST is the destination register for the add. 441 SRC1 is the first add input. 442 SRC2 is the second add input. 443 444 Computes DEST = SRC1+SRC2. 
*/ 445static void 446do_add3 (rtx dest, rtx src1, rtx src2) 447{ 448 if (GET_MODE (dest) == DImode) 449 emit_insn (gen_adddi3 (dest, src1, src2)); 450 else 451 emit_insn (gen_addsi3 (dest, src1, src2)); 452} 453 454/* Emit an and of the proper mode for DEST. 455 456 DEST is the destination register for the and. 457 SRC1 is the first and input. 458 SRC2 is the second and input. 459 460 Computes DEST = SRC1&SRC2. */ 461static void 462do_and3 (rtx dest, rtx src1, rtx src2) 463{ 464 if (GET_MODE (dest) == DImode) 465 emit_insn (gen_anddi3 (dest, src1, src2)); 466 else 467 emit_insn (gen_andsi3 (dest, src1, src2)); 468} 469 470/* Emit an cmpb of the proper mode for DEST. 471 472 DEST is the destination register for the cmpb. 473 SRC1 is the first input. 474 SRC2 is the second input. 475 476 Computes cmpb of SRC1, SRC2. */ 477static void 478do_cmpb3 (rtx dest, rtx src1, rtx src2) 479{ 480 if (GET_MODE (dest) == DImode) 481 emit_insn (gen_cmpbdi3 (dest, src1, src2)); 482 else 483 emit_insn (gen_cmpbsi3 (dest, src1, src2)); 484} 485 486/* Emit a rotl of the proper mode for DEST. 487 488 DEST is the destination register for the and. 489 SRC1 is the first and input. 490 SRC2 is the second and input. 491 492 Computes DEST = SRC1 rotated left by SRC2. */ 493static void 494do_rotl3 (rtx dest, rtx src1, rtx src2) 495{ 496 if (GET_MODE (dest) == DImode) 497 emit_insn (gen_rotldi3 (dest, src1, src2)); 498 else 499 emit_insn (gen_rotlsi3 (dest, src1, src2)); 500} 501 502/* Generate rtl for a load, shift, and compare of less than a full word. 503 504 LOAD_MODE is the machine mode for the loads. 505 DIFF is the reg for the difference. 506 CMP_REM is the reg containing the remaining bytes to compare. 507 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb. 508 SRC1_ADDR is the first source address. 509 SRC2_ADDR is the second source address. 510 ORIG_SRC1 is the original first source block's address rtx. 
511 ORIG_SRC2 is the original second source block's address rtx. */ 512static void 513do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond, 514 rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2) 515{ 516 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); 517 rtx shift_amount = gen_reg_rtx (word_mode); 518 rtx d1 = gen_reg_rtx (word_mode); 519 rtx d2 = gen_reg_rtx (word_mode); 520 521 do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1); 522 do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2); 523 do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem); 524 525 if (word_mode == DImode) 526 { 527 emit_insn (gen_ashldi3 (shift_amount, shift_amount, 528 GEN_INT (LOG2_BITS_PER_UNIT))); 529 emit_insn (gen_lshrdi3 (d1, d1, 530 gen_lowpart (SImode, shift_amount))); 531 emit_insn (gen_lshrdi3 (d2, d2, 532 gen_lowpart (SImode, shift_amount))); 533 } 534 else 535 { 536 emit_insn (gen_ashlsi3 (shift_amount, shift_amount, 537 GEN_INT (LOG2_BITS_PER_UNIT))); 538 emit_insn (gen_lshrsi3 (d1, d1, shift_amount)); 539 emit_insn (gen_lshrsi3 (d2, d2, shift_amount)); 540 } 541 542 if (TARGET_P9_MISC) 543 { 544 /* Generate a compare, and convert with a setb later. */ 545 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); 546 emit_insn (gen_rtx_SET (dcond, cmp)); 547 } 548 else 549 { 550 if (word_mode == DImode) 551 emit_insn (gen_subfdi3_carry (diff, d2, d1)); 552 else 553 emit_insn (gen_subfsi3_carry (diff, d2, d1)); 554 } 555} 556 557/* Generate rtl for an overlapping load and compare of less than a 558 full load_mode. This assumes that the previous word is part of the 559 block being compared so it's ok to back up part of a word so we can 560 compare the last unaligned full word that ends at the end of the block. 561 562 LOAD_MODE is the machine mode for the loads. 563 ISCONST tells whether the remaining length is a constant or in a register. 564 BYTES_REM is the remaining length if ISCONST is true. 
565 DIFF is the reg for the difference. 566 CMP_REM is the reg containing the remaining bytes to compare if !ISCONST. 567 DCOND is the CCUNS reg for the compare if we are doing P9 code with setb. 568 SRC1_ADDR is the first source address. 569 SRC2_ADDR is the second source address. 570 ORIG_SRC1 is the original first source block's address rtx. 571 ORIG_SRC2 is the original second source block's address rtx. */ 572static void 573do_overlap_load_compare (machine_mode load_mode, bool isConst, 574 HOST_WIDE_INT bytes_rem, rtx diff, 575 rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr, 576 rtx orig_src1, rtx orig_src2) 577{ 578 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); 579 HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem; 580 rtx d1 = gen_reg_rtx (word_mode); 581 rtx d2 = gen_reg_rtx (word_mode); 582 583 rtx addr1, addr2; 584 if (!isConst || addr_adj) 585 { 586 rtx adj_reg = gen_reg_rtx (word_mode); 587 if (isConst) 588 emit_move_insn (adj_reg, GEN_INT (-addr_adj)); 589 else 590 { 591 rtx reg_lms = gen_reg_rtx (word_mode); 592 emit_move_insn (reg_lms, GEN_INT (load_mode_size)); 593 do_sub3 (adj_reg, cmp_rem, reg_lms); 594 } 595 596 addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg); 597 addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg); 598 } 599 else 600 { 601 addr1 = src1_addr; 602 addr2 = src2_addr; 603 } 604 605 do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1); 606 do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2); 607 608 if (TARGET_P9_MISC) 609 { 610 /* Generate a compare, and convert with a setb later. */ 611 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2); 612 emit_insn (gen_rtx_SET (dcond, cmp)); 613 } 614 else 615 { 616 if (word_mode == DImode) 617 emit_insn (gen_subfdi3_carry (diff, d2, d1)); 618 else 619 emit_insn (gen_subfsi3_carry (diff, d2, d1)); 620 } 621} 622 623/* Generate the sequence of compares for strcmp/strncmp using vec/vsx 624 instructions. 
625 626 BYTES_TO_COMPARE is the number of bytes to be compared. 627 ORIG_SRC1 is the unmodified rtx for the first string. 628 ORIG_SRC2 is the unmodified rtx for the second string. 629 S1ADDR is the register to use for the base address of the first string. 630 S2ADDR is the register to use for the base address of the second string. 631 OFF_REG is the register to use for the string offset for loads. 632 S1DATA is the register for loading the first string. 633 S2DATA is the register for loading the second string. 634 VEC_RESULT is the rtx for the vector result indicating the byte difference. 635 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call 636 to strcmp/strncmp if we have equality at the end of the inline comparison. 637 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code 638 to clean up and generate the final comparison result. 639 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 640 set the final result. 641 CHECKZERO indicates whether the sequence should check for zero bytes 642 for use doing strncmp, or not (for use doing memcmp). 
*/ 643static void 644expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare, 645 rtx orig_src1, rtx orig_src2, 646 rtx s1addr, rtx s2addr, rtx off_reg, 647 rtx s1data, rtx s2data, rtx vec_result, 648 bool equality_compare_rest, rtx *p_cleanup_label, 649 rtx final_move_label, bool checkzero) 650{ 651 machine_mode load_mode; 652 unsigned int load_mode_size; 653 unsigned HOST_WIDE_INT cmp_bytes = 0; 654 unsigned HOST_WIDE_INT offset = 0; 655 rtx zero_reg = NULL; 656 657 gcc_assert (p_cleanup_label != NULL); 658 rtx cleanup_label = *p_cleanup_label; 659 660 emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0))); 661 emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0))); 662 663 if (checkzero && !TARGET_P9_VECTOR) 664 { 665 zero_reg = gen_reg_rtx (V16QImode); 666 emit_move_insn (zero_reg, CONST0_RTX (V16QImode)); 667 } 668 669 while (bytes_to_compare > 0) 670 { 671 /* VEC/VSX compare sequence for P8: 672 check each 16B with: 673 lxvd2x 32,28,8 674 lxvd2x 33,29,8 675 vcmpequb 2,0,1 # compare strings 676 vcmpequb 4,0,3 # compare w/ 0 677 xxlorc 37,36,34 # first FF byte is either mismatch or end of string 678 vcmpequb. 7,5,3 # reg 7 contains 0 679 bnl 6,.Lmismatch 680 681 For the P8 LE case, we use lxvd2x and compare full 16 bytes 682 but then use vgbbd and a shift to get two bytes with the 683 information we need in the correct order. 684 685 VEC/VSX compare sequence if TARGET_P9_VECTOR: 686 lxvb16x/lxvb16x # load 16B of each string 687 vcmpnezb. # produces difference location or zero byte location 688 bne 6,.Lmismatch 689 690 Use the overlapping compare trick for the last block if it is 691 less than 16 bytes. 692 */ 693 694 load_mode = V16QImode; 695 load_mode_size = GET_MODE_SIZE (load_mode); 696 697 if (bytes_to_compare >= load_mode_size) 698 cmp_bytes = load_mode_size; 699 else 700 { 701 /* Move this load back so it doesn't go past the end. P8/P9 702 can do this efficiently. 
This is never called with less 703 than 16 bytes so we should always be able to do this. */ 704 unsigned int extra_bytes = load_mode_size - bytes_to_compare; 705 cmp_bytes = bytes_to_compare; 706 gcc_assert (offset > extra_bytes); 707 offset -= extra_bytes; 708 cmp_bytes = load_mode_size; 709 bytes_to_compare = cmp_bytes; 710 } 711 712 /* The offset currently used is always kept in off_reg so that the 713 cleanup code on P8 can use it to extract the differing byte. */ 714 emit_move_insn (off_reg, GEN_INT (offset)); 715 716 rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); 717 do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1); 718 rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); 719 do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2); 720 721 /* Cases to handle. A and B are chunks of the two strings. 722 1: Not end of comparison: 723 A != B: branch to cleanup code to compute result. 724 A == B: next block 725 2: End of the inline comparison: 726 A != B: branch to cleanup code to compute result. 727 A == B: call strcmp/strncmp 728 3: compared requested N bytes: 729 A == B: branch to result 0. 730 A != B: cleanup code to compute result. */ 731 732 unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes; 733 734 if (checkzero) 735 { 736 if (TARGET_P9_VECTOR) 737 emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data)); 738 else 739 { 740 /* Emit instructions to do comparison and zero check. 
*/ 741 rtx cmp_res = gen_reg_rtx (load_mode); 742 rtx cmp_zero = gen_reg_rtx (load_mode); 743 rtx cmp_combined = gen_reg_rtx (load_mode); 744 emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data)); 745 emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg)); 746 emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res)); 747 emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg)); 748 } 749 } 750 else 751 emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data)); 752 753 bool branch_to_cleanup = (remain > 0 || equality_compare_rest); 754 rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO); 755 rtx dst_label; 756 rtx cmp_rtx; 757 if (branch_to_cleanup) 758 { 759 /* Branch to cleanup code, otherwise fall through to do more 760 compares. P8 and P9 use different CR bits because on P8 761 we are looking at the result of a comparsion vs a 762 register of zeroes so the all-true condition means no 763 difference or zero was found. On P9, vcmpnezb sets a byte 764 to 0xff if there is a mismatch or zero, so the all-false 765 condition indicates we found no difference or zero. */ 766 if (!cleanup_label) 767 cleanup_label = gen_label_rtx (); 768 dst_label = cleanup_label; 769 if (TARGET_P9_VECTOR && checkzero) 770 cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx); 771 else 772 cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx); 773 } 774 else 775 { 776 /* Branch to final return or fall through to cleanup, 777 result is already set to 0. 
*/ 778 dst_label = final_move_label; 779 if (TARGET_P9_VECTOR && checkzero) 780 cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx); 781 else 782 cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx); 783 } 784 785 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label); 786 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, 787 lab_ref, pc_rtx); 788 rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); 789 add_reg_br_prob_note (j2, profile_probability::likely ()); 790 JUMP_LABEL (j2) = dst_label; 791 LABEL_NUSES (dst_label) += 1; 792 793 offset += cmp_bytes; 794 bytes_to_compare -= cmp_bytes; 795 } 796 *p_cleanup_label = cleanup_label; 797 return; 798} 799 800/* Generate the final sequence that identifies the differing 801 byte and generates the final result, taking into account 802 zero bytes: 803 804 P8: 805 vgbbd 0,0 806 vsldoi 0,0,0,9 807 mfvsrd 9,32 808 addi 10,9,-1 # count trailing zero bits 809 andc 9,10,9 810 popcntd 9,9 811 lbzx 10,28,9 # use that offset to load differing byte 812 lbzx 3,29,9 813 subf 3,3,10 # subtract for final result 814 815 P9: 816 vclzlsbb # counts trailing bytes with lsb=0 817 vextublx # extract differing byte 818 819 STR1 is the reg rtx for data from string 1. 820 STR2 is the reg rtx for data from string 2. 821 RESULT is the reg rtx for the comparison result. 822 S1ADDR is the register to use for the base address of the first string. 823 S2ADDR is the register to use for the base address of the second string. 824 ORIG_SRC1 is the unmodified rtx for the first string. 825 ORIG_SRC2 is the unmodified rtx for the second string. 826 OFF_REG is the register to use for the string offset for loads. 827 VEC_RESULT is the rtx for the vector result indicating the byte difference. 
*/ 828 829static void 830emit_final_compare_vec (rtx str1, rtx str2, rtx result, 831 rtx s1addr, rtx s2addr, 832 rtx orig_src1, rtx orig_src2, 833 rtx off_reg, rtx vec_result) 834{ 835 836 if (TARGET_P9_VECTOR) 837 { 838 rtx diffix = gen_reg_rtx (SImode); 839 rtx chr1 = gen_reg_rtx (SImode); 840 rtx chr2 = gen_reg_rtx (SImode); 841 rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0); 842 rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0); 843 emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result)); 844 emit_insn (gen_vextublx (chr1, diffix, str1)); 845 emit_insn (gen_vextublx (chr2, diffix, str2)); 846 do_sub3 (result, chr1_di, chr2_di); 847 } 848 else 849 { 850 gcc_assert (TARGET_P8_VECTOR); 851 rtx diffix = gen_reg_rtx (DImode); 852 rtx result_gbbd = gen_reg_rtx (V16QImode); 853 /* Since each byte of the input is either 00 or FF, the bytes in 854 dw0 and dw1 after vgbbd are all identical to each other. */ 855 emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result)); 856 /* For LE, we shift by 9 and get BA in the low two bytes then CTZ. 857 For BE, we shift by 7 and get AB in the high two bytes then CLZ. */ 858 rtx result_shifted = gen_reg_rtx (V16QImode); 859 int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9; 860 emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd, 861 result_gbbd, GEN_INT (shift_amt))); 862 863 rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0); 864 emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted)); 865 rtx count = gen_reg_rtx (DImode); 866 867 if (BYTES_BIG_ENDIAN) 868 emit_insn (gen_clzdi2 (count, diffix)); 869 else 870 emit_insn (gen_ctzdi2 (count, diffix)); 871 872 /* P8 doesn't have a good solution for extracting one byte from 873 a vsx reg like vextublx on P9 so we just compute the offset 874 of the differing byte and load it from each string. 
*/ 875 do_add3 (off_reg, off_reg, count); 876 877 rtx chr1 = gen_reg_rtx (QImode); 878 rtx chr2 = gen_reg_rtx (QImode); 879 rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg); 880 do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1); 881 rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg); 882 do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2); 883 machine_mode rmode = GET_MODE (result); 884 rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0); 885 rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0); 886 do_sub3 (result, chr1_rm, chr2_rm); 887 } 888 889 return; 890} 891 892/* Expand a block compare operation using loop code, and return true 893 if successful. Return false if we should let the compiler generate 894 normal code, probably a memcmp call. 895 896 OPERANDS[0] is the target (result). 897 OPERANDS[1] is the first source. 898 OPERANDS[2] is the second source. 899 OPERANDS[3] is the length. 900 OPERANDS[4] is the alignment. */ 901bool 902expand_compare_loop (rtx operands[]) 903{ 904 rtx target = operands[0]; 905 rtx orig_src1 = operands[1]; 906 rtx orig_src2 = operands[2]; 907 rtx bytes_rtx = operands[3]; 908 rtx align_rtx = operands[4]; 909 910 /* This case is complicated to handle because the subtract 911 with carry instructions do not generate the 64-bit 912 carry and so we must emit code to calculate it ourselves. 913 We choose not to implement this yet. */ 914 if (TARGET_32BIT && TARGET_POWERPC64) 915 return false; 916 917 /* Allow non-const length. */ 918 int bytes_is_const = CONST_INT_P (bytes_rtx); 919 920 /* This must be a fixed size alignment. */ 921 if (!CONST_INT_P (align_rtx)) 922 return false; 923 924 HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT; 925 HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT; 926 HOST_WIDE_INT minalign = MIN (align1, align2); 927 928 bool isP7 = (rs6000_tune == PROCESSOR_POWER7); 929 930 gcc_assert (GET_MODE (target) == SImode); 931 932 /* Anything to move? 
*/ 933 HOST_WIDE_INT bytes = 0; 934 if (bytes_is_const) 935 bytes = INTVAL (bytes_rtx); 936 937 if (bytes_is_const && bytes == 0) 938 return true; 939 940 /* Limit the amount we compare, if known statically. */ 941 HOST_WIDE_INT max_bytes; 942 switch (rs6000_tune) 943 { 944 case PROCESSOR_POWER7: 945 if (!bytes_is_const) 946 if (minalign < 8) 947 max_bytes = 0; 948 else 949 max_bytes = 128; 950 else 951 if (minalign < 8) 952 max_bytes = 32; 953 else 954 max_bytes = 128; 955 break; 956 case PROCESSOR_POWER8: 957 if (!bytes_is_const) 958 max_bytes = 0; 959 else 960 if (minalign < 8) 961 max_bytes = 128; 962 else 963 max_bytes = 64; 964 break; 965 case PROCESSOR_POWER9: 966 case PROCESSOR_POWER10: 967 if (bytes_is_const) 968 max_bytes = 191; 969 else 970 max_bytes = 0; 971 break; 972 default: 973 max_bytes = 128; 974 } 975 976 /* Allow the option to override the default. */ 977 if (rs6000_block_compare_inline_loop_limit >= 0) 978 max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit; 979 980 if (max_bytes == 0) 981 return false; 982 983 rtx cmp_rem = gen_reg_rtx (word_mode); /* Remainder for library call. */ 984 rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop. */ 985 HOST_WIDE_INT niter; 986 rtx iter = gen_reg_rtx (word_mode); 987 rtx iv1 = gen_reg_rtx (word_mode); 988 rtx iv2 = gen_reg_rtx (word_mode); 989 rtx d1_1 = gen_reg_rtx (word_mode); /* Addr expression src1+iv1 */ 990 rtx d1_2 = gen_reg_rtx (word_mode); /* Addr expression src1+iv2 */ 991 rtx d2_1 = gen_reg_rtx (word_mode); /* Addr expression src2+iv1 */ 992 rtx d2_2 = gen_reg_rtx (word_mode); /* Addr expression src2+iv2 */ 993 994 /* Strip unneeded subreg from length if there is one. */ 995 if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx)) 996 bytes_rtx = SUBREG_REG (bytes_rtx); 997 /* Extend bytes_rtx to word_mode if needed. But, we expect only to 998 maybe have to deal with the case were bytes_rtx is SImode and 999 word_mode is DImode. 
*/ 1000 if (!bytes_is_const) 1001 { 1002 if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode)) 1003 /* Do not expect length longer than word_mode. */ 1004 return false; 1005 else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode)) 1006 { 1007 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); 1008 bytes_rtx = force_reg (word_mode, 1009 gen_rtx_fmt_e (ZERO_EXTEND, word_mode, 1010 bytes_rtx)); 1011 } 1012 else 1013 /* Make sure it's in a register before we get started. */ 1014 bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx); 1015 } 1016 1017 machine_mode load_mode = word_mode; 1018 HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode); 1019 1020 /* Number of bytes per iteration of the unrolled loop. */ 1021 HOST_WIDE_INT loop_bytes = 2 * load_mode_size; 1022 /* max iters and bytes compared in the loop. */ 1023 HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes; 1024 HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes; 1025 int l2lb = floor_log2 (loop_bytes); 1026 1027 if (bytes_is_const && (max_bytes < load_mode_size 1028 || !IN_RANGE (bytes, load_mode_size, max_bytes))) 1029 return false; 1030 1031 bool no_remainder_code = false; 1032 rtx final_label = gen_label_rtx (); 1033 rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); 1034 rtx diff_label = gen_label_rtx (); 1035 rtx library_call_label = NULL; 1036 rtx cleanup_label = gen_label_rtx (); 1037 1038 rtx cr; 1039 1040 rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0)); 1041 rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0)); 1042 1043 /* Difference found is stored here before jump to diff_label. */ 1044 rtx diff = gen_reg_rtx (word_mode); 1045 rtx_insn *j; 1046 1047 /* Example of generated code for 35 bytes aligned 1 byte. 1048 1049 mtctr 8 1050 li 6,0 1051 li 5,8 1052 .L13: 1053 ldbrx 7,3,6 1054 ldbrx 9,10,6 1055 ldbrx 0,3,5 1056 ldbrx 4,10,5 1057 addi 6,6,16 1058 addi 5,5,16 1059 subfc. 9,9,7 1060 bne 0,.L10 1061 subfc. 
9,4,0 1062 bdnzt 2,.L13 1063 bne 0,.L10 1064 add 3,3,6 1065 add 10,10,6 1066 addi 9,3,-5 1067 ldbrx 7,0,9 1068 addi 9,10,-5 1069 ldbrx 9,0,9 1070 subfc 9,9,7 1071 .p2align 4,,15 1072 .L10: 1073 popcntd 9,9 1074 subfe 10,10,10 1075 or 9,9,10 1076 1077 Compiled with -fno-reorder-blocks for clarity. */ 1078 1079 /* Structure of what we're going to do: 1080 Two separate lengths: what we will compare before bailing to library 1081 call (max_bytes), and the total length to be checked. 1082 if length <= 16, branch to linear cleanup code starting with 1083 remainder length check (length not known at compile time) 1084 set up 2 iv's and load count reg, compute remainder length 1085 unrollx2 compare loop 1086 if loop exit due to a difference, branch to difference handling code 1087 if remainder length < 8, branch to final cleanup compare 1088 load and compare 8B 1089 final cleanup comparison (depends on alignment and length) 1090 load 8B, shift off bytes past length, compare 1091 load 8B ending at last byte and compare 1092 load/compare 1 byte at a time (short block abutting 4k boundary) 1093 difference handling, 64->32 conversion 1094 final result 1095 branch around memcmp call 1096 memcmp library call 1097 */ 1098 1099 /* If bytes is not const, compare length and branch directly 1100 to the cleanup code that can handle 0-16 bytes if length 1101 is >= 16. Stash away bytes-max_bytes for the library call. */ 1102 if (bytes_is_const) 1103 { 1104 /* These need to be set for some of the places we may jump to. 
*/ 1105 if (bytes > max_bytes) 1106 { 1107 no_remainder_code = true; 1108 niter = max_loop_iter; 1109 library_call_label = gen_label_rtx (); 1110 } 1111 else 1112 { 1113 niter = bytes / loop_bytes; 1114 } 1115 emit_move_insn (iter, GEN_INT (niter)); 1116 emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes)); 1117 emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes)); 1118 } 1119 else 1120 { 1121 library_call_label = gen_label_rtx (); 1122 1123 /* If we go to the cleanup code, it expects length to be in cmp_rem. */ 1124 emit_move_insn (cmp_rem, bytes_rtx); 1125 1126 /* Check for > max_bytes bytes. We want to bail out as quickly as 1127 possible if we have to go over to memcmp. */ 1128 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes), 1129 NULL_RTX, library_call_label, profile_probability::even ()); 1130 1131 /* Check for < loop_bytes bytes. */ 1132 do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes), 1133 NULL_RTX, cleanup_label, profile_probability::even ()); 1134 1135 /* Loop compare bytes and iterations if bytes>max_bytes. */ 1136 rtx mb_reg = gen_reg_rtx (word_mode); 1137 emit_move_insn (mb_reg, GEN_INT (max_loop_bytes)); 1138 rtx mi_reg = gen_reg_rtx (word_mode); 1139 emit_move_insn (mi_reg, GEN_INT (max_loop_iter)); 1140 1141 /* Compute number of loop iterations if bytes <= max_bytes. */ 1142 if (word_mode == DImode) 1143 emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb))); 1144 else 1145 emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb))); 1146 1147 /* Compute bytes to compare in loop if bytes <= max_bytes. */ 1148 rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb); 1149 if (word_mode == DImode) 1150 { 1151 emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask)); 1152 } 1153 else 1154 { 1155 emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask)); 1156 } 1157 1158 /* Check for bytes <= max_bytes. */ 1159 if (TARGET_ISEL) 1160 { 1161 /* P9 has fast isel so we use one compare and two isel. 
*/ 1162 cr = gen_reg_rtx (CCmode); 1163 rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx, 1164 GEN_INT (max_bytes)); 1165 emit_move_insn (cr, compare_rtx); 1166 rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx); 1167 do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr); 1168 do_isel (iter, cmp_rtx, iter, mi_reg, cr); 1169 } 1170 else 1171 { 1172 rtx lab_after = gen_label_rtx (); 1173 do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes), 1174 NULL_RTX, lab_after, profile_probability::even ()); 1175 emit_move_insn (loop_cmp, mb_reg); 1176 emit_move_insn (iter, mi_reg); 1177 emit_label (lab_after); 1178 } 1179 1180 /* Now compute remainder bytes which isn't used until after the loop. */ 1181 do_sub3 (cmp_rem, bytes_rtx, loop_cmp); 1182 } 1183 1184 rtx dcond = NULL_RTX; /* Used for when we jump to diff_label. */ 1185 /* For p9 we need to have just one of these as multiple places define 1186 it and it gets used by the setb at the end. */ 1187 if (TARGET_P9_MISC) 1188 dcond = gen_reg_rtx (CCUNSmode); 1189 1190 if (!bytes_is_const || bytes >= loop_bytes) 1191 { 1192 /* It should not be possible to come here if remaining bytes is 1193 < 16 in the runtime case either. Compute number of loop 1194 iterations. We compare 2*word_mode per iteration so 16B for 1195 64-bit code and 8B for 32-bit. Set up two induction 1196 variables and load count register. */ 1197 1198 /* HACK ALERT: create hard reg for CTR here. If we just use a 1199 pseudo, cse will get rid of it and then the allocator will 1200 see it used in the lshr above and won't give us ctr. 
*/ 1201 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); 1202 emit_move_insn (ctr, iter); 1203 emit_move_insn (diff, GEN_INT (0)); 1204 emit_move_insn (iv1, GEN_INT (0)); 1205 emit_move_insn (iv2, GEN_INT (load_mode_size)); 1206 1207 /* inner loop to compare 2*word_mode */ 1208 rtx loop_top_label = gen_label_rtx (); 1209 emit_label (loop_top_label); 1210 1211 rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1); 1212 rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1); 1213 1214 do_load_for_compare_from_addr (load_mode, d1_1, 1215 src1_ix1, orig_src1); 1216 do_load_for_compare_from_addr (load_mode, d2_1, 1217 src2_ix1, orig_src2); 1218 do_add3 (iv1, iv1, GEN_INT (loop_bytes)); 1219 1220 rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2); 1221 rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2); 1222 1223 do_load_for_compare_from_addr (load_mode, d1_2, 1224 src1_ix2, orig_src1); 1225 do_load_for_compare_from_addr (load_mode, d2_2, 1226 src2_ix2, orig_src2); 1227 do_add3 (iv2, iv2, GEN_INT (loop_bytes)); 1228 1229 if (TARGET_P9_MISC) 1230 { 1231 /* Generate a compare, and convert with a setb later. */ 1232 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); 1233 emit_insn (gen_rtx_SET (dcond, cmp)); 1234 } 1235 else 1236 { 1237 dcond = gen_reg_rtx (CCmode); 1238 if (word_mode == DImode) 1239 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); 1240 else 1241 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); 1242 } 1243 1244 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, 1245 dcond, diff_label, profile_probability::unlikely ()); 1246 1247 if (TARGET_P9_MISC) 1248 { 1249 /* Generate a compare, and convert with a setb later. 
*/ 1250 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2); 1251 emit_insn (gen_rtx_SET (dcond, cmp)); 1252 } 1253 else 1254 { 1255 dcond = gen_reg_rtx (CCmode); 1256 if (word_mode == DImode) 1257 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond)); 1258 else 1259 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond)); 1260 } 1261 1262 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2); 1263 if (TARGET_64BIT) 1264 j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr, 1265 eqrtx, dcond)); 1266 else 1267 j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr, 1268 eqrtx, dcond)); 1269 add_reg_br_prob_note (j, profile_probability::likely ()); 1270 JUMP_LABEL (j) = loop_top_label; 1271 LABEL_NUSES (loop_top_label) += 1; 1272 } 1273 1274 HOST_WIDE_INT bytes_remaining = 0; 1275 if (bytes_is_const) 1276 bytes_remaining = (bytes % loop_bytes); 1277 1278 /* If diff is nonzero, branch to difference handling 1279 code. If we exit here with a nonzero diff, it is 1280 because the second word differed. */ 1281 if (TARGET_P9_MISC) 1282 do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond, 1283 diff_label, profile_probability::unlikely ()); 1284 else 1285 do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX, 1286 diff_label, profile_probability::unlikely ()); 1287 1288 if (library_call_label != NULL && bytes_is_const && bytes > max_bytes) 1289 { 1290 /* If the length is known at compile time, then we will always 1291 have a remainder to go to the library call with. 
*/ 1292 rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label); 1293 j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref)); 1294 JUMP_LABEL (j) = library_call_label; 1295 LABEL_NUSES (library_call_label) += 1; 1296 emit_barrier (); 1297 } 1298 1299 if (bytes_is_const && bytes_remaining == 0) 1300 { 1301 /* No remainder and if we are here then diff is 0 so just return 0 */ 1302 if (TARGET_64BIT) 1303 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); 1304 else 1305 emit_move_insn (target, diff); 1306 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); 1307 JUMP_LABEL (j) = final_label; 1308 LABEL_NUSES (final_label) += 1; 1309 emit_barrier (); 1310 } 1311 else if (!no_remainder_code) 1312 { 1313 /* Update addresses to point to the next word to examine. */ 1314 do_add3 (src1_addr, src1_addr, iv1); 1315 do_add3 (src2_addr, src2_addr, iv1); 1316 1317 emit_label (cleanup_label); 1318 1319 if (!bytes_is_const) 1320 { 1321 /* If we're dealing with runtime length, we have to check if 1322 it's zero after the loop. When length is known at compile 1323 time the no-remainder condition is dealt with above. By 1324 doing this after cleanup_label, we also deal with the 1325 case where length is 0 at the start and we bypass the 1326 loop with a branch to cleanup_label. */ 1327 emit_move_insn (target, const0_rtx); 1328 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, 1329 NULL_RTX, final_label, profile_probability::unlikely ()); 1330 } 1331 1332 rtx final_cleanup = gen_label_rtx (); 1333 rtx cmp_rem_before = gen_reg_rtx (word_mode); 1334 /* Compare one more word_mode chunk if needed. */ 1335 if (!bytes_is_const || bytes_remaining >= load_mode_size) 1336 { 1337 /* If remainder length < word length, branch to final 1338 cleanup compare. 
*/ 1339 1340 if (!bytes_is_const) 1341 { 1342 do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size), 1343 NULL_RTX, final_cleanup, profile_probability::even ()); 1344 } 1345 1346 /* load and compare 8B */ 1347 do_load_for_compare_from_addr (load_mode, d1_1, 1348 src1_addr, orig_src1); 1349 do_load_for_compare_from_addr (load_mode, d2_1, 1350 src2_addr, orig_src2); 1351 1352 /* Compare the word, see if we need to do the last partial. */ 1353 if (TARGET_P9_MISC) 1354 { 1355 /* Generate a compare, and convert with a setb later. */ 1356 rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1); 1357 emit_insn (gen_rtx_SET (dcond, cmp)); 1358 } 1359 else 1360 { 1361 dcond = gen_reg_rtx (CCmode); 1362 if (word_mode == DImode) 1363 emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond)); 1364 else 1365 emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond)); 1366 } 1367 1368 do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX, 1369 dcond, diff_label, profile_probability::even ()); 1370 1371 do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size)); 1372 do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size)); 1373 emit_move_insn (cmp_rem_before, cmp_rem); 1374 do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size)); 1375 if (bytes_is_const) 1376 bytes_remaining -= load_mode_size; 1377 else 1378 /* See if remaining length is now zero. We previously set 1379 target to 0 so we can just jump to the end. */ 1380 do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX, 1381 final_label, profile_probability::unlikely ()); 1382 } 1383 1384 /* Cases: 1385 bytes_is_const 1386 We can always shift back to do an overlapping compare 1387 of the last chunk because we know length >= 8. 
1388 1389 !bytes_is_const 1390 align>=load_mode_size 1391 Read word_mode and mask 1392 align<load_mode_size 1393 avoid stepping past end 1394 1395 Three strategies: 1396 * decrement address and do overlapping compare 1397 * read word_mode and mask 1398 * carefully avoid crossing 4k boundary 1399 */ 1400 1401 if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7)) 1402 && align1 >= load_mode_size && align2 >= load_mode_size) 1403 { 1404 /* Alignment is larger than word_mode so we do not need to be 1405 concerned with extra page crossings. But, we do not know 1406 that the length is larger than load_mode_size so we might 1407 end up compareing against data before the block if we try 1408 an overlapping compare. Also we use this on P7 for fixed length 1409 remainder because P7 doesn't like overlapping unaligned. 1410 Strategy: load 8B, shift off bytes past length, and compare. */ 1411 emit_label (final_cleanup); 1412 do_load_mask_compare (load_mode, diff, cmp_rem, dcond, 1413 src1_addr, src2_addr, orig_src1, orig_src2); 1414 } 1415 else if (bytes_remaining && bytes_is_const) 1416 { 1417 /* We do not do loop expand if length < 32 so we know at the 1418 end we can do an overlapping compare. 1419 Strategy: shift address back and do word_mode load that 1420 ends at the end of the block. */ 1421 emit_label (final_cleanup); 1422 do_overlap_load_compare (load_mode, true, bytes_remaining, diff, 1423 cmp_rem, dcond, src1_addr, src2_addr, 1424 orig_src1, orig_src2); 1425 } 1426 else if (!bytes_is_const) 1427 { 1428 rtx handle4k_label = gen_label_rtx (); 1429 rtx nonconst_overlap = gen_label_rtx (); 1430 emit_label (nonconst_overlap); 1431 1432 /* Here we have to handle the case where whe have runtime 1433 length which may be too short for overlap compare, and 1434 alignment is not at least load_mode_size so we have to 1435 tread carefully to avoid stepping across 4k boundaries. 
*/ 1436 1437 /* If the length after the loop was larger than word_mode 1438 size, we can just do an overlapping compare and we're 1439 done. We fall through to this code from the word_mode 1440 compare that preceeds this. */ 1441 do_overlap_load_compare (load_mode, false, 0, diff, 1442 cmp_rem, dcond, src1_addr, src2_addr, 1443 orig_src1, orig_src2); 1444 1445 rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label); 1446 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); 1447 JUMP_LABEL (j) = diff_label; 1448 LABEL_NUSES (diff_label) += 1; 1449 emit_barrier (); 1450 1451 /* If we couldn't do the overlap compare we have to be more 1452 careful of the 4k boundary. Test to see if either 1453 address is less than word_mode_size away from a 4k 1454 boundary. If not, then we can do a load/shift/compare 1455 and we are done. We come to this code if length was less 1456 than word_mode_size. */ 1457 1458 emit_label (final_cleanup); 1459 1460 /* We can still avoid the slow case if the length was larger 1461 than one loop iteration, in which case go do the overlap 1462 load compare path. 
*/ 1463 do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes), 1464 NULL_RTX, nonconst_overlap, profile_probability::even ()); 1465 1466 rtx rem4k = gen_reg_rtx (word_mode); 1467 rtx dist1 = gen_reg_rtx (word_mode); 1468 rtx dist2 = gen_reg_rtx (word_mode); 1469 do_sub3 (rem4k, GEN_INT (4096), cmp_rem); 1470 if (word_mode == SImode) 1471 emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff))); 1472 else 1473 emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff))); 1474 do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX, 1475 handle4k_label, profile_probability::very_unlikely ()); 1476 if (word_mode == SImode) 1477 emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff))); 1478 else 1479 emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff))); 1480 do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX, 1481 handle4k_label, profile_probability::very_unlikely ()); 1482 1483 /* We don't have a 4k boundary to deal with, so do 1484 a load/shift/compare and jump to diff. */ 1485 1486 do_load_mask_compare (load_mode, diff, cmp_rem, dcond, 1487 src1_addr, src2_addr, orig_src1, orig_src2); 1488 1489 j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref)); 1490 JUMP_LABEL (j) = diff_label; 1491 LABEL_NUSES (diff_label) += 1; 1492 emit_barrier (); 1493 1494 /* Finally in the unlikely case we are inching up to a 1495 4k boundary we use a compact lbzx/compare loop to do 1496 it a byte at a time. 
*/ 1497 1498 emit_label (handle4k_label); 1499 1500 rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO); 1501 emit_move_insn (ctr, cmp_rem); 1502 rtx ixreg = gen_reg_rtx (Pmode); 1503 emit_move_insn (ixreg, const0_rtx); 1504 1505 rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg); 1506 rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg); 1507 rtx d1 = gen_reg_rtx (word_mode); 1508 rtx d2 = gen_reg_rtx (word_mode); 1509 1510 rtx fc_loop = gen_label_rtx (); 1511 emit_label (fc_loop); 1512 1513 do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1); 1514 do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2); 1515 1516 do_add3 (ixreg, ixreg, const1_rtx); 1517 1518 rtx cond = gen_reg_rtx (CCmode); 1519 rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2); 1520 rs6000_emit_dot_insn (diff, subexpr, 2, cond); 1521 1522 rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2); 1523 if (TARGET_64BIT) 1524 j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr, 1525 eqrtx, cond)); 1526 else 1527 j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr, 1528 eqrtx, cond)); 1529 add_reg_br_prob_note (j, profile_probability::likely ()); 1530 JUMP_LABEL (j) = fc_loop; 1531 LABEL_NUSES (fc_loop) += 1; 1532 1533 if (TARGET_64BIT) 1534 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); 1535 else 1536 emit_move_insn (target, diff); 1537 1538 /* Since we are comparing bytes, the difference can be used 1539 as the final result and we are done here. */ 1540 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); 1541 JUMP_LABEL (j) = final_label; 1542 LABEL_NUSES (final_label) += 1; 1543 emit_barrier (); 1544 } 1545 } 1546 1547 emit_label (diff_label); 1548 /* difference handling, 64->32 conversion */ 1549 1550 /* We need to produce DI result from sub, then convert to target SI 1551 while maintaining <0 / ==0 / >0 properties. 
This sequence works: 1552 subfc L,A,B 1553 subfe H,H,H 1554 popcntd L,L 1555 rldimi L,H,6,0 1556 1557 This is an alternate one Segher cooked up if somebody 1558 wants to expand this for something that doesn't have popcntd: 1559 subfc L,a,b 1560 subfe H,x,x 1561 addic t,L,-1 1562 subfe v,t,L 1563 or z,v,H 1564 1565 And finally, p9 can just do this: 1566 cmpld A,B 1567 setb r */ 1568 1569 if (TARGET_P9_MISC) 1570 emit_insn (gen_setb_unsigned (target, dcond)); 1571 else 1572 { 1573 if (TARGET_64BIT) 1574 { 1575 rtx tmp_reg_ca = gen_reg_rtx (DImode); 1576 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); 1577 emit_insn (gen_popcntddi2 (diff, diff)); 1578 emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca)); 1579 emit_insn (gen_movsi (target, gen_lowpart (SImode, diff))); 1580 } 1581 else 1582 { 1583 rtx tmp_reg_ca = gen_reg_rtx (SImode); 1584 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); 1585 emit_insn (gen_popcntdsi2 (diff, diff)); 1586 emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca)); 1587 } 1588 } 1589 1590 if (library_call_label != NULL) 1591 { 1592 /* Branch around memcmp call. */ 1593 j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref)); 1594 JUMP_LABEL (j) = final_label; 1595 LABEL_NUSES (final_label) += 1; 1596 emit_barrier (); 1597 1598 /* Make memcmp library call. cmp_rem is the remaining bytes that 1599 were compared and cmp_rem is the expected amount to be compared 1600 by memcmp. If we don't find a difference in the loop compare, do 1601 the library call directly instead of doing a small compare just 1602 to get to an arbitrary boundary before calling it anyway. 1603 Also, update addresses to point to the next word to examine. 
*/ 1604 emit_label (library_call_label); 1605 1606 rtx len_rtx = gen_reg_rtx (word_mode); 1607 if (bytes_is_const) 1608 { 1609 emit_move_insn (len_rtx, cmp_rem); 1610 do_add3 (src1_addr, src1_addr, iv1); 1611 do_add3 (src2_addr, src2_addr, iv1); 1612 } 1613 else 1614 emit_move_insn (len_rtx, bytes_rtx); 1615 1616 tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP); 1617 emit_library_call_value (XEXP (DECL_RTL (fun), 0), 1618 target, LCT_NORMAL, GET_MODE (target), 1619 src1_addr, Pmode, 1620 src2_addr, Pmode, 1621 len_rtx, GET_MODE (len_rtx)); 1622 } 1623 1624 /* emit final_label */ 1625 emit_label (final_label); 1626 return true; 1627} 1628 1629/* Generate code to convert a DImode-plus-carry subtract result into 1630 a SImode result that has the same <0 / ==0 / >0 properties to 1631 produce the final result from memcmp. 1632 1633 TARGET is the rtx for the register to receive the memcmp result. 1634 SUB_RESULT is the rtx for the register contining the subtract result. */ 1635 1636void 1637generate_6432_conversion(rtx target, rtx sub_result) 1638{ 1639 /* We need to produce DI result from sub, then convert to target SI 1640 while maintaining <0 / ==0 / >0 properties. 
This sequence works: 1641 subfc L,A,B 1642 subfe H,H,H 1643 popcntd L,L 1644 rldimi L,H,6,0 1645 1646 This is an alternate one Segher cooked up if somebody 1647 wants to expand this for something that doesn't have popcntd: 1648 subfc L,a,b 1649 subfe H,x,x 1650 addic t,L,-1 1651 subfe v,t,L 1652 or z,v,H 1653 1654 And finally, p9 can just do this: 1655 cmpld A,B 1656 setb r */ 1657 1658 if (TARGET_64BIT) 1659 { 1660 rtx tmp_reg_ca = gen_reg_rtx (DImode); 1661 emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca)); 1662 rtx popcnt = gen_reg_rtx (DImode); 1663 emit_insn (gen_popcntddi2 (popcnt, sub_result)); 1664 rtx tmp2 = gen_reg_rtx (DImode); 1665 emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca)); 1666 emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2))); 1667 } 1668 else 1669 { 1670 rtx tmp_reg_ca = gen_reg_rtx (SImode); 1671 emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca)); 1672 rtx popcnt = gen_reg_rtx (SImode); 1673 emit_insn (gen_popcntdsi2 (popcnt, sub_result)); 1674 emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca)); 1675 } 1676} 1677 1678/* Generate memcmp expansion using in-line non-loop GPR instructions. 1679 The bool return indicates whether code for a 64->32 conversion 1680 should be generated. 1681 1682 BYTES is the number of bytes to be compared. 1683 BASE_ALIGN is the minimum alignment for both blocks to compare. 1684 ORIG_SRC1 is the original pointer to the first block to compare. 1685 ORIG_SRC2 is the original pointer to the second block to compare. 1686 SUB_RESULT is the reg rtx for the result from the final subtract. 1687 COND is rtx for a condition register that will be used for the final 1688 compare on power9 or better. 1689 FINAL_RESULT is the reg rtx for the final memcmp result. 1690 P_CONVERT_LABEL is a pointer to rtx that will be used to store the 1691 label generated for a branch to the 64->32 code, if such a branch 1692 is needed. 
1693 P_FINAL_LABEL is a pointer to rtx that will be used to store the label 1694 for the end of the memcmp if a branch there is needed. 1695*/ 1696 1697bool 1698expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align, 1699 rtx orig_src1, rtx orig_src2, 1700 rtx sub_result, rtx cond, rtx final_result, 1701 rtx *p_convert_label, rtx *p_final_label) 1702{ 1703 /* Example of generated code for 18 bytes aligned 1 byte. 1704 Compiled with -fno-reorder-blocks for clarity. 1705 ldbrx 10,31,8 1706 ldbrx 9,7,8 1707 subfc. 9,9,10 1708 bne 0,.L6487 1709 addi 9,12,8 1710 addi 5,11,8 1711 ldbrx 10,0,9 1712 ldbrx 9,0,5 1713 subfc. 9,9,10 1714 bne 0,.L6487 1715 addi 9,12,16 1716 lhbrx 10,0,9 1717 addi 9,11,16 1718 lhbrx 9,0,9 1719 subf 9,9,10 1720 b .L6488 1721 .p2align 4,,15 1722 .L6487: #convert_label 1723 popcntd 9,9 1724 subfe 10,10,10 1725 or 9,9,10 1726 .L6488: #final_label 1727 extsw 10,9 1728 1729 We start off with DImode for two blocks that jump to the DI->SI conversion 1730 if the difference is found there, then a final block of HImode that skips 1731 the DI->SI conversion. */ 1732 1733 unsigned HOST_WIDE_INT offset = 0; 1734 unsigned int load_mode_size; 1735 HOST_WIDE_INT cmp_bytes = 0; 1736 rtx src1 = orig_src1; 1737 rtx src2 = orig_src2; 1738 rtx tmp_reg_src1 = gen_reg_rtx (word_mode); 1739 rtx tmp_reg_src2 = gen_reg_rtx (word_mode); 1740 bool need_6432_conv = false; 1741 rtx convert_label = NULL; 1742 rtx final_label = NULL; 1743 machine_mode load_mode; 1744 1745 while (bytes > 0) 1746 { 1747 unsigned int align = compute_current_alignment (base_align, offset); 1748 load_mode = select_block_compare_mode (offset, bytes, align); 1749 load_mode_size = GET_MODE_SIZE (load_mode); 1750 if (bytes >= load_mode_size) 1751 cmp_bytes = load_mode_size; 1752 else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED) 1753 { 1754 /* Move this load back so it doesn't go past the end. 1755 P8/P9 can do this efficiently. 
*/ 1756 unsigned int extra_bytes = load_mode_size - bytes; 1757 cmp_bytes = bytes; 1758 if (extra_bytes < offset) 1759 { 1760 offset -= extra_bytes; 1761 cmp_bytes = load_mode_size; 1762 bytes = cmp_bytes; 1763 } 1764 } 1765 else 1766 /* P7 and earlier can't do the overlapping load trick fast, 1767 so this forces a non-overlapping load and a shift to get 1768 rid of the extra bytes. */ 1769 cmp_bytes = bytes; 1770 1771 src1 = adjust_address (orig_src1, load_mode, offset); 1772 src2 = adjust_address (orig_src2, load_mode, offset); 1773 1774 if (!REG_P (XEXP (src1, 0))) 1775 { 1776 rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0)); 1777 src1 = replace_equiv_address (src1, src1_reg); 1778 } 1779 set_mem_size (src1, load_mode_size); 1780 1781 if (!REG_P (XEXP (src2, 0))) 1782 { 1783 rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0)); 1784 src2 = replace_equiv_address (src2, src2_reg); 1785 } 1786 set_mem_size (src2, load_mode_size); 1787 1788 do_load_for_compare (tmp_reg_src1, src1, load_mode); 1789 do_load_for_compare (tmp_reg_src2, src2, load_mode); 1790 1791 if (cmp_bytes < load_mode_size) 1792 { 1793 /* Shift unneeded bytes off. */ 1794 rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes)); 1795 if (word_mode == DImode) 1796 { 1797 emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh)); 1798 emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh)); 1799 } 1800 else 1801 { 1802 emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh)); 1803 emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh)); 1804 } 1805 } 1806 1807 int remain = bytes - cmp_bytes; 1808 if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode)) 1809 { 1810 /* Final_result is larger than load size so we don't need to 1811 reduce result size. */ 1812 1813 /* We previously did a block that need 64->32 conversion but 1814 the current block does not, so a label is needed to jump 1815 to the end. 
*/ 1816 if (need_6432_conv && !final_label) 1817 final_label = gen_label_rtx (); 1818 1819 if (remain > 0) 1820 { 1821 /* This is not the last block, branch to the end if the result 1822 of this subtract is not zero. */ 1823 if (!final_label) 1824 final_label = gen_label_rtx (); 1825 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); 1826 rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2); 1827 rtx cr = gen_reg_rtx (CCmode); 1828 rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr); 1829 emit_insn (gen_movsi (final_result, 1830 gen_lowpart (SImode, tmp_reg_src2))); 1831 rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx); 1832 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, 1833 fin_ref, pc_rtx); 1834 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); 1835 add_reg_br_prob_note (j, profile_probability::unlikely ()); 1836 JUMP_LABEL (j) = final_label; 1837 LABEL_NUSES (final_label) += 1; 1838 } 1839 else 1840 { 1841 if (word_mode == DImode) 1842 { 1843 emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1, 1844 tmp_reg_src2)); 1845 emit_insn (gen_movsi (final_result, 1846 gen_lowpart (SImode, tmp_reg_src2))); 1847 } 1848 else 1849 emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2)); 1850 1851 if (final_label) 1852 { 1853 rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label); 1854 rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref)); 1855 JUMP_LABEL (j) = final_label; 1856 LABEL_NUSES (final_label) += 1; 1857 emit_barrier (); 1858 } 1859 } 1860 } 1861 else 1862 { 1863 /* Do we need a 64->32 conversion block? We need the 64->32 1864 conversion even if final_result size == load_mode size because 1865 the subtract generates one extra bit. */ 1866 need_6432_conv = true; 1867 1868 if (remain > 0) 1869 { 1870 if (!convert_label) 1871 convert_label = gen_label_rtx (); 1872 1873 /* Compare to zero and branch to convert_label if not zero. 
*/ 1874 rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label); 1875 if (TARGET_P9_MISC) 1876 { 1877 /* Generate a compare, and convert with a setb later. 1878 Use cond that is passed in because the caller needs 1879 to use it for the 64->32 conversion later. */ 1880 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, 1881 tmp_reg_src2); 1882 emit_insn (gen_rtx_SET (cond, cmp)); 1883 } 1884 else 1885 { 1886 /* Generate a subfc. and use the longer sequence for 1887 conversion. Cond is not used outside this 1888 function in this case. */ 1889 cond = gen_reg_rtx (CCmode); 1890 if (TARGET_64BIT) 1891 emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2, 1892 tmp_reg_src1, cond)); 1893 else 1894 emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2, 1895 tmp_reg_src1, cond)); 1896 } 1897 1898 rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx); 1899 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx, 1900 cvt_ref, pc_rtx); 1901 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); 1902 add_reg_br_prob_note (j, profile_probability::likely ()); 1903 JUMP_LABEL (j) = convert_label; 1904 LABEL_NUSES (convert_label) += 1; 1905 } 1906 else 1907 { 1908 /* Just do the subtract/compare. Since this is the last block 1909 the convert code will be generated immediately following. */ 1910 if (TARGET_P9_MISC) 1911 { 1912 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1, 1913 tmp_reg_src2); 1914 emit_insn (gen_rtx_SET (cond, cmp)); 1915 } 1916 else 1917 if (TARGET_64BIT) 1918 emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2, 1919 tmp_reg_src1)); 1920 else 1921 emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2, 1922 tmp_reg_src1)); 1923 } 1924 } 1925 1926 offset += cmp_bytes; 1927 bytes -= cmp_bytes; 1928 } 1929 1930 if (convert_label) 1931 *p_convert_label = convert_label; 1932 if (final_label) 1933 *p_final_label = final_label; 1934 return need_6432_conv; 1935} 1936 1937/* Expand a block compare operation, and return true if successful. 
1938 Return false if we should let the compiler generate normal code, 1939 probably a memcmp call. 1940 1941 OPERANDS[0] is the target (result). 1942 OPERANDS[1] is the first source. 1943 OPERANDS[2] is the second source. 1944 OPERANDS[3] is the length. 1945 OPERANDS[4] is the alignment. */ 1946bool 1947expand_block_compare (rtx operands[]) 1948{ 1949 rtx target = operands[0]; 1950 rtx orig_src1 = operands[1]; 1951 rtx orig_src2 = operands[2]; 1952 rtx bytes_rtx = operands[3]; 1953 rtx align_rtx = operands[4]; 1954 1955 /* This case is complicated to handle because the subtract 1956 with carry instructions do not generate the 64-bit 1957 carry and so we must emit code to calculate it ourselves. 1958 We choose not to implement this yet. */ 1959 if (TARGET_32BIT && TARGET_POWERPC64) 1960 return false; 1961 1962 bool isP7 = (rs6000_tune == PROCESSOR_POWER7); 1963 1964 /* Allow this param to shut off all expansion. */ 1965 if (rs6000_block_compare_inline_limit == 0) 1966 return false; 1967 1968 /* targetm.slow_unaligned_access -- don't do unaligned stuff. 1969 However slow_unaligned_access returns true on P7 even though the 1970 performance of this code is good there. */ 1971 if (!isP7 1972 && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1)) 1973 || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))) 1974 return false; 1975 1976 /* Unaligned l*brx traps on P7 so don't do this. However this should 1977 not affect much because LE isn't really supported on P7 anyway. */ 1978 if (isP7 && !BYTES_BIG_ENDIAN) 1979 return false; 1980 1981 /* If this is not a fixed size compare, try generating loop code and 1982 if that fails just call memcmp. */ 1983 if (!CONST_INT_P (bytes_rtx)) 1984 return expand_compare_loop (operands); 1985 1986 /* This must be a fixed size alignment. 
*/ 1987 if (!CONST_INT_P (align_rtx)) 1988 return false; 1989 1990 unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT; 1991 1992 gcc_assert (GET_MODE (target) == SImode); 1993 1994 /* Anything to move? */ 1995 unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx); 1996 if (bytes == 0) 1997 return true; 1998 1999 /* P7/P8 code uses cond for subfc. but P9 uses 2000 it for cmpld which needs CCUNSmode. */ 2001 rtx cond = NULL; 2002 if (TARGET_P9_MISC) 2003 cond = gen_reg_rtx (CCUNSmode); 2004 2005 /* Is it OK to use vec/vsx for this. TARGET_VSX means we have at 2006 least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is 2007 at least POWER8. That way we can rely on overlapping compares to 2008 do the final comparison of less than 16 bytes. Also I do not 2009 want to deal with making this work for 32 bits. In addition, we 2010 have to make sure that we have at least P8_VECTOR (we don't allow 2011 P9_VECTOR without P8_VECTOR). */ 2012 int use_vec = (bytes >= 33 && !TARGET_32BIT 2013 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR); 2014 2015 /* We don't want to generate too much code. The loop code can take 2016 over for lengths greater than 31 bytes. */ 2017 unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit; 2018 2019 /* Don't generate too much code if vsx was disabled. */ 2020 if (!use_vec && max_bytes > 1) 2021 max_bytes = ((max_bytes + 1) / 2) - 1; 2022 2023 if (!IN_RANGE (bytes, 1, max_bytes)) 2024 return expand_compare_loop (operands); 2025 2026 /* The code generated for p7 and older is not faster than glibc 2027 memcmp if alignment is small and length is not short, so bail 2028 out to avoid those conditions. 
*/ 2029 if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED 2030 && ((base_align == 1 && bytes > 16) 2031 || (base_align == 2 && bytes > 32))) 2032 return false; 2033 2034 rtx final_label = NULL; 2035 2036 if (use_vec) 2037 { 2038 rtx final_move_label = gen_label_rtx (); 2039 rtx s1addr = gen_reg_rtx (Pmode); 2040 rtx s2addr = gen_reg_rtx (Pmode); 2041 rtx off_reg = gen_reg_rtx (Pmode); 2042 rtx cleanup_label = NULL; 2043 rtx vec_result = gen_reg_rtx (V16QImode); 2044 rtx s1data = gen_reg_rtx (V16QImode); 2045 rtx s2data = gen_reg_rtx (V16QImode); 2046 rtx result_reg = gen_reg_rtx (word_mode); 2047 emit_move_insn (result_reg, GEN_INT (0)); 2048 2049 expand_cmp_vec_sequence (bytes, orig_src1, orig_src2, 2050 s1addr, s2addr, off_reg, s1data, s2data, 2051 vec_result, false, 2052 &cleanup_label, final_move_label, false); 2053 2054 if (cleanup_label) 2055 emit_label (cleanup_label); 2056 2057 emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result)); 2058 2059 emit_final_compare_vec (s1data, s2data, result_reg, 2060 s1addr, s2addr, orig_src1, orig_src2, 2061 off_reg, vec_result); 2062 2063 emit_label (final_move_label); 2064 emit_insn (gen_movsi (target, 2065 gen_lowpart (SImode, result_reg))); 2066 } 2067 else 2068 { /* generate GPR code */ 2069 2070 rtx convert_label = NULL; 2071 rtx sub_result = gen_reg_rtx (word_mode); 2072 bool need_6432_conversion = 2073 expand_block_compare_gpr(bytes, base_align, 2074 orig_src1, orig_src2, 2075 sub_result, cond, target, 2076 &convert_label, &final_label); 2077 2078 if (need_6432_conversion) 2079 { 2080 if (convert_label) 2081 emit_label (convert_label); 2082 if (TARGET_P9_MISC) 2083 emit_insn (gen_setb_unsigned (target, cond)); 2084 else 2085 generate_6432_conversion(target, sub_result); 2086 } 2087 } 2088 2089 if (final_label) 2090 emit_label (final_label); 2091 2092 return true; 2093} 2094 2095/* Generate page crossing check and branch code to set up for 2096 strncmp when we don't have DI alignment. 
2097 STRNCMP_LABEL is the label to branch if there is a page crossing. 2098 SRC_ADDR is the string address to be examined. 2099 BYTES is the max number of bytes to compare. */ 2100static void 2101expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes) 2102{ 2103 rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label); 2104 rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr)); 2105 do_and3 (src_pgoff, src_addr, GEN_INT (0xfff)); 2106 rtx cond = gen_reg_rtx (CCmode); 2107 emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff, 2108 GEN_INT (4096 - bytes))); 2109 2110 rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx); 2111 2112 rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, 2113 lab_ref, pc_rtx); 2114 rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse)); 2115 add_reg_br_prob_note (j, profile_probability::unlikely ()); 2116 JUMP_LABEL (j) = strncmp_label; 2117 LABEL_NUSES (strncmp_label) += 1; 2118} 2119 2120/* Generate the sequence of compares for strcmp/strncmp using gpr instructions. 2121 BYTES_TO_COMPARE is the number of bytes to be compared. 2122 BASE_ALIGN is the smaller of the alignment of the two strings. 2123 ORIG_SRC1 is the unmodified rtx for the first string. 2124 ORIG_SRC2 is the unmodified rtx for the second string. 2125 TMP_REG_SRC1 is the register for loading the first string. 2126 TMP_REG_SRC2 is the register for loading the second string. 2127 RESULT_REG is the rtx for the result register. 2128 EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call 2129 to strcmp/strncmp if we have equality at the end of the inline comparison. 2130 P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code 2131 to clean up and generate the final comparison result. 2132 FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just 2133 set the final result. 
*/
static void
expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
                             unsigned int base_align,
                             rtx orig_src1, rtx orig_src2,
                             rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
                             bool equality_compare_rest, rtx *p_cleanup_label,
                             rtx final_move_label)
{
  const unsigned int gpr_bytes = GET_MODE_SIZE (word_mode);
  unsigned HOST_WIDE_INT offset = 0;
  rtx s1_base = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx s2_base = force_reg (Pmode, XEXP (orig_src2, 0));
  gcc_assert (p_cleanup_label != NULL);
  rtx cleanup_label = *p_cleanup_label;

  /* Each iteration handles one chunk.  For a full 8B chunk the code is
       ld/ld/cmpb/cmpb/orc./bne

     and the cleanup code placed after the whole sequence is
       cntlzd         get bit of first zero/diff byte
       subfic         convert for rldcl use
       rldcl rldcl    extract diff/zero byte
       subf           subtract for final result

     The last compare can branch around the cleanup code if the
     result is zero because the strings are exactly equal.  */
  while (bytes_to_compare > 0)
    {
      unsigned int cur_align = compute_current_alignment (base_align, offset);
      machine_mode load_mode
        = select_block_compare_mode (offset, bytes_to_compare, cur_align);
      unsigned int load_bytes = GET_MODE_SIZE (load_mode);

      /* By default compare just what is left.  On P7 and earlier, which
         can't do the overlapping load trick fast, that means a
         non-overlapping load and a shift to get rid of the extra
         bytes.  */
      unsigned HOST_WIDE_INT cmp_bytes = bytes_to_compare;
      if (bytes_to_compare >= load_bytes)
        cmp_bytes = load_bytes;
      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
        {
          /* Move this load back so it doesn't go past the end.
             P8/P9 can do this efficiently.  */
          unsigned int overshoot = load_bytes - bytes_to_compare;
          if (overshoot < offset)
            {
              offset -= overshoot;
              cmp_bytes = load_bytes;
              bytes_to_compare = cmp_bytes;
            }
        }

      /* The offset is used as a constant on big endian or when
         TARGET_AVOID_XFORM is set; otherwise it is first copied into a
         fresh register.  */
      rtx off_rtx = GEN_INT (offset);
      if (!BYTES_BIG_ENDIAN && !TARGET_AVOID_XFORM)
        {
          rtx off_reg = gen_reg_rtx (Pmode);
          emit_move_insn (off_reg, off_rtx);
          off_rtx = off_reg;
        }

      do_load_for_compare_from_addr (load_mode, tmp_reg_src1,
                                     gen_rtx_PLUS (Pmode, s1_base, off_rtx),
                                     orig_src1);
      do_load_for_compare_from_addr (load_mode, tmp_reg_src2,
                                     gen_rtx_PLUS (Pmode, s2_base, off_rtx),
                                     orig_src2);

      const unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;

      /* The data must be left-aligned, with any bytes to the right that
         are beyond the string cleared, or the cmpb sequence won't
         produce the correct results.  A single byte load needs neither
         step because a plain subtract gives the final result.  */
      if (load_bytes != 1)
        {
          if (load_bytes < gpr_bytes)
            {
              /* Rotate left so the first byte to compare is leftmost.  */
              rtx rot = GEN_INT (BITS_PER_UNIT * (gpr_bytes - load_bytes));
              do_rotl3 (tmp_reg_src1, tmp_reg_src1, rot);
              do_rotl3 (tmp_reg_src2, tmp_reg_src2, rot);
            }

          if (cmp_bytes < gpr_bytes)
            {
              /* Clear right.  Together with the rotate this can be
                 turned into a rldicr instruction.  */
              HOST_WIDE_INT clear_bits = BITS_PER_UNIT * (gpr_bytes - cmp_bytes);
              rtx keep_mask = GEN_INT (HOST_WIDE_INT_M1U << clear_bits);
              do_and3 (tmp_reg_src1, tmp_reg_src1, keep_mask);
              do_and3 (tmp_reg_src2, tmp_reg_src2, keep_mask);
            }
        }

      /* Cases to handle.  A and B are chunks of the two strings.
         1: Not end of comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, next block if not found.
         2: End of the inline comparison:
            A != B: branch to cleanup code to compute result.
            A == B: check for 0 byte, call strcmp/strncmp
         3: compared requested N bytes:
            A == B: branch to result 0.
            A != B: cleanup code to compute result.  */
      const bool more_to_do = remain > 0 || equality_compare_rest;

      rtx dst_label = final_move_label;
      if (more_to_do)
        {
          /* Branch to cleanup code, otherwise fall through to do more
             compares.  The cleanup label is created on first need.  */
          if (!cleanup_label)
            cleanup_label = gen_label_rtx ();
          dst_label = cleanup_label;
        }

      if (load_bytes != 1)
        {
          rtx zero_bytes = gen_reg_rtx (word_mode);
          rtx same_bytes = gen_reg_rtx (word_mode);
          rtx all_zero = gen_reg_rtx (word_mode);
          rtx dst_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
          rtx cc = gen_reg_rtx (CCmode);

          emit_move_insn (all_zero, GEN_INT (0));
          do_cmpb3 (same_bytes, tmp_reg_src1, tmp_reg_src2);
          do_cmpb3 (zero_bytes, tmp_reg_src1, all_zero);

          /* orc.: (~same_bytes | zero_bytes) into RESULT_REG, with the
             condition register set as well.  */
          rtx orc = gen_rtx_IOR (word_mode,
                                 gen_rtx_NOT (word_mode, same_bytes),
                                 zero_bytes);
          rs6000_emit_dot_insn (result_reg, orc, 2, cc);

          /* While more work follows, a nonzero result goes to the
             cleanup code; on the very last chunk a zero result goes
             straight to FINAL_MOVE_LABEL.  */
          rtx test = (more_to_do
                      ? gen_rtx_NE (VOIDmode, cc, const0_rtx)
                      : gen_rtx_EQ (VOIDmode, cc, const0_rtx));
          rtx branch = gen_rtx_IF_THEN_ELSE (VOIDmode, test, dst_ref, pc_rtx);
          rtx_insn *jump = emit_jump_insn (gen_rtx_SET (pc_rtx, branch));
          add_reg_br_prob_note (jump, profile_probability::unlikely ());
          JUMP_LABEL (jump) = dst_label;
          LABEL_NUSES (dst_label) += 1;
        }
      else if (equality_compare_rest)
        {
          /* Single byte, with a library call to follow.  Use subf./bne
             to branch to FINAL_MOVE_LABEL if the byte differs,
             otherwise fall through to the strncmp call.  */
          rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
          rtx cc_diff = gen_reg_rtx (CCmode);
          rtx diff = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
          rs6000_emit_dot_insn (result_reg, diff, 2, cc_diff);

          rtx differs = gen_rtx_NE (VOIDmode, cc_diff, const0_rtx);
          rtx br_diff = gen_rtx_IF_THEN_ELSE (VOIDmode, differs,
                                              fin_ref, pc_rtx);
          rtx_insn *jump_diff = emit_jump_insn (gen_rtx_SET (pc_rtx, br_diff));
          add_reg_br_prob_note (jump_diff, profile_probability::unlikely ());
          JUMP_LABEL (jump_diff) = final_move_label;
          LABEL_NUSES (final_move_label) += 1;

          /* We must not make the library call if this is the end of
             the string, so also branch to FINAL_MOVE_LABEL when the
             byte is zero.  This catches the case where the strings are
             equal and end in a zero byte at this position.  */
          rtx cc_zero = gen_reg_rtx (CCmode);
          emit_move_insn (cc_zero, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
                                                    const0_rtx));

          rtx is_zero = gen_rtx_EQ (VOIDmode, cc_zero, const0_rtx);
          rtx br_zero = gen_rtx_IF_THEN_ELSE (VOIDmode, is_zero,
                                              fin_ref, pc_rtx);
          rtx_insn *jump_zero = emit_jump_insn (gen_rtx_SET (pc_rtx, br_zero));
          add_reg_br_prob_note (jump_zero, profile_probability::unlikely ());
          JUMP_LABEL (jump_zero) = final_move_label;
          LABEL_NUSES (final_move_label) += 1;
        }
      else
        {
          /* This is the last byte to be compared so we can use subf to
             compute the final result and branch unconditionally to
             FINAL_MOVE_LABEL.  */
          do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);

          rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
          rtx jump = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
          JUMP_LABEL (jump) = final_move_label;
          LABEL_NUSES (final_move_label) += 1;
          emit_barrier ();
        }

      offset += cmp_bytes;
      bytes_to_compare -= cmp_bytes;
    }

  /* Hand back the cleanup label, which may have been created here.  */
  *p_cleanup_label = cleanup_label;
}

/* Generate the final sequence that identifies the differing
   byte and generates the final result, taking into account
   zero bytes:

   cntlzd            get bit of first zero/diff byte
   addi              convert for rldcl use
   rldcl rldcl       extract diff/zero byte
   subf              subtract for final result

   STR1 is the reg rtx for data from string 1.
   STR2 is the reg rtx for data from string 2.
   RESULT is the reg rtx for the comparison result.
*/

static void
emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
{
  machine_mode data_mode = GET_MODE (str1);

  /* Temporaries: the rotate count, then for each string the rotated
     word and the single byte extracted from it.  */
  rtx rot_count = gen_reg_rtx (data_mode);
  rtx s1_rotated = gen_reg_rtx (data_mode);
  rtx s1_byte = gen_reg_rtx (data_mode);
  rtx s2_rotated = gen_reg_rtx (data_mode);
  rtx s2_byte = gen_reg_rtx (data_mode);

  /* The same five steps are emitted for either word size: count the
     leading zeros of RESULT, add 8, rotate each string left by that
     amount, mask to the low byte, and subtract the two bytes into
     RESULT.  */
  if (data_mode == DImode)
    {
      emit_insn (gen_clzdi2 (rot_count, result));
      emit_insn (gen_adddi3 (rot_count, rot_count, GEN_INT (8)));
      emit_insn (gen_rotldi3 (s1_rotated, str1,
                              gen_lowpart (SImode, rot_count)));
      emit_insn (gen_anddi3_mask (s1_byte, s1_rotated, GEN_INT (0xff)));
      emit_insn (gen_rotldi3 (s2_rotated, str2,
                              gen_lowpart (SImode, rot_count)));
      emit_insn (gen_anddi3_mask (s2_byte, s2_rotated, GEN_INT (0xff)));
      emit_insn (gen_subdi3 (result, s1_byte, s2_byte));
    }
  else if (data_mode == SImode)
    {
      emit_insn (gen_clzsi2 (rot_count, result));
      emit_insn (gen_addsi3 (rot_count, rot_count, GEN_INT (8)));
      emit_insn (gen_rotlsi3 (s1_rotated, str1,
                              gen_lowpart (SImode, rot_count)));
      emit_insn (gen_andsi3_mask (s1_byte, s1_rotated, GEN_INT (0xff)));
      emit_insn (gen_rotlsi3 (s2_rotated, str2,
                              gen_lowpart (SImode, rot_count)));
      emit_insn (gen_andsi3_mask (s2_byte, s2_rotated, GEN_INT (0xff)));
      emit_insn (gen_subsi3 (result, s1_byte, s2_byte));
    }
  else
    /* Only SImode and DImode data are handled.  */
    gcc_unreachable ();
}

/* Expand a string compare operation with length, and return
   true if successful.  Return false if we should let the
   compiler generate normal code, probably a strncmp call.

   OPERANDS[0] is the target (result).
   OPERANDS[1] is the first source.
   OPERANDS[2] is the second source.
   If NO_LENGTH is zero, then:
   OPERANDS[3] is the length.
   OPERANDS[4] is the alignment in bytes.
   If NO_LENGTH is nonzero, then:
   OPERANDS[3] is the alignment in bytes.
*/
bool
expand_strn_compare (rtx operands[], int no_length)
{
  rtx target = operands[0];
  rtx orig_src1 = operands[1];
  rtx orig_src2 = operands[2];

  /* Without a length operand the alignment moves up one slot.  */
  rtx bytes_rtx = NULL;
  rtx align_rtx = operands[3];
  if (!no_length)
    {
      bytes_rtx = operands[3];
      align_rtx = operands[4];
    }

  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));

  /* If we have a length, it must be constant.  This simplifies things
     a bit as we don't have to generate code to check if we've exceeded
     the length.  Later this could be expanded to handle this case.  */
  if (!no_length && !CONST_INT_P (bytes_rtx))
    return false;

  /* This must be a fixed size alignment.  */
  if (!CONST_INT_P (align_rtx))
    return false;

  unsigned int base_align = UINTVAL (align_rtx);
  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;

  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
  if (targetm.slow_unaligned_access (word_mode, align1)
      || targetm.slow_unaligned_access (word_mode, align2))
    return false;

  gcc_assert (GET_MODE (target) == SImode);

  const unsigned HOST_WIDE_INT inline_limit
    = rs6000_string_compare_inline_limit;

  /* N from the strncmp args if available, else the inline limit.  */
  unsigned HOST_WIDE_INT bytes = inline_limit;
  if (!no_length)
    bytes = UINTVAL (bytes_rtx);

  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
     at least POWER8.  That way we can rely on overlapping compares to
     do the final comparison of less than 16 bytes.  Also I do not
     want to deal with making this work for 32 bits.  In addition, we
     have to make sure that we have at least P8_VECTOR (we don't allow
     P9_VECTOR without P8_VECTOR).  */
  const bool use_vec = (bytes >= 16 && !TARGET_32BIT
                        && TARGET_EFFICIENT_UNALIGNED_VSX
                        && TARGET_P8_VECTOR);

  /* Alignment below which the 4k boundary checks are generated.  */
  const unsigned int required_align = use_vec ? 16 : 8;

  /* LOAD_MODE is only read again on the vector path below.  */
  machine_mode load_mode;
  rtx tmp_reg_src1, tmp_reg_src2;
  if (use_vec)
    {
      load_mode = V16QImode;
      tmp_reg_src1 = gen_reg_rtx (V16QImode);
      tmp_reg_src2 = gen_reg_rtx (V16QImode);
    }
  else
    {
      load_mode = select_block_compare_mode (0, bytes, base_align);
      tmp_reg_src1 = gen_reg_rtx (word_mode);
      tmp_reg_src2 = gen_reg_rtx (word_mode);
    }

  /* How much to compare inline: never more than the inline limit.  */
  const unsigned HOST_WIDE_INT compare_length = MIN (bytes, inline_limit);

  /* If we have equality at the end of the last compare and we have not
     found the end of the string, we need to call strcmp/strncmp to
     compare the remainder.  That is always so without a length, and
     with a length whenever it exceeds what is compared inline.  */
  const bool equality_compare_rest = no_length || bytes > inline_limit;

  rtx result_reg = gen_reg_rtx (word_mode);
  rtx final_move_label = gen_label_rtx ();
  rtx final_label = gen_label_rtx ();

  if (base_align < required_align)
    {
      /* Generate code that checks distance to 4k boundary for this case.  */
      rtx begin_compare_label = gen_label_rtx ();
      rtx strncmp_label = gen_label_rtx ();

      /* Strncmp for power8 in glibc does this:
           rldicl r8,r3,0,52
           cmpldi cr7,r8,4096-16
           bgt cr7,L(pagecross) */

      /* Make sure that the length we use for the alignment test and
         the subsequent code generation are in agreement so we do not
         go past the length we tested for a 4k boundary crossing.  */
      unsigned HOST_WIDE_INT align_test = compare_length;
      if (align_test >= required_align)
        {
          align_test = ROUND_UP (align_test, required_align);
          base_align = required_align;
        }
      else
        {
          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
          base_align = align_test;
        }

      if (align1 < required_align)
        expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
      if (align2 < required_align)
        expand_strncmp_align_check (strncmp_label, src2_addr, align_test);

      /* Now generate the following sequence:
         - branch to begin_compare
         - strncmp_label
         - call to strncmp
         - branch to final_label
         - begin_compare_label */

      rtx begin_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
      rtx skip_call = emit_jump_insn (gen_rtx_SET (pc_rtx, begin_ref));
      JUMP_LABEL (skip_call) = begin_compare_label;
      LABEL_NUSES (begin_compare_label) += 1;
      emit_barrier ();

      emit_label (strncmp_label);

      if (!no_length)
        {
          /* -m32 -mpowerpc64 results in word_mode being DImode even
             though otherwise it is 32-bit.  The length arg to strncmp
             is a size_t which will be the same size as pointers.  */
          rtx full_len = gen_reg_rtx (Pmode);
          emit_move_insn (full_len, gen_int_mode (bytes, Pmode));

          tree strncmp_fn = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (strncmp_fn), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode,
                                   full_len, Pmode);
        }
      else
        {
          tree strcmp_fn = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (strcmp_fn), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   force_reg (Pmode, src1_addr), Pmode,
                                   force_reg (Pmode, src2_addr), Pmode);
        }

      /* The library call produced the result; skip the inline code.  */
      rtx done_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx to_done = emit_jump_insn (gen_rtx_SET (pc_rtx, done_ref));
      JUMP_LABEL (to_done) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
      emit_label (begin_compare_label);
    }

  rtx cleanup_label = NULL;
  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;

  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
     to the length specified.  */
  if (!use_vec)
    expand_strncmp_gpr_sequence (compare_length, base_align,
                                 orig_src1, orig_src2,
                                 tmp_reg_src1, tmp_reg_src2,
                                 result_reg,
                                 equality_compare_rest,
                                 &cleanup_label, final_move_label);
  else
    {
      s1addr = gen_reg_rtx (Pmode);
      s2addr = gen_reg_rtx (Pmode);
      off_reg = gen_reg_rtx (Pmode);
      vec_result = gen_reg_rtx (load_mode);
      emit_move_insn (result_reg, GEN_INT (0));
      expand_cmp_vec_sequence (compare_length,
                               orig_src1, orig_src2,
                               s1addr, s2addr, off_reg,
                               tmp_reg_src1, tmp_reg_src2,
                               vec_result,
                               equality_compare_rest,
                               &cleanup_label, final_move_label, true);
    }

  if (equality_compare_rest)
    {
      /* Update pointers past what has been compared already.  */
      rtx rest1 = force_reg (Pmode,
                             gen_rtx_PLUS (Pmode, src1_addr,
                                           GEN_INT (compare_length)));
      rtx rest2 = force_reg (Pmode,
                             gen_rtx_PLUS (Pmode, src2_addr,
                                           GEN_INT (compare_length)));

      /* Construct call to strcmp/strncmp to compare the rest of the
         string.  For strncmp the length is what remains of N.  */
      if (!no_length)
        {
          rtx rest_len = gen_reg_rtx (Pmode);
          emit_move_insn (rest_len,
                          gen_int_mode (bytes - compare_length, Pmode));
          tree strncmp_fn = builtin_decl_explicit (BUILT_IN_STRNCMP);
          emit_library_call_value (XEXP (DECL_RTL (strncmp_fn), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   rest1, Pmode, rest2, Pmode,
                                   rest_len, Pmode);
        }
      else
        {
          tree strcmp_fn = builtin_decl_explicit (BUILT_IN_STRCMP);
          emit_library_call_value (XEXP (DECL_RTL (strcmp_fn), 0),
                                   target, LCT_NORMAL, GET_MODE (target),
                                   rest1, Pmode, rest2, Pmode);
        }

      rtx done_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
      rtx to_done = emit_jump_insn (gen_rtx_SET (pc_rtx, done_ref));
      JUMP_LABEL (to_done) = final_label;
      LABEL_NUSES (final_label) += 1;
      emit_barrier ();
    }

  if (cleanup_label)
    emit_label (cleanup_label);

  /* Cleanup code: turn the data of the chunk that stopped the compare
     into the final result.  */
  if (!use_vec)
    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
  else
    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
                            s1addr, s2addr, orig_src1, orig_src2,
                            off_reg, vec_result);

  /* Copy the low SImode part of the word-sized result to TARGET.  The
     library call paths join after this move, at FINAL_LABEL.  */
  emit_label (final_move_label);
  emit_insn (gen_movsi (target,
                        gen_lowpart (SImode, result_reg)));
  emit_label (final_label);
  return true;
}

/* Generate loads and stores for a move of v4si mode using lvx/stvx.
   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
   keep combine from changing what instruction gets used.

   DEST is the destination for the data.
   SRC is the source of the data for the move.
*/ 2698 2699static rtx 2700gen_lvx_v4si_move (rtx dest, rtx src) 2701{ 2702 gcc_assert (MEM_P (dest) ^ MEM_P (src)); 2703 gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode); 2704 2705 if (MEM_P (dest)) 2706 return gen_altivec_stvx_v4si_internal (dest, src); 2707 else 2708 return gen_altivec_lvx_v4si_internal (dest, src); 2709} 2710 2711/* Expand a block move operation, and return 1 if successful. Return 0 2712 if we should let the compiler generate normal code. 2713 2714 operands[0] is the destination 2715 operands[1] is the source 2716 operands[2] is the length 2717 operands[3] is the alignment */ 2718 2719#define MAX_MOVE_REG 4 2720 2721int 2722expand_block_move (rtx operands[], bool might_overlap) 2723{ 2724 rtx orig_dest = operands[0]; 2725 rtx orig_src = operands[1]; 2726 rtx bytes_rtx = operands[2]; 2727 rtx align_rtx = operands[3]; 2728 int constp = CONST_INT_P (bytes_rtx); 2729 int align; 2730 int bytes; 2731 int offset; 2732 int move_bytes; 2733 rtx loads[MAX_MOVE_REG]; 2734 rtx stores[MAX_MOVE_REG]; 2735 int num_reg = 0; 2736 2737 /* If this is not a fixed size move, just call memcpy */ 2738 if (! constp) 2739 return 0; 2740 2741 /* This must be a fixed size alignment */ 2742 gcc_assert (CONST_INT_P (align_rtx)); 2743 align = INTVAL (align_rtx) * BITS_PER_UNIT; 2744 2745 /* Anything to move? */ 2746 bytes = INTVAL (bytes_rtx); 2747 if (bytes <= 0) 2748 return 1; 2749 2750 if (bytes > rs6000_block_move_inline_limit) 2751 return 0; 2752 2753 for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes) 2754 { 2755 union { 2756 rtx (*movmemsi) (rtx, rtx, rtx, rtx); 2757 rtx (*mov) (rtx, rtx); 2758 } gen_func; 2759 machine_mode mode = BLKmode; 2760 rtx src, dest; 2761 2762 /* Altivec first, since it will be faster than a string move 2763 when it applies, and usually not significantly larger. 
*/ 2764 if (TARGET_ALTIVEC && bytes >= 16 && align >= 128) 2765 { 2766 move_bytes = 16; 2767 mode = V4SImode; 2768 gen_func.mov = gen_lvx_v4si_move; 2769 } 2770 else if (bytes >= 8 && TARGET_POWERPC64 2771 && (align >= 64 || !STRICT_ALIGNMENT)) 2772 { 2773 move_bytes = 8; 2774 mode = DImode; 2775 gen_func.mov = gen_movdi; 2776 if (offset == 0 && align < 64) 2777 { 2778 rtx addr; 2779 2780 /* If the address form is reg+offset with offset not a 2781 multiple of four, reload into reg indirect form here 2782 rather than waiting for reload. This way we get one 2783 reload, not one per load and/or store. */ 2784 addr = XEXP (orig_dest, 0); 2785 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) 2786 && CONST_INT_P (XEXP (addr, 1)) 2787 && (INTVAL (XEXP (addr, 1)) & 3) != 0) 2788 { 2789 addr = copy_addr_to_reg (addr); 2790 orig_dest = replace_equiv_address (orig_dest, addr); 2791 } 2792 addr = XEXP (orig_src, 0); 2793 if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM) 2794 && CONST_INT_P (XEXP (addr, 1)) 2795 && (INTVAL (XEXP (addr, 1)) & 3) != 0) 2796 { 2797 addr = copy_addr_to_reg (addr); 2798 orig_src = replace_equiv_address (orig_src, addr); 2799 } 2800 } 2801 } 2802 else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT)) 2803 { /* move 4 bytes */ 2804 move_bytes = 4; 2805 mode = SImode; 2806 gen_func.mov = gen_movsi; 2807 } 2808 else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT)) 2809 { /* move 2 bytes */ 2810 move_bytes = 2; 2811 mode = HImode; 2812 gen_func.mov = gen_movhi; 2813 } 2814 else /* move 1 byte at a time */ 2815 { 2816 move_bytes = 1; 2817 mode = QImode; 2818 gen_func.mov = gen_movqi; 2819 } 2820 2821 /* Mode is always set to something other than BLKmode by one of the 2822 cases of the if statement above. 
*/ 2823 gcc_assert (mode != BLKmode); 2824 2825 src = adjust_address (orig_src, mode, offset); 2826 dest = adjust_address (orig_dest, mode, offset); 2827 2828 rtx tmp_reg = gen_reg_rtx (mode); 2829 2830 loads[num_reg] = (*gen_func.mov) (tmp_reg, src); 2831 stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg); 2832 2833 /* If we didn't succeed in doing it in one pass, we can't do it in the 2834 might_overlap case. Bail out and return failure. */ 2835 if (might_overlap && num_reg >= MAX_MOVE_REG 2836 && bytes > move_bytes) 2837 return 0; 2838 2839 /* Emit loads and stores saved up. */ 2840 if (num_reg >= MAX_MOVE_REG || bytes == move_bytes) 2841 { 2842 int i; 2843 for (i = 0; i < num_reg; i++) 2844 emit_insn (loads[i]); 2845 for (i = 0; i < num_reg; i++) 2846 emit_insn (stores[i]); 2847 num_reg = 0; 2848 } 2849 2850 } 2851 2852 return 1; 2853} 2854