1/* Subroutines used to expand string and block move, clear,
2   compare and other operations for PowerPC.
3   Copyright (C) 1991-2020 Free Software Foundation, Inc.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published
9   by the Free Software Foundation; either version 3, or (at your
10   option) any later version.
11
12   GCC is distributed in the hope that it will be useful, but WITHOUT
13   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15   License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with GCC; see the file COPYING3.  If not see
19   <http://www.gnu.org/licenses/>.  */
20
21#define IN_TARGET_CODE 1
22
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "rtl.h"
28#include "tree.h"
29#include "memmodel.h"
30#include "tm_p.h"
31#include "ira.h"
32#include "print-tree.h"
33#include "varasm.h"
34#include "explow.h"
35#include "expr.h"
36#include "output.h"
37#include "target.h"
38#include "profile-count.h"
39#include "predict.h"
40
41/* Expand a block clear operation, and return 1 if successful.  Return 0
42   if we should let the compiler generate normal code.
43
44   operands[0] is the destination
45   operands[1] is the length
46   operands[3] is the alignment */
47
48int
49expand_block_clear (rtx operands[])
50{
51  rtx orig_dest = operands[0];
52  rtx bytes_rtx	= operands[1];
53  rtx align_rtx = operands[3];
54  bool constp	= CONST_INT_P (bytes_rtx);
55  HOST_WIDE_INT align;
56  HOST_WIDE_INT bytes;
57  int offset;
58  int clear_bytes;
59  int clear_step;
60
61  /* If this is not a fixed size move, just call memcpy */
62  if (! constp)
63    return 0;
64
65  /* This must be a fixed size alignment  */
66  gcc_assert (CONST_INT_P (align_rtx));
67  align = INTVAL (align_rtx) * BITS_PER_UNIT;
68
69  /* Anything to clear? */
70  bytes = INTVAL (bytes_rtx);
71  if (bytes <= 0)
72    return 1;
73
74  /* Use the builtin memset after a point, to avoid huge code bloat.
75     When optimize_size, avoid any significant code bloat; calling
76     memset is about 4 instructions, so allow for one instruction to
77     load zero and three to do clearing.  */
78  if (TARGET_ALTIVEC && (align >= 128 || TARGET_EFFICIENT_UNALIGNED_VSX))
79    clear_step = 16;
80  else if (TARGET_POWERPC64 && (align >= 64 || !STRICT_ALIGNMENT))
81    clear_step = 8;
82  else
83    clear_step = 4;
84
85  if (optimize_size && bytes > 3 * clear_step)
86    return 0;
87  if (! optimize_size && bytes > 8 * clear_step)
88    return 0;
89
90  bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
91
92  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
93    {
94      machine_mode mode = BLKmode;
95      rtx dest;
96
97      if (TARGET_ALTIVEC
98	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
99	{
100	  clear_bytes = 16;
101	  mode = V4SImode;
102	}
103      else if (bytes >= 8 && TARGET_POWERPC64
104	       && (align >= 64 || !STRICT_ALIGNMENT))
105	{
106	  clear_bytes = 8;
107	  mode = DImode;
108	  if (offset == 0 && align < 64)
109	    {
110	      rtx addr;
111
112	      /* If the address form is reg+offset with offset not a
113		 multiple of four, reload into reg indirect form here
114		 rather than waiting for reload.  This way we get one
115		 reload, not one per store.  */
116	      addr = XEXP (orig_dest, 0);
117	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
118		  && CONST_INT_P (XEXP (addr, 1))
119		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
120		{
121		  addr = copy_addr_to_reg (addr);
122		  orig_dest = replace_equiv_address (orig_dest, addr);
123		}
124	    }
125	}
126      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
127	{			/* move 4 bytes */
128	  clear_bytes = 4;
129	  mode = SImode;
130	}
131      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
132	{			/* move 2 bytes */
133	  clear_bytes = 2;
134	  mode = HImode;
135	}
136      else /* move 1 byte at a time */
137	{
138	  clear_bytes = 1;
139	  mode = QImode;
140	}
141
142      dest = adjust_address (orig_dest, mode, offset);
143
144      emit_move_insn (dest, CONST0_RTX (mode));
145    }
146
147  return 1;
148}
149
150/* Figure out the correct instructions to generate to load data for
151   block compare.  MODE is used for the read from memory, and
152   data is zero extended if REG is wider than MODE.  If LE code
153   is being generated, bswap loads are used.
154
155   REG is the destination register to move the data into.
156   MEM is the memory block being read.
157   MODE is the mode of memory to use for the read.  */
158static void
159do_load_for_compare (rtx reg, rtx mem, machine_mode mode)
160{
161  switch (GET_MODE (reg))
162    {
163    case E_V16QImode:
164      switch (mode)
165	{
166	case E_V16QImode:
167	  if (!BYTES_BIG_ENDIAN)
168	    {
169	      if (TARGET_P9_VECTOR)
170		emit_insn (gen_vsx_ld_elemrev_v16qi_internal (reg, mem));
171	      else
172		{
173		  rtx reg_v2di = simplify_gen_subreg (V2DImode, reg,
174						      V16QImode, 0);
175		  gcc_assert (MEM_P (mem));
176		  rtx addr = XEXP (mem, 0);
177		  rtx mem_v2di = gen_rtx_MEM (V2DImode, addr);
178		  MEM_COPY_ATTRIBUTES (mem_v2di, mem);
179		  set_mem_size (mem, GET_MODE_SIZE (V2DImode));
180		  emit_insn (gen_vsx_ld_elemrev_v2di (reg_v2di, mem_v2di));
181		}
182	    }
183	  else
184	    emit_insn (gen_vsx_movv2di_64bit (reg, mem));
185	  break;
186	default:
187	  gcc_unreachable ();
188	}
189      break;
190    case E_DImode:
191      switch (mode)
192	{
193	case E_QImode:
194	  emit_insn (gen_zero_extendqidi2 (reg, mem));
195	  break;
196	case E_HImode:
197	  {
198	    rtx src = mem;
199	    if (!BYTES_BIG_ENDIAN)
200	      {
201		src = gen_reg_rtx (HImode);
202		emit_insn (gen_bswaphi2 (src, mem));
203	      }
204	    emit_insn (gen_zero_extendhidi2 (reg, src));
205	    break;
206	  }
207	case E_SImode:
208	  {
209	    rtx src = mem;
210	    if (!BYTES_BIG_ENDIAN)
211	      {
212		src = gen_reg_rtx (SImode);
213		emit_insn (gen_bswapsi2 (src, mem));
214	      }
215	    emit_insn (gen_zero_extendsidi2 (reg, src));
216	  }
217	  break;
218	case E_DImode:
219	  if (!BYTES_BIG_ENDIAN)
220	    emit_insn (gen_bswapdi2 (reg, mem));
221	  else
222	    emit_insn (gen_movdi (reg, mem));
223	  break;
224	default:
225	  gcc_unreachable ();
226	}
227      break;
228
229    case E_SImode:
230      switch (mode)
231	{
232	case E_QImode:
233	  emit_insn (gen_zero_extendqisi2 (reg, mem));
234	  break;
235	case E_HImode:
236	  {
237	    rtx src = mem;
238	    if (!BYTES_BIG_ENDIAN)
239	      {
240		src = gen_reg_rtx (HImode);
241		emit_insn (gen_bswaphi2 (src, mem));
242	      }
243	    emit_insn (gen_zero_extendhisi2 (reg, src));
244	    break;
245	  }
246	case E_SImode:
247	  if (!BYTES_BIG_ENDIAN)
248	    emit_insn (gen_bswapsi2 (reg, mem));
249	  else
250	    emit_insn (gen_movsi (reg, mem));
251	  break;
252	case E_DImode:
253	  /* DImode is larger than the destination reg so is not expected.  */
254	  gcc_unreachable ();
255	  break;
256	default:
257	  gcc_unreachable ();
258	}
259      break;
260
261    case E_QImode:
262      gcc_assert (mode == E_QImode);
263      emit_move_insn (reg, mem);
264      break;
265
266    default:
267      gcc_unreachable ();
268      break;
269    }
270}
271
272/* Select the mode to be used for reading the next chunk of bytes
273   in the compare.
274
275   OFFSET is the current read offset from the beginning of the block.
276   BYTES is the number of bytes remaining to be read.
277   ALIGN is the minimum alignment of the memory blocks being compared in bytes.  */
278static machine_mode
279select_block_compare_mode (unsigned HOST_WIDE_INT offset,
280			   unsigned HOST_WIDE_INT bytes,
281			   unsigned HOST_WIDE_INT align)
282{
283  /* First see if we can do a whole load unit
284     as that will be more efficient than a larger load + shift.  */
285
286  /* If big, use biggest chunk.
287     If exactly chunk size, use that size.
288     If remainder can be done in one piece with shifting, do that.
289     Do largest chunk possible without violating alignment rules.  */
290
291  /* The most we can read without potential page crossing.  */
292  unsigned HOST_WIDE_INT maxread = ROUND_UP (bytes, align);
293
294  /* If we have an LE target without ldbrx and word_mode is DImode,
295     then we must avoid using word_mode.  */
296  int word_mode_ok = !(!BYTES_BIG_ENDIAN && !TARGET_LDBRX
297		       && word_mode == DImode);
298
299  if (word_mode_ok && bytes >= UNITS_PER_WORD)
300    return word_mode;
301  else if (bytes == GET_MODE_SIZE (SImode))
302    return SImode;
303  else if (bytes == GET_MODE_SIZE (HImode))
304    return HImode;
305  else if (bytes == GET_MODE_SIZE (QImode))
306    return QImode;
307  else if (bytes < GET_MODE_SIZE (SImode)
308	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
309	   && offset >= GET_MODE_SIZE (SImode) - bytes)
310    /* This matches the case were we have SImode and 3 bytes
311       and offset >= 1 and permits us to move back one and overlap
312       with the previous read, thus avoiding having to shift
313       unwanted bytes off of the input.  */
314    return SImode;
315  else if (word_mode_ok && bytes < UNITS_PER_WORD
316	   && TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
317	   && offset >= UNITS_PER_WORD-bytes)
318    /* Similarly, if we can use DImode it will get matched here and
319       can do an overlapping read that ends at the end of the block.  */
320    return word_mode;
321  else if (word_mode_ok && maxread >= UNITS_PER_WORD)
322    /* It is safe to do all remaining in one load of largest size,
323       possibly with a shift to get rid of unwanted bytes.  */
324    return word_mode;
325  else if (maxread >= GET_MODE_SIZE (SImode))
326    /* It is safe to do all remaining in one SImode load,
327       possibly with a shift to get rid of unwanted bytes.  */
328    return SImode;
329  else if (bytes > GET_MODE_SIZE (SImode))
330    return SImode;
331  else if (bytes > GET_MODE_SIZE (HImode))
332    return HImode;
333
334  /* final fallback is do one byte */
335  return QImode;
336}
337
338/* Compute the alignment of pointer+OFFSET where the original alignment
339   of pointer was BASE_ALIGN.  */
340static unsigned HOST_WIDE_INT
341compute_current_alignment (unsigned HOST_WIDE_INT base_align,
342			   unsigned HOST_WIDE_INT offset)
343{
344  if (offset == 0)
345    return base_align;
346  return MIN (base_align, offset & -offset);
347}
348
349/* Prepare address and then do a load.
350
351   MODE is the mode to use for the load.
352   DEST is the destination register for the data.
353   ADDR is the address to be loaded.
354   ORIG_ADDR is the original address expression.  */
355static void
356do_load_for_compare_from_addr (machine_mode mode, rtx dest, rtx addr,
357			       rtx orig_addr)
358{
359  rtx mem = gen_rtx_MEM (mode, addr);
360  MEM_COPY_ATTRIBUTES (mem, orig_addr);
361  set_mem_size (mem, GET_MODE_SIZE (mode));
362  do_load_for_compare (dest, mem, mode);
363  return;
364}
365
366/* Do a branch for an if/else decision.
367
368   CMPMODE is the mode to use for the comparison.
369   COMPARISON is the rtx code for the compare needed.
370   A is the first thing to be compared.
371   B is the second thing to be compared.
372   CR is the condition code reg input, or NULL_RTX.
373   TRUE_LABEL is the label to branch to if the condition is true.
374   P is the estimated branch probability for the branch.
375
376   The return value is the CR used for the comparison.
377   If CR is null_rtx, then a new register of CMPMODE is generated.
378   If A and B are both null_rtx, then CR must not be null, and the
379   compare is not generated so you can use this with a dot form insn.  */
380
381static void
382do_ifelse (machine_mode cmpmode, rtx_code comparison,
383	   rtx a, rtx b, rtx cr, rtx true_label, profile_probability br_prob)
384{
385  gcc_assert ((a == NULL_RTX && b == NULL_RTX && cr != NULL_RTX)
386	      || (a != NULL_RTX && b != NULL_RTX));
387
388  if (cr != NULL_RTX)
389    gcc_assert (GET_MODE (cr) == cmpmode);
390  else
391    cr = gen_reg_rtx (cmpmode);
392
393  rtx label_ref = gen_rtx_LABEL_REF (VOIDmode, true_label);
394
395  if (a != NULL_RTX)
396    emit_move_insn (cr, gen_rtx_COMPARE (cmpmode, a, b));
397
398  rtx cmp_rtx = gen_rtx_fmt_ee (comparison, VOIDmode, cr, const0_rtx);
399
400  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx, label_ref, pc_rtx);
401  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
402  add_reg_br_prob_note (j, br_prob);
403  JUMP_LABEL (j) = true_label;
404  LABEL_NUSES (true_label) += 1;
405}
406
407/* Emit an isel of the proper mode for DEST.
408
409   DEST is the isel destination register.
410   SRC1 is the isel source if CR is true.
411   SRC2 is the isel source if CR is false.
412   CR is the condition for the isel.  */
413static void
414do_isel (rtx dest, rtx cmp, rtx src_t, rtx src_f, rtx cr)
415{
416  if (GET_MODE (dest) == DImode)
417    emit_insn (gen_isel_signed_di (dest, cmp, src_t, src_f, cr));
418  else
419    emit_insn (gen_isel_signed_si (dest, cmp, src_t, src_f, cr));
420}
421
422/* Emit a subtract of the proper mode for DEST.
423
424   DEST is the destination register for the subtract.
425   SRC1 is the first subtract input.
426   SRC2 is the second subtract input.
427
428   Computes DEST = SRC1-SRC2.  */
429static void
430do_sub3 (rtx dest, rtx src1, rtx src2)
431{
432  if (GET_MODE (dest) == DImode)
433    emit_insn (gen_subdi3 (dest, src1, src2));
434  else
435    emit_insn (gen_subsi3 (dest, src1, src2));
436}
437
438/* Emit an add of the proper mode for DEST.
439
440   DEST is the destination register for the add.
441   SRC1 is the first add input.
442   SRC2 is the second add input.
443
444   Computes DEST = SRC1+SRC2.  */
445static void
446do_add3 (rtx dest, rtx src1, rtx src2)
447{
448  if (GET_MODE (dest) == DImode)
449    emit_insn (gen_adddi3 (dest, src1, src2));
450  else
451    emit_insn (gen_addsi3 (dest, src1, src2));
452}
453
454/* Emit an and of the proper mode for DEST.
455
456   DEST is the destination register for the and.
457   SRC1 is the first and input.
458   SRC2 is the second and input.
459
460   Computes DEST = SRC1&SRC2.  */
461static void
462do_and3 (rtx dest, rtx src1, rtx src2)
463{
464  if (GET_MODE (dest) == DImode)
465    emit_insn (gen_anddi3 (dest, src1, src2));
466  else
467    emit_insn (gen_andsi3 (dest, src1, src2));
468}
469
470/* Emit an cmpb of the proper mode for DEST.
471
472   DEST is the destination register for the cmpb.
473   SRC1 is the first input.
474   SRC2 is the second input.
475
476   Computes cmpb of SRC1, SRC2.  */
477static void
478do_cmpb3 (rtx dest, rtx src1, rtx src2)
479{
480  if (GET_MODE (dest) == DImode)
481    emit_insn (gen_cmpbdi3 (dest, src1, src2));
482  else
483    emit_insn (gen_cmpbsi3 (dest, src1, src2));
484}
485
486/* Emit a rotl of the proper mode for DEST.
487
488   DEST is the destination register for the and.
489   SRC1 is the first and input.
490   SRC2 is the second and input.
491
492   Computes DEST = SRC1 rotated left by SRC2.  */
493static void
494do_rotl3 (rtx dest, rtx src1, rtx src2)
495{
496  if (GET_MODE (dest) == DImode)
497    emit_insn (gen_rotldi3 (dest, src1, src2));
498  else
499    emit_insn (gen_rotlsi3 (dest, src1, src2));
500}
501
502/* Generate rtl for a load, shift, and compare of less than a full word.
503
504   LOAD_MODE is the machine mode for the loads.
505   DIFF is the reg for the difference.
506   CMP_REM is the reg containing the remaining bytes to compare.
507   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
508   SRC1_ADDR is the first source address.
509   SRC2_ADDR is the second source address.
510   ORIG_SRC1 is the original first source block's address rtx.
511   ORIG_SRC2 is the original second source block's address rtx.  */
512static void
513do_load_mask_compare (const machine_mode load_mode, rtx diff, rtx cmp_rem, rtx dcond,
514		      rtx src1_addr, rtx src2_addr, rtx orig_src1, rtx orig_src2)
515{
516  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
517  rtx shift_amount = gen_reg_rtx (word_mode);
518  rtx d1 = gen_reg_rtx (word_mode);
519  rtx d2 = gen_reg_rtx (word_mode);
520
521  do_load_for_compare_from_addr (load_mode, d1, src1_addr, orig_src1);
522  do_load_for_compare_from_addr (load_mode, d2, src2_addr, orig_src2);
523  do_sub3 (shift_amount, GEN_INT (load_mode_size), cmp_rem);
524
525  if (word_mode == DImode)
526    {
527      emit_insn (gen_ashldi3 (shift_amount, shift_amount,
528			      GEN_INT (LOG2_BITS_PER_UNIT)));
529      emit_insn (gen_lshrdi3 (d1, d1,
530			      gen_lowpart (SImode, shift_amount)));
531      emit_insn (gen_lshrdi3 (d2, d2,
532			      gen_lowpart (SImode, shift_amount)));
533    }
534  else
535    {
536      emit_insn (gen_ashlsi3 (shift_amount, shift_amount,
537			      GEN_INT (LOG2_BITS_PER_UNIT)));
538      emit_insn (gen_lshrsi3 (d1, d1, shift_amount));
539      emit_insn (gen_lshrsi3 (d2, d2, shift_amount));
540    }
541
542  if (TARGET_P9_MISC)
543    {
544      /* Generate a compare, and convert with a setb later.  */
545      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
546      emit_insn (gen_rtx_SET (dcond, cmp));
547    }
548  else
549    {
550      if (word_mode == DImode)
551	emit_insn (gen_subfdi3_carry (diff, d2, d1));
552      else
553	emit_insn (gen_subfsi3_carry (diff, d2, d1));
554    }
555}
556
557/* Generate rtl for an overlapping load and compare of less than a
558   full load_mode.  This assumes that the previous word is part of the
559   block being compared so it's ok to back up part of a word so we can
560   compare the last unaligned full word that ends at the end of the block.
561
562   LOAD_MODE is the machine mode for the loads.
563   ISCONST tells whether the remaining length is a constant or in a register.
564   BYTES_REM is the remaining length if ISCONST is true.
565   DIFF is the reg for the difference.
566   CMP_REM is the reg containing the remaining bytes to compare if !ISCONST.
567   DCOND is the CCUNS reg for the compare if we are doing P9 code with setb.
568   SRC1_ADDR is the first source address.
569   SRC2_ADDR is the second source address.
570   ORIG_SRC1 is the original first source block's address rtx.
571   ORIG_SRC2 is the original second source block's address rtx.  */
572static void
573do_overlap_load_compare (machine_mode load_mode, bool isConst,
574			HOST_WIDE_INT bytes_rem, rtx diff,
575			rtx cmp_rem, rtx dcond, rtx src1_addr, rtx src2_addr,
576			rtx orig_src1, rtx orig_src2)
577{
578  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
579  HOST_WIDE_INT addr_adj = load_mode_size - bytes_rem;
580  rtx d1 = gen_reg_rtx (word_mode);
581  rtx d2 = gen_reg_rtx (word_mode);
582
583  rtx addr1, addr2;
584  if (!isConst || addr_adj)
585    {
586      rtx adj_reg = gen_reg_rtx (word_mode);
587      if (isConst)
588	emit_move_insn (adj_reg, GEN_INT (-addr_adj));
589      else
590	{
591	  rtx reg_lms = gen_reg_rtx (word_mode);
592	  emit_move_insn (reg_lms, GEN_INT (load_mode_size));
593	  do_sub3 (adj_reg, cmp_rem, reg_lms);
594	}
595
596      addr1 = gen_rtx_PLUS (word_mode, src1_addr, adj_reg);
597      addr2 = gen_rtx_PLUS (word_mode, src2_addr, adj_reg);
598    }
599  else
600    {
601      addr1 = src1_addr;
602      addr2 = src2_addr;
603    }
604
605  do_load_for_compare_from_addr (load_mode, d1, addr1, orig_src1);
606  do_load_for_compare_from_addr (load_mode, d2, addr2, orig_src2);
607
608  if (TARGET_P9_MISC)
609    {
610      /* Generate a compare, and convert with a setb later.  */
611      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1, d2);
612      emit_insn (gen_rtx_SET (dcond, cmp));
613    }
614  else
615    {
616      if (word_mode == DImode)
617	emit_insn (gen_subfdi3_carry (diff, d2, d1));
618      else
619	emit_insn (gen_subfsi3_carry (diff, d2, d1));
620    }
621}
622
623/* Generate the sequence of compares for strcmp/strncmp using vec/vsx
624   instructions.
625
626   BYTES_TO_COMPARE is the number of bytes to be compared.
627   ORIG_SRC1 is the unmodified rtx for the first string.
628   ORIG_SRC2 is the unmodified rtx for the second string.
629   S1ADDR is the register to use for the base address of the first string.
630   S2ADDR is the register to use for the base address of the second string.
631   OFF_REG is the register to use for the string offset for loads.
632   S1DATA is the register for loading the first string.
633   S2DATA is the register for loading the second string.
634   VEC_RESULT is the rtx for the vector result indicating the byte difference.
635   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
636   to strcmp/strncmp if we have equality at the end of the inline comparison.
637   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
638   to clean up and generate the final comparison result.
639   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
640   set the final result.
641   CHECKZERO indicates whether the sequence should check for zero bytes
642   for use doing strncmp, or not (for use doing memcmp).  */
643static void
644expand_cmp_vec_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
645			 rtx orig_src1, rtx orig_src2,
646			 rtx s1addr, rtx s2addr, rtx off_reg,
647			 rtx s1data, rtx s2data, rtx vec_result,
648			 bool equality_compare_rest, rtx *p_cleanup_label,
649			 rtx final_move_label, bool checkzero)
650{
651  machine_mode load_mode;
652  unsigned int load_mode_size;
653  unsigned HOST_WIDE_INT cmp_bytes = 0;
654  unsigned HOST_WIDE_INT offset = 0;
655  rtx zero_reg = NULL;
656
657  gcc_assert (p_cleanup_label != NULL);
658  rtx cleanup_label = *p_cleanup_label;
659
660  emit_move_insn (s1addr, force_reg (Pmode, XEXP (orig_src1, 0)));
661  emit_move_insn (s2addr, force_reg (Pmode, XEXP (orig_src2, 0)));
662
663  if (checkzero && !TARGET_P9_VECTOR)
664    {
665      zero_reg = gen_reg_rtx (V16QImode);
666      emit_move_insn (zero_reg, CONST0_RTX (V16QImode));
667    }
668
669  while (bytes_to_compare > 0)
670    {
671      /* VEC/VSX compare sequence for P8:
672	 check each 16B with:
673	 lxvd2x 32,28,8
674	 lxvd2x 33,29,8
675	 vcmpequb 2,0,1  # compare strings
676	 vcmpequb 4,0,3  # compare w/ 0
677	 xxlorc 37,36,34       # first FF byte is either mismatch or end of string
678	 vcmpequb. 7,5,3  # reg 7 contains 0
679	 bnl 6,.Lmismatch
680
681	 For the P8 LE case, we use lxvd2x and compare full 16 bytes
682	 but then use vgbbd and a shift to get two bytes with the
683	 information we need in the correct order.
684
685	 VEC/VSX compare sequence if TARGET_P9_VECTOR:
686	 lxvb16x/lxvb16x     # load 16B of each string
687	 vcmpnezb.           # produces difference location or zero byte location
688	 bne 6,.Lmismatch
689
690	 Use the overlapping compare trick for the last block if it is
691	 less than 16 bytes.
692      */
693
694      load_mode = V16QImode;
695      load_mode_size = GET_MODE_SIZE (load_mode);
696
697      if (bytes_to_compare >= load_mode_size)
698	cmp_bytes = load_mode_size;
699      else
700	{
701	  /* Move this load back so it doesn't go past the end.  P8/P9
702	     can do this efficiently.  This is never called with less
703	     than 16 bytes so we should always be able to do this.  */
704	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
705	  cmp_bytes = bytes_to_compare;
706	  gcc_assert (offset > extra_bytes);
707	  offset -= extra_bytes;
708	  cmp_bytes = load_mode_size;
709	  bytes_to_compare = cmp_bytes;
710	}
711
712      /* The offset currently used is always kept in off_reg so that the
713	 cleanup code on P8 can use it to extract the differing byte.  */
714      emit_move_insn (off_reg, GEN_INT (offset));
715
716      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
717      do_load_for_compare_from_addr (load_mode, s1data, addr1, orig_src1);
718      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
719      do_load_for_compare_from_addr (load_mode, s2data, addr2, orig_src2);
720
721      /* Cases to handle.  A and B are chunks of the two strings.
722	 1: Not end of comparison:
723	 A != B: branch to cleanup code to compute result.
724	 A == B: next block
725	 2: End of the inline comparison:
726	 A != B: branch to cleanup code to compute result.
727	 A == B: call strcmp/strncmp
728	 3: compared requested N bytes:
729	 A == B: branch to result 0.
730	 A != B: cleanup code to compute result.  */
731
732      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
733
734      if (checkzero)
735	{
736	  if (TARGET_P9_VECTOR)
737	    emit_insn (gen_vcmpnezb_p (vec_result, s1data, s2data));
738	  else
739	    {
740	      /* Emit instructions to do comparison and zero check.  */
741	      rtx cmp_res = gen_reg_rtx (load_mode);
742	      rtx cmp_zero = gen_reg_rtx (load_mode);
743	      rtx cmp_combined = gen_reg_rtx (load_mode);
744	      emit_insn (gen_altivec_eqv16qi (cmp_res, s1data, s2data));
745	      emit_insn (gen_altivec_eqv16qi (cmp_zero, s1data, zero_reg));
746	      emit_insn (gen_orcv16qi3 (vec_result, cmp_zero, cmp_res));
747	      emit_insn (gen_altivec_vcmpequb_p (cmp_combined, vec_result, zero_reg));
748	    }
749	}
750      else
751	emit_insn (gen_altivec_vcmpequb_p (vec_result, s1data, s2data));
752
753      bool branch_to_cleanup = (remain > 0 || equality_compare_rest);
754      rtx cr6 = gen_rtx_REG (CCmode, CR6_REGNO);
755      rtx dst_label;
756      rtx cmp_rtx;
757      if (branch_to_cleanup)
758	{
759	  /* Branch to cleanup code, otherwise fall through to do more
760	     compares.  P8 and P9 use different CR bits because on P8
761	     we are looking at the result of a comparsion vs a
762	     register of zeroes so the all-true condition means no
763	     difference or zero was found.  On P9, vcmpnezb sets a byte
764	     to 0xff if there is a mismatch or zero, so the all-false
765	     condition indicates we found no difference or zero.  */
766	  if (!cleanup_label)
767	    cleanup_label = gen_label_rtx ();
768	  dst_label = cleanup_label;
769	  if (TARGET_P9_VECTOR && checkzero)
770	    cmp_rtx = gen_rtx_NE (VOIDmode, cr6, const0_rtx);
771	  else
772	    cmp_rtx = gen_rtx_GE (VOIDmode, cr6, const0_rtx);
773	}
774      else
775	{
776	  /* Branch to final return or fall through to cleanup,
777	     result is already set to 0.  */
778	  dst_label = final_move_label;
779	  if (TARGET_P9_VECTOR && checkzero)
780	    cmp_rtx = gen_rtx_EQ (VOIDmode, cr6, const0_rtx);
781	  else
782	    cmp_rtx = gen_rtx_LT (VOIDmode, cr6, const0_rtx);
783	}
784
785      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
786      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
787					 lab_ref, pc_rtx);
788      rtx_insn *j2 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
789      add_reg_br_prob_note (j2, profile_probability::likely ());
790      JUMP_LABEL (j2) = dst_label;
791      LABEL_NUSES (dst_label) += 1;
792
793      offset += cmp_bytes;
794      bytes_to_compare -= cmp_bytes;
795    }
796  *p_cleanup_label = cleanup_label;
797  return;
798}
799
800/* Generate the final sequence that identifies the differing
801   byte and generates the final result, taking into account
802   zero bytes:
803
804   P8:
805        vgbbd 0,0
806        vsldoi 0,0,0,9
807        mfvsrd 9,32
808        addi 10,9,-1    # count trailing zero bits
809        andc 9,10,9
810        popcntd 9,9
811        lbzx 10,28,9    # use that offset to load differing byte
812        lbzx 3,29,9
813        subf 3,3,10     # subtract for final result
814
815   P9:
816	 vclzlsbb            # counts trailing bytes with lsb=0
817	 vextublx            # extract differing byte
818
819   STR1 is the reg rtx for data from string 1.
820   STR2 is the reg rtx for data from string 2.
821   RESULT is the reg rtx for the comparison result.
822   S1ADDR is the register to use for the base address of the first string.
823   S2ADDR is the register to use for the base address of the second string.
824   ORIG_SRC1 is the unmodified rtx for the first string.
825   ORIG_SRC2 is the unmodified rtx for the second string.
826   OFF_REG is the register to use for the string offset for loads.
827   VEC_RESULT is the rtx for the vector result indicating the byte difference.  */
828
829static void
830emit_final_compare_vec (rtx str1, rtx str2, rtx result,
831			rtx s1addr, rtx s2addr,
832			rtx orig_src1, rtx orig_src2,
833			rtx off_reg, rtx vec_result)
834{
835
836  if (TARGET_P9_VECTOR)
837    {
838      rtx diffix = gen_reg_rtx (SImode);
839      rtx chr1 = gen_reg_rtx (SImode);
840      rtx chr2 = gen_reg_rtx (SImode);
841      rtx chr1_di = simplify_gen_subreg (DImode, chr1, SImode, 0);
842      rtx chr2_di = simplify_gen_subreg (DImode, chr2, SImode, 0);
843      emit_insn (gen_vclzlsbb_v16qi (diffix, vec_result));
844      emit_insn (gen_vextublx (chr1, diffix, str1));
845      emit_insn (gen_vextublx (chr2, diffix, str2));
846      do_sub3 (result, chr1_di, chr2_di);
847    }
848  else
849    {
850      gcc_assert (TARGET_P8_VECTOR);
851      rtx diffix = gen_reg_rtx (DImode);
852      rtx result_gbbd = gen_reg_rtx (V16QImode);
853      /* Since each byte of the input is either 00 or FF, the bytes in
854	 dw0 and dw1 after vgbbd are all identical to each other.  */
855      emit_insn (gen_p8v_vgbbd (result_gbbd, vec_result));
856      /* For LE, we shift by 9 and get BA in the low two bytes then CTZ.
857	 For BE, we shift by 7 and get AB in the high two bytes then CLZ.  */
858      rtx result_shifted = gen_reg_rtx (V16QImode);
859      int shift_amt = (BYTES_BIG_ENDIAN) ? 7 : 9;
860      emit_insn (gen_altivec_vsldoi_v16qi (result_shifted, result_gbbd,
861					   result_gbbd, GEN_INT (shift_amt)));
862
863      rtx diffix_df = simplify_gen_subreg (DFmode, diffix, DImode, 0);
864      emit_insn (gen_p8_mfvsrd_3_v16qi (diffix_df, result_shifted));
865      rtx count = gen_reg_rtx (DImode);
866
867      if (BYTES_BIG_ENDIAN)
868	emit_insn (gen_clzdi2 (count, diffix));
869      else
870	emit_insn (gen_ctzdi2 (count, diffix));
871
872      /* P8 doesn't have a good solution for extracting one byte from
873	 a vsx reg like vextublx on P9 so we just compute the offset
874	 of the differing byte and load it from each string.  */
875      do_add3 (off_reg, off_reg, count);
876
877      rtx chr1 = gen_reg_rtx (QImode);
878      rtx chr2 = gen_reg_rtx (QImode);
879      rtx addr1 = gen_rtx_PLUS (Pmode, s1addr, off_reg);
880      do_load_for_compare_from_addr (QImode, chr1, addr1, orig_src1);
881      rtx addr2 = gen_rtx_PLUS (Pmode, s2addr, off_reg);
882      do_load_for_compare_from_addr (QImode, chr2, addr2, orig_src2);
883      machine_mode rmode = GET_MODE (result);
884      rtx chr1_rm = simplify_gen_subreg (rmode, chr1, QImode, 0);
885      rtx chr2_rm = simplify_gen_subreg (rmode, chr2, QImode, 0);
886      do_sub3 (result, chr1_rm, chr2_rm);
887    }
888
889  return;
890}
891
892/* Expand a block compare operation using loop code, and return true
893   if successful.  Return false if we should let the compiler generate
894   normal code, probably a memcmp call.
895
896   OPERANDS[0] is the target (result).
897   OPERANDS[1] is the first source.
898   OPERANDS[2] is the second source.
899   OPERANDS[3] is the length.
900   OPERANDS[4] is the alignment.  */
901bool
902expand_compare_loop (rtx operands[])
903{
904  rtx target = operands[0];
905  rtx orig_src1 = operands[1];
906  rtx orig_src2 = operands[2];
907  rtx bytes_rtx = operands[3];
908  rtx align_rtx = operands[4];
909
910  /* This case is complicated to handle because the subtract
911     with carry instructions do not generate the 64-bit
912     carry and so we must emit code to calculate it ourselves.
913     We choose not to implement this yet.  */
914  if (TARGET_32BIT && TARGET_POWERPC64)
915    return false;
916
917  /* Allow non-const length.  */
918  int bytes_is_const = CONST_INT_P (bytes_rtx);
919
920  /* This must be a fixed size alignment.  */
921  if (!CONST_INT_P (align_rtx))
922    return false;
923
924  HOST_WIDE_INT align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
925  HOST_WIDE_INT align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
926  HOST_WIDE_INT minalign = MIN (align1, align2);
927
928  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
929
930  gcc_assert (GET_MODE (target) == SImode);
931
932  /* Anything to move?	*/
933  HOST_WIDE_INT bytes = 0;
934  if (bytes_is_const)
935    bytes = INTVAL (bytes_rtx);
936
937  if (bytes_is_const && bytes == 0)
938    return true;
939
940  /* Limit the amount we compare, if known statically.  */
941  HOST_WIDE_INT max_bytes;
942  switch (rs6000_tune)
943    {
944    case PROCESSOR_POWER7:
945      if (!bytes_is_const)
946	if (minalign < 8)
947	  max_bytes = 0;
948	else
949	  max_bytes = 128;
950      else
951	if (minalign < 8)
952	  max_bytes = 32;
953	else
954	  max_bytes = 128;
955      break;
956    case PROCESSOR_POWER8:
957      if (!bytes_is_const)
958	max_bytes = 0;
959      else
960	if (minalign < 8)
961	  max_bytes = 128;
962	else
963	  max_bytes = 64;
964      break;
965    case PROCESSOR_POWER9:
966    case PROCESSOR_POWER10:
967      if (bytes_is_const)
968	max_bytes = 191;
969      else
970	max_bytes = 0;
971      break;
972    default:
973      max_bytes = 128;
974    }
975
976  /* Allow the option to override the default.  */
977  if (rs6000_block_compare_inline_loop_limit >= 0)
978    max_bytes = (unsigned HOST_WIDE_INT) rs6000_block_compare_inline_loop_limit;
979
980  if (max_bytes == 0)
981    return false;
982
983  rtx cmp_rem = gen_reg_rtx (word_mode);  /* Remainder for library call.  */
984  rtx loop_cmp = gen_reg_rtx (word_mode); /* Actual amount compared by loop.  */
985  HOST_WIDE_INT niter;
986  rtx iter = gen_reg_rtx (word_mode);
987  rtx iv1 = gen_reg_rtx (word_mode);
988  rtx iv2 = gen_reg_rtx (word_mode);
989  rtx d1_1 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv1 */
990  rtx d1_2 = gen_reg_rtx (word_mode);  /* Addr expression src1+iv2 */
991  rtx d2_1 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv1 */
992  rtx d2_2 = gen_reg_rtx (word_mode);  /* Addr expression src2+iv2 */
993
994  /* Strip unneeded subreg from length if there is one.  */
995  if (SUBREG_P (bytes_rtx) && subreg_lowpart_p (bytes_rtx))
996    bytes_rtx = SUBREG_REG (bytes_rtx);
997  /* Extend bytes_rtx to word_mode if needed.  But, we expect only to
998   maybe have to deal with the case were bytes_rtx is SImode and
999   word_mode is DImode.  */
1000  if (!bytes_is_const)
1001    {
1002      if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) > GET_MODE_SIZE (word_mode))
1003	/* Do not expect length longer than word_mode.  */
1004	return false;
1005      else if (GET_MODE_SIZE (GET_MODE (bytes_rtx)) < GET_MODE_SIZE (word_mode))
1006	{
1007	  bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
1008	  bytes_rtx = force_reg (word_mode,
1009				 gen_rtx_fmt_e (ZERO_EXTEND, word_mode,
1010						bytes_rtx));
1011	}
1012      else
1013	/* Make sure it's in a register before we get started.  */
1014	bytes_rtx = force_reg (GET_MODE (bytes_rtx), bytes_rtx);
1015    }
1016
1017  machine_mode load_mode = word_mode;
1018  HOST_WIDE_INT load_mode_size = GET_MODE_SIZE (load_mode);
1019
1020  /* Number of bytes per iteration of the unrolled loop.  */
1021  HOST_WIDE_INT loop_bytes = 2 * load_mode_size;
1022  /* max iters and bytes compared in the loop.  */
1023  HOST_WIDE_INT max_loop_iter = max_bytes / loop_bytes;
1024  HOST_WIDE_INT max_loop_bytes = max_loop_iter * loop_bytes;
1025  int l2lb = floor_log2 (loop_bytes);
1026
1027  if (bytes_is_const && (max_bytes < load_mode_size
1028			 || !IN_RANGE (bytes, load_mode_size, max_bytes)))
1029    return false;
1030
1031  bool no_remainder_code = false;
1032  rtx final_label = gen_label_rtx ();
1033  rtx final_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1034  rtx diff_label = gen_label_rtx ();
1035  rtx library_call_label = NULL;
1036  rtx cleanup_label = gen_label_rtx ();
1037
1038  rtx cr;
1039
1040  rtx src1_addr = copy_addr_to_reg (XEXP (orig_src1, 0));
1041  rtx src2_addr = copy_addr_to_reg (XEXP (orig_src2, 0));
1042
1043  /* Difference found is stored here before jump to diff_label.  */
1044  rtx diff = gen_reg_rtx (word_mode);
1045  rtx_insn *j;
1046
1047  /* Example of generated code for 35 bytes aligned 1 byte.
1048
1049	     mtctr 8
1050	     li 6,0
1051	     li 5,8
1052     .L13:
1053	     ldbrx 7,3,6
1054	     ldbrx 9,10,6
1055	     ldbrx 0,3,5
1056	     ldbrx 4,10,5
1057	     addi 6,6,16
1058	     addi 5,5,16
1059	     subfc. 9,9,7
1060	     bne 0,.L10
1061	     subfc. 9,4,0
1062	     bdnzt 2,.L13
1063	     bne 0,.L10
1064	     add 3,3,6
1065	     add 10,10,6
1066	     addi 9,3,-5
1067	     ldbrx 7,0,9
1068	     addi 9,10,-5
1069	     ldbrx 9,0,9
1070	     subfc 9,9,7
1071	     .p2align 4,,15
1072     .L10:
1073	     popcntd 9,9
1074	     subfe 10,10,10
1075	     or 9,9,10
1076
1077     Compiled with -fno-reorder-blocks for clarity.  */
1078
1079  /* Structure of what we're going to do:
1080     Two separate lengths: what we will compare before bailing to library
1081	call (max_bytes), and the total length to be checked.
1082     if length <= 16, branch to linear cleanup code starting with
1083	remainder length check (length not known at compile time)
1084     set up 2 iv's and load count reg, compute remainder length
1085     unrollx2 compare loop
1086     if loop exit due to a difference, branch to difference handling code
1087     if remainder length < 8, branch to final cleanup compare
1088     load and compare 8B
1089     final cleanup comparison (depends on alignment and length)
1090	load 8B, shift off bytes past length, compare
1091	load 8B ending at last byte and compare
1092	load/compare 1 byte at a time (short block abutting 4k boundary)
1093     difference handling, 64->32 conversion
1094     final result
1095     branch around memcmp call
1096     memcmp library call
1097  */
1098
1099  /* If bytes is not const, compare length and branch directly
1100     to the cleanup code that can handle 0-16 bytes if length
1101     is >= 16.  Stash away bytes-max_bytes for the library call.  */
1102  if (bytes_is_const)
1103    {
1104      /* These need to be set for some of the places we may jump to.  */
1105      if (bytes > max_bytes)
1106	{
1107	  no_remainder_code = true;
1108	  niter = max_loop_iter;
1109	  library_call_label = gen_label_rtx ();
1110	}
1111      else
1112	{
1113	  niter = bytes / loop_bytes;
1114	}
1115      emit_move_insn (iter, GEN_INT (niter));
1116      emit_move_insn (loop_cmp, GEN_INT (niter * loop_bytes));
1117      emit_move_insn (cmp_rem, GEN_INT (bytes - niter * loop_bytes));
1118    }
1119  else
1120    {
1121      library_call_label = gen_label_rtx ();
1122
1123      /* If we go to the cleanup code, it expects length to be in cmp_rem.  */
1124      emit_move_insn (cmp_rem, bytes_rtx);
1125
1126      /* Check for > max_bytes bytes.  We want to bail out as quickly as
1127	 possible if we have to go over to memcmp.  */
1128      do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (max_bytes),
1129		 NULL_RTX, library_call_label, profile_probability::even ());
1130
1131      /* Check for < loop_bytes bytes.  */
1132      do_ifelse (CCmode, LT, bytes_rtx, GEN_INT (loop_bytes),
1133		 NULL_RTX, cleanup_label, profile_probability::even ());
1134
1135      /* Loop compare bytes and iterations if bytes>max_bytes.  */
1136      rtx mb_reg = gen_reg_rtx (word_mode);
1137      emit_move_insn (mb_reg, GEN_INT (max_loop_bytes));
1138      rtx mi_reg = gen_reg_rtx (word_mode);
1139      emit_move_insn (mi_reg, GEN_INT (max_loop_iter));
1140
1141      /* Compute number of loop iterations if bytes <= max_bytes.  */
1142      if (word_mode == DImode)
1143	emit_insn (gen_lshrdi3 (iter, bytes_rtx, GEN_INT (l2lb)));
1144      else
1145	emit_insn (gen_lshrsi3 (iter, bytes_rtx, GEN_INT (l2lb)));
1146
1147      /* Compute bytes to compare in loop if bytes <= max_bytes.  */
1148      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << l2lb);
1149      if (word_mode == DImode)
1150	{
1151	  emit_insn (gen_anddi3 (loop_cmp, bytes_rtx, mask));
1152	}
1153      else
1154	{
1155	  emit_insn (gen_andsi3 (loop_cmp, bytes_rtx, mask));
1156	}
1157
1158      /* Check for bytes <= max_bytes.  */
1159      if (TARGET_ISEL)
1160	{
1161	  /* P9 has fast isel so we use one compare and two isel.  */
1162	  cr = gen_reg_rtx (CCmode);
1163	  rtx compare_rtx = gen_rtx_COMPARE (CCmode, bytes_rtx,
1164					     GEN_INT (max_bytes));
1165	  emit_move_insn (cr, compare_rtx);
1166	  rtx cmp_rtx = gen_rtx_LE (VOIDmode, cr, const0_rtx);
1167	  do_isel (loop_cmp, cmp_rtx, loop_cmp, mb_reg, cr);
1168	  do_isel (iter, cmp_rtx, iter, mi_reg, cr);
1169	}
1170      else
1171	{
1172	  rtx lab_after = gen_label_rtx ();
1173	  do_ifelse (CCmode, LE, bytes_rtx, GEN_INT (max_bytes),
1174		     NULL_RTX, lab_after, profile_probability::even ());
1175	  emit_move_insn (loop_cmp, mb_reg);
1176	  emit_move_insn (iter, mi_reg);
1177	  emit_label (lab_after);
1178	}
1179
1180      /* Now compute remainder bytes which isn't used until after the loop.  */
1181      do_sub3 (cmp_rem, bytes_rtx, loop_cmp);
1182    }
1183
1184  rtx dcond = NULL_RTX; /* Used for when we jump to diff_label.  */
1185  /* For p9 we need to have just one of these as multiple places define
1186     it and it gets used by the setb at the end.  */
1187  if (TARGET_P9_MISC)
1188    dcond = gen_reg_rtx (CCUNSmode);
1189
1190  if (!bytes_is_const || bytes >= loop_bytes)
1191    {
1192      /* It should not be possible to come here if remaining bytes is
1193	 < 16 in the runtime case either.  Compute number of loop
1194	 iterations.  We compare 2*word_mode per iteration so 16B for
1195	 64-bit code and 8B for 32-bit.  Set up two induction
1196	 variables and load count register.  */
1197
1198      /* HACK ALERT: create hard reg for CTR here.  If we just use a
1199	 pseudo, cse will get rid of it and then the allocator will
1200	 see it used in the lshr above and won't give us ctr.  */
1201      rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1202      emit_move_insn (ctr, iter);
1203      emit_move_insn (diff, GEN_INT (0));
1204      emit_move_insn (iv1, GEN_INT (0));
1205      emit_move_insn (iv2, GEN_INT (load_mode_size));
1206
1207      /* inner loop to compare 2*word_mode */
1208      rtx loop_top_label = gen_label_rtx ();
1209      emit_label (loop_top_label);
1210
1211      rtx src1_ix1 = gen_rtx_PLUS (word_mode, src1_addr, iv1);
1212      rtx src2_ix1 = gen_rtx_PLUS (word_mode, src2_addr, iv1);
1213
1214      do_load_for_compare_from_addr (load_mode, d1_1,
1215				     src1_ix1, orig_src1);
1216      do_load_for_compare_from_addr (load_mode, d2_1,
1217				     src2_ix1, orig_src2);
1218      do_add3 (iv1, iv1, GEN_INT (loop_bytes));
1219
1220      rtx src1_ix2 = gen_rtx_PLUS (word_mode, src1_addr, iv2);
1221      rtx src2_ix2 = gen_rtx_PLUS (word_mode, src2_addr, iv2);
1222
1223      do_load_for_compare_from_addr (load_mode, d1_2,
1224				     src1_ix2, orig_src1);
1225      do_load_for_compare_from_addr (load_mode, d2_2,
1226				     src2_ix2, orig_src2);
1227      do_add3 (iv2, iv2, GEN_INT (loop_bytes));
1228
1229      if (TARGET_P9_MISC)
1230	{
1231	  /* Generate a compare, and convert with a setb later.  */
1232	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1233	  emit_insn (gen_rtx_SET (dcond, cmp));
1234	}
1235      else
1236	{
1237	  dcond = gen_reg_rtx (CCmode);
1238	  if (word_mode == DImode)
1239	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1240	  else
1241	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1242	}
1243
1244      do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1245		 dcond, diff_label, profile_probability::unlikely ());
1246
1247      if (TARGET_P9_MISC)
1248	{
1249	  /* Generate a compare, and convert with a setb later.  */
1250	  rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_2, d2_2);
1251	  emit_insn (gen_rtx_SET (dcond, cmp));
1252	}
1253      else
1254	{
1255	  dcond = gen_reg_rtx (CCmode);
1256	  if (word_mode == DImode)
1257	    emit_insn (gen_subfdi3_carry_dot2 (diff, d2_2, d1_2, dcond));
1258	  else
1259	    emit_insn (gen_subfsi3_carry_dot2 (diff, d2_2, d1_2, dcond));
1260	}
1261
1262      rtx eqrtx = gen_rtx_EQ (VOIDmode, d1_2, d2_2);
1263      if (TARGET_64BIT)
1264	j = emit_jump_insn (gen_bdnztf_di (loop_top_label, ctr, ctr,
1265					   eqrtx, dcond));
1266      else
1267	j = emit_jump_insn (gen_bdnztf_si (loop_top_label, ctr, ctr,
1268					   eqrtx, dcond));
1269      add_reg_br_prob_note (j, profile_probability::likely ());
1270      JUMP_LABEL (j) = loop_top_label;
1271      LABEL_NUSES (loop_top_label) += 1;
1272    }
1273
1274  HOST_WIDE_INT bytes_remaining = 0;
1275  if (bytes_is_const)
1276    bytes_remaining = (bytes % loop_bytes);
1277
1278  /* If diff is nonzero, branch to difference handling
1279     code.  If we exit here with a nonzero diff, it is
1280     because the second word differed.  */
1281  if (TARGET_P9_MISC)
1282    do_ifelse (CCUNSmode, NE, NULL_RTX, NULL_RTX, dcond,
1283	       diff_label, profile_probability::unlikely ());
1284  else
1285    do_ifelse (CCmode, NE, diff, const0_rtx, NULL_RTX,
1286	       diff_label, profile_probability::unlikely ());
1287
1288  if (library_call_label != NULL && bytes_is_const && bytes > max_bytes)
1289    {
1290      /* If the length is known at compile time, then we will always
1291	 have a remainder to go to the library call with.  */
1292      rtx library_call_ref = gen_rtx_LABEL_REF (VOIDmode, library_call_label);
1293      j = emit_jump_insn (gen_rtx_SET (pc_rtx, library_call_ref));
1294      JUMP_LABEL (j) = library_call_label;
1295      LABEL_NUSES (library_call_label) += 1;
1296      emit_barrier ();
1297    }
1298
1299  if (bytes_is_const && bytes_remaining == 0)
1300    {
1301      /* No remainder and if we are here then diff is 0 so just return 0 */
1302      if (TARGET_64BIT)
1303	emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1304      else
1305	emit_move_insn (target, diff);
1306      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1307      JUMP_LABEL (j) = final_label;
1308      LABEL_NUSES (final_label) += 1;
1309      emit_barrier ();
1310    }
1311  else if (!no_remainder_code)
1312    {
1313      /* Update addresses to point to the next word to examine.  */
1314      do_add3 (src1_addr, src1_addr, iv1);
1315      do_add3 (src2_addr, src2_addr, iv1);
1316
1317      emit_label (cleanup_label);
1318
1319      if (!bytes_is_const)
1320	{
1321	  /* If we're dealing with runtime length, we have to check if
1322	     it's zero after the loop.  When length is known at compile
1323	     time the no-remainder condition is dealt with above.  By
1324	     doing this after cleanup_label, we also deal with the
1325	     case where length is 0 at the start and we bypass the
1326	     loop with a branch to cleanup_label.  */
1327	  emit_move_insn (target, const0_rtx);
1328	  do_ifelse (CCmode, EQ, cmp_rem, const0_rtx,
1329		     NULL_RTX, final_label, profile_probability::unlikely ());
1330	}
1331
1332      rtx final_cleanup = gen_label_rtx ();
1333      rtx cmp_rem_before = gen_reg_rtx (word_mode);
1334      /* Compare one more word_mode chunk if needed.  */
1335      if (!bytes_is_const || bytes_remaining >= load_mode_size)
1336	{
1337	  /* If remainder length < word length, branch to final
1338	     cleanup compare.  */
1339
1340	  if (!bytes_is_const)
1341	    {
1342	      do_ifelse (CCmode, LT, cmp_rem, GEN_INT (load_mode_size),
1343			 NULL_RTX, final_cleanup, profile_probability::even ());
1344	    }
1345
1346	  /* load and compare 8B */
1347	  do_load_for_compare_from_addr (load_mode, d1_1,
1348					 src1_addr, orig_src1);
1349	  do_load_for_compare_from_addr (load_mode, d2_1,
1350					 src2_addr, orig_src2);
1351
1352	  /* Compare the word, see if we need to do the last partial.  */
1353	  if (TARGET_P9_MISC)
1354	    {
1355	      /* Generate a compare, and convert with a setb later.  */
1356	      rtx cmp = gen_rtx_COMPARE (CCUNSmode, d1_1, d2_1);
1357	      emit_insn (gen_rtx_SET (dcond, cmp));
1358	    }
1359	  else
1360	    {
1361	      dcond = gen_reg_rtx (CCmode);
1362	      if (word_mode == DImode)
1363		emit_insn (gen_subfdi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1364	      else
1365		emit_insn (gen_subfsi3_carry_dot2 (diff, d2_1, d1_1, dcond));
1366	    }
1367
1368	  do_ifelse (GET_MODE (dcond), NE, NULL_RTX, NULL_RTX,
1369		     dcond, diff_label, profile_probability::even ());
1370
1371	  do_add3 (src1_addr, src1_addr, GEN_INT (load_mode_size));
1372	  do_add3 (src2_addr, src2_addr, GEN_INT (load_mode_size));
1373	  emit_move_insn (cmp_rem_before, cmp_rem);
1374	  do_add3 (cmp_rem, cmp_rem, GEN_INT (-load_mode_size));
1375	  if (bytes_is_const)
1376	    bytes_remaining -= load_mode_size;
1377	  else
1378	    /* See if remaining length is now zero.  We previously set
1379	       target to 0 so we can just jump to the end.  */
1380	    do_ifelse (CCmode, EQ, cmp_rem, const0_rtx, NULL_RTX,
1381		       final_label, profile_probability::unlikely ());
1382	}
1383
1384      /* Cases:
1385	 bytes_is_const
1386	   We can always shift back to do an overlapping compare
1387	   of the last chunk because we know length >= 8.
1388
1389	 !bytes_is_const
1390	   align>=load_mode_size
1391	     Read word_mode and mask
1392	   align<load_mode_size
1393	     avoid stepping past end
1394
1395	  Three strategies:
1396	  * decrement address and do overlapping compare
1397	  * read word_mode and mask
1398	  * carefully avoid crossing 4k boundary
1399       */
1400
1401      if ((!bytes_is_const || (bytes_is_const && bytes_remaining && isP7))
1402	  && align1 >= load_mode_size && align2 >= load_mode_size)
1403	{
1404	  /* Alignment is larger than word_mode so we do not need to be
1405	     concerned with extra page crossings.  But, we do not know
1406	     that the length is larger than load_mode_size so we might
1407	     end up compareing against data before the block if we try
1408	     an overlapping compare.  Also we use this on P7 for fixed length
1409	     remainder because P7 doesn't like overlapping unaligned.
1410	     Strategy: load 8B, shift off bytes past length, and compare.  */
1411	  emit_label (final_cleanup);
1412	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1413				src1_addr, src2_addr, orig_src1, orig_src2);
1414	}
1415      else if (bytes_remaining && bytes_is_const)
1416	{
1417	  /* We do not do loop expand if length < 32 so we know at the
1418	     end we can do an overlapping compare.
1419	     Strategy: shift address back and do word_mode load that
1420	     ends at the end of the block.  */
1421	  emit_label (final_cleanup);
1422	  do_overlap_load_compare (load_mode, true, bytes_remaining, diff,
1423				   cmp_rem, dcond, src1_addr, src2_addr,
1424				   orig_src1, orig_src2);
1425	}
1426      else if (!bytes_is_const)
1427	{
1428	  rtx handle4k_label = gen_label_rtx ();
1429	  rtx nonconst_overlap = gen_label_rtx ();
1430	  emit_label (nonconst_overlap);
1431
1432	  /* Here we have to handle the case where whe have runtime
1433	     length which may be too short for overlap compare, and
1434	     alignment is not at least load_mode_size so we have to
1435	     tread carefully to avoid stepping across 4k boundaries.  */
1436
1437	  /* If the length after the loop was larger than word_mode
1438	     size, we can just do an overlapping compare and we're
1439	     done.  We fall through to this code from the word_mode
1440	     compare that preceeds this.  */
1441	  do_overlap_load_compare (load_mode, false, 0, diff,
1442				   cmp_rem, dcond, src1_addr, src2_addr,
1443				   orig_src1, orig_src2);
1444
1445	  rtx diff_ref = gen_rtx_LABEL_REF (VOIDmode, diff_label);
1446	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1447	  JUMP_LABEL (j) = diff_label;
1448	  LABEL_NUSES (diff_label) += 1;
1449	  emit_barrier ();
1450
1451	  /* If we couldn't do the overlap compare we have to be more
1452	     careful of the 4k boundary.  Test to see if either
1453	     address is less than word_mode_size away from a 4k
1454	     boundary.  If not, then we can do a load/shift/compare
1455	     and we are done.  We come to this code if length was less
1456	     than word_mode_size.  */
1457
1458	  emit_label (final_cleanup);
1459
1460	  /* We can still avoid the slow case if the length was larger
1461	     than one loop iteration, in which case go do the overlap
1462	     load compare path.  */
1463	  do_ifelse (CCmode, GT, bytes_rtx, GEN_INT (loop_bytes),
1464		     NULL_RTX, nonconst_overlap, profile_probability::even ());
1465
1466	  rtx rem4k = gen_reg_rtx (word_mode);
1467	  rtx dist1 = gen_reg_rtx (word_mode);
1468	  rtx dist2 = gen_reg_rtx (word_mode);
1469	  do_sub3 (rem4k, GEN_INT (4096), cmp_rem);
1470	  if (word_mode == SImode)
1471	    emit_insn (gen_andsi3 (dist1, src1_addr, GEN_INT (0xfff)));
1472	  else
1473	    emit_insn (gen_anddi3 (dist1, src1_addr, GEN_INT (0xfff)));
1474	  do_ifelse (CCmode, LE, dist1, rem4k, NULL_RTX,
1475		     handle4k_label, profile_probability::very_unlikely ());
1476	  if (word_mode == SImode)
1477	    emit_insn (gen_andsi3 (dist2, src2_addr, GEN_INT (0xfff)));
1478	  else
1479	    emit_insn (gen_anddi3 (dist2, src2_addr, GEN_INT (0xfff)));
1480	  do_ifelse (CCmode, LE, dist2, rem4k, NULL_RTX,
1481		     handle4k_label, profile_probability::very_unlikely ());
1482
1483	  /* We don't have a 4k boundary to deal with, so do
1484	     a load/shift/compare and jump to diff.  */
1485
1486	  do_load_mask_compare (load_mode, diff, cmp_rem, dcond,
1487				src1_addr, src2_addr, orig_src1, orig_src2);
1488
1489	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, diff_ref));
1490	  JUMP_LABEL (j) = diff_label;
1491	  LABEL_NUSES (diff_label) += 1;
1492	  emit_barrier ();
1493
1494	  /* Finally in the unlikely case we are inching up to a
1495	     4k boundary we use a compact lbzx/compare loop to do
1496	     it a byte at a time.  */
1497
1498	  emit_label (handle4k_label);
1499
1500	  rtx ctr = gen_rtx_REG (Pmode, CTR_REGNO);
1501	  emit_move_insn (ctr, cmp_rem);
1502	  rtx ixreg = gen_reg_rtx (Pmode);
1503	  emit_move_insn (ixreg, const0_rtx);
1504
1505	  rtx src1_ix = gen_rtx_PLUS (word_mode, src1_addr, ixreg);
1506	  rtx src2_ix = gen_rtx_PLUS (word_mode, src2_addr, ixreg);
1507	  rtx d1 = gen_reg_rtx (word_mode);
1508	  rtx d2 = gen_reg_rtx (word_mode);
1509
1510	  rtx fc_loop = gen_label_rtx ();
1511	  emit_label (fc_loop);
1512
1513	  do_load_for_compare_from_addr (QImode, d1, src1_ix, orig_src1);
1514	  do_load_for_compare_from_addr (QImode, d2, src2_ix, orig_src2);
1515
1516	  do_add3 (ixreg, ixreg, const1_rtx);
1517
1518	  rtx cond = gen_reg_rtx (CCmode);
1519	  rtx subexpr = gen_rtx_MINUS (word_mode, d1, d2);
1520	  rs6000_emit_dot_insn (diff, subexpr, 2, cond);
1521
1522	  rtx eqrtx = gen_rtx_EQ (VOIDmode, d1, d2);
1523	  if (TARGET_64BIT)
1524	    j = emit_jump_insn (gen_bdnztf_di (fc_loop, ctr, ctr,
1525					       eqrtx, cond));
1526	  else
1527	    j = emit_jump_insn (gen_bdnztf_si (fc_loop, ctr, ctr,
1528					       eqrtx, cond));
1529	  add_reg_br_prob_note (j, profile_probability::likely ());
1530	  JUMP_LABEL (j) = fc_loop;
1531	  LABEL_NUSES (fc_loop) += 1;
1532
1533	  if (TARGET_64BIT)
1534	    emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1535	  else
1536	    emit_move_insn (target, diff);
1537
1538	  /* Since we are comparing bytes, the difference can be used
1539	     as the final result and we are done here.  */
1540	  j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1541	  JUMP_LABEL (j) = final_label;
1542	  LABEL_NUSES (final_label) += 1;
1543	  emit_barrier ();
1544	}
1545    }
1546
1547  emit_label (diff_label);
1548  /* difference handling, 64->32 conversion */
1549
1550  /* We need to produce DI result from sub, then convert to target SI
1551     while maintaining <0 / ==0 / >0 properties.  This sequence works:
1552     subfc L,A,B
1553     subfe H,H,H
1554     popcntd L,L
1555     rldimi L,H,6,0
1556
1557     This is an alternate one Segher cooked up if somebody
1558     wants to expand this for something that doesn't have popcntd:
1559     subfc L,a,b
1560     subfe H,x,x
1561     addic t,L,-1
1562     subfe v,t,L
1563     or z,v,H
1564
1565     And finally, p9 can just do this:
1566     cmpld A,B
1567     setb r */
1568
1569  if (TARGET_P9_MISC)
1570    emit_insn (gen_setb_unsigned (target, dcond));
1571  else
1572    {
1573      if (TARGET_64BIT)
1574	{
1575	  rtx tmp_reg_ca = gen_reg_rtx (DImode);
1576	  emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1577	  emit_insn (gen_popcntddi2 (diff, diff));
1578	  emit_insn (gen_iordi3 (diff, diff, tmp_reg_ca));
1579	  emit_insn (gen_movsi (target, gen_lowpart (SImode, diff)));
1580	}
1581      else
1582	{
1583	  rtx tmp_reg_ca = gen_reg_rtx (SImode);
1584	  emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1585	  emit_insn (gen_popcntdsi2 (diff, diff));
1586	  emit_insn (gen_iorsi3 (target, diff, tmp_reg_ca));
1587	}
1588    }
1589
1590  if (library_call_label != NULL)
1591    {
1592      /* Branch around memcmp call.  */
1593      j = emit_jump_insn (gen_rtx_SET (pc_rtx, final_ref));
1594      JUMP_LABEL (j) = final_label;
1595      LABEL_NUSES (final_label) += 1;
1596      emit_barrier ();
1597
1598      /* Make memcmp library call.  cmp_rem is the remaining bytes that
1599	 were compared and cmp_rem is the expected amount to be compared
1600	 by memcmp.  If we don't find a difference in the loop compare, do
1601	 the library call directly instead of doing a small compare just
1602	 to get to an arbitrary boundary before calling it anyway.
1603	 Also, update addresses to point to the next word to examine.  */
1604      emit_label (library_call_label);
1605
1606      rtx len_rtx = gen_reg_rtx (word_mode);
1607      if (bytes_is_const)
1608	{
1609	  emit_move_insn (len_rtx, cmp_rem);
1610	  do_add3 (src1_addr, src1_addr, iv1);
1611	  do_add3 (src2_addr, src2_addr, iv1);
1612	}
1613      else
1614	emit_move_insn (len_rtx, bytes_rtx);
1615
1616      tree fun = builtin_decl_explicit (BUILT_IN_MEMCMP);
1617      emit_library_call_value (XEXP (DECL_RTL (fun), 0),
1618			       target, LCT_NORMAL, GET_MODE (target),
1619			       src1_addr, Pmode,
1620			       src2_addr, Pmode,
1621			       len_rtx, GET_MODE (len_rtx));
1622    }
1623
1624  /* emit final_label */
1625  emit_label (final_label);
1626  return true;
1627}
1628
1629/* Generate code to convert a DImode-plus-carry subtract result into
1630   a SImode result that has the same <0 / ==0 / >0 properties to
1631   produce the final result from memcmp.
1632
1633   TARGET is the rtx for the register to receive the memcmp result.
1634   SUB_RESULT is the rtx for the register contining the subtract result.  */
1635
1636void
1637generate_6432_conversion(rtx target, rtx sub_result)
1638{
1639  /* We need to produce DI result from sub, then convert to target SI
1640     while maintaining <0 / ==0 / >0 properties.  This sequence works:
1641     subfc L,A,B
1642     subfe H,H,H
1643     popcntd L,L
1644     rldimi L,H,6,0
1645
1646     This is an alternate one Segher cooked up if somebody
1647     wants to expand this for something that doesn't have popcntd:
1648     subfc L,a,b
1649     subfe H,x,x
1650     addic t,L,-1
1651     subfe v,t,L
1652     or z,v,H
1653
1654     And finally, p9 can just do this:
1655     cmpld A,B
1656     setb r */
1657
1658  if (TARGET_64BIT)
1659    {
1660      rtx tmp_reg_ca = gen_reg_rtx (DImode);
1661      emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
1662      rtx popcnt = gen_reg_rtx (DImode);
1663      emit_insn (gen_popcntddi2 (popcnt, sub_result));
1664      rtx tmp2 = gen_reg_rtx (DImode);
1665      emit_insn (gen_iordi3 (tmp2, popcnt, tmp_reg_ca));
1666      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp2)));
1667    }
1668  else
1669    {
1670      rtx tmp_reg_ca = gen_reg_rtx (SImode);
1671      emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
1672      rtx popcnt = gen_reg_rtx (SImode);
1673      emit_insn (gen_popcntdsi2 (popcnt, sub_result));
1674      emit_insn (gen_iorsi3 (target, popcnt, tmp_reg_ca));
1675    }
1676}
1677
1678/* Generate memcmp expansion using in-line non-loop GPR instructions.
1679   The bool return indicates whether code for a 64->32 conversion
1680   should be generated.
1681
1682   BYTES is the number of bytes to be compared.
1683   BASE_ALIGN is the minimum alignment for both blocks to compare.
1684   ORIG_SRC1 is the original pointer to the first block to compare.
1685   ORIG_SRC2 is the original pointer to the second block to compare.
1686   SUB_RESULT is the reg rtx for the result from the final subtract.
1687   COND is rtx for a condition register that will be used for the final
1688   compare on power9 or better.
1689   FINAL_RESULT is the reg rtx for the final memcmp result.
1690   P_CONVERT_LABEL is a pointer to rtx that will be used to store the
1691   label generated for a branch to the 64->32 code, if such a branch
1692   is needed.
1693   P_FINAL_LABEL is a pointer to rtx that will be used to store the label
1694   for the end of the memcmp if a branch there is needed.
1695*/
1696
1697bool
1698expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align,
1699			 rtx orig_src1, rtx orig_src2,
1700			 rtx sub_result, rtx cond, rtx final_result,
1701			 rtx *p_convert_label, rtx *p_final_label)
1702{
1703  /* Example of generated code for 18 bytes aligned 1 byte.
1704     Compiled with -fno-reorder-blocks for clarity.
1705             ldbrx 10,31,8
1706             ldbrx 9,7,8
1707             subfc. 9,9,10
1708             bne 0,.L6487
1709             addi 9,12,8
1710             addi 5,11,8
1711             ldbrx 10,0,9
1712             ldbrx 9,0,5
1713             subfc. 9,9,10
1714             bne 0,.L6487
1715             addi 9,12,16
1716             lhbrx 10,0,9
1717             addi 9,11,16
1718             lhbrx 9,0,9
1719             subf 9,9,10
1720             b .L6488
1721             .p2align 4,,15
1722     .L6487: #convert_label
1723             popcntd 9,9
1724             subfe 10,10,10
1725             or 9,9,10
1726     .L6488: #final_label
1727             extsw 10,9
1728
1729     We start off with DImode for two blocks that jump to the DI->SI conversion
1730     if the difference is found there, then a final block of HImode that skips
1731     the DI->SI conversion.  */
1732
1733  unsigned HOST_WIDE_INT offset = 0;
1734  unsigned int load_mode_size;
1735  HOST_WIDE_INT cmp_bytes = 0;
1736  rtx src1 = orig_src1;
1737  rtx src2 = orig_src2;
1738  rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
1739  rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
1740  bool need_6432_conv = false;
1741  rtx convert_label = NULL;
1742  rtx final_label = NULL;
1743  machine_mode load_mode;
1744
1745  while (bytes > 0)
1746    {
1747      unsigned int align = compute_current_alignment (base_align, offset);
1748      load_mode = select_block_compare_mode (offset, bytes, align);
1749      load_mode_size = GET_MODE_SIZE (load_mode);
1750      if (bytes >= load_mode_size)
1751	cmp_bytes = load_mode_size;
1752      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
1753	{
1754	  /* Move this load back so it doesn't go past the end.
1755	     P8/P9 can do this efficiently.  */
1756	  unsigned int extra_bytes = load_mode_size - bytes;
1757	  cmp_bytes = bytes;
1758	  if (extra_bytes < offset)
1759	    {
1760	      offset -= extra_bytes;
1761	      cmp_bytes = load_mode_size;
1762	      bytes = cmp_bytes;
1763	    }
1764	}
1765      else
1766	/* P7 and earlier can't do the overlapping load trick fast,
1767	   so this forces a non-overlapping load and a shift to get
1768	   rid of the extra bytes.  */
1769	cmp_bytes = bytes;
1770
1771      src1 = adjust_address (orig_src1, load_mode, offset);
1772      src2 = adjust_address (orig_src2, load_mode, offset);
1773
1774      if (!REG_P (XEXP (src1, 0)))
1775	{
1776	  rtx src1_reg = copy_addr_to_reg (XEXP (src1, 0));
1777	  src1 = replace_equiv_address (src1, src1_reg);
1778	}
1779      set_mem_size (src1, load_mode_size);
1780
1781      if (!REG_P (XEXP (src2, 0)))
1782	{
1783	  rtx src2_reg = copy_addr_to_reg (XEXP (src2, 0));
1784	  src2 = replace_equiv_address (src2, src2_reg);
1785	}
1786      set_mem_size (src2, load_mode_size);
1787
1788      do_load_for_compare (tmp_reg_src1, src1, load_mode);
1789      do_load_for_compare (tmp_reg_src2, src2, load_mode);
1790
1791      if (cmp_bytes < load_mode_size)
1792	{
1793	  /* Shift unneeded bytes off.  */
1794	  rtx sh = GEN_INT (BITS_PER_UNIT * (load_mode_size - cmp_bytes));
1795	  if (word_mode == DImode)
1796	    {
1797	      emit_insn (gen_lshrdi3 (tmp_reg_src1, tmp_reg_src1, sh));
1798	      emit_insn (gen_lshrdi3 (tmp_reg_src2, tmp_reg_src2, sh));
1799	    }
1800	  else
1801	    {
1802	      emit_insn (gen_lshrsi3 (tmp_reg_src1, tmp_reg_src1, sh));
1803	      emit_insn (gen_lshrsi3 (tmp_reg_src2, tmp_reg_src2, sh));
1804	    }
1805	}
1806
1807      int remain = bytes - cmp_bytes;
1808      if (GET_MODE_SIZE (GET_MODE (final_result)) > GET_MODE_SIZE (load_mode))
1809	{
1810	  /* Final_result is larger than load size so we don't need to
1811	     reduce result size.  */
1812
1813	  /* We previously did a block that need 64->32 conversion but
1814	     the current block does not, so a label is needed to jump
1815	     to the end.  */
1816	  if (need_6432_conv && !final_label)
1817	    final_label = gen_label_rtx ();
1818
1819	  if (remain > 0)
1820	    {
1821	      /* This is not the last block, branch to the end if the result
1822		 of this subtract is not zero.  */
1823	      if (!final_label)
1824		final_label = gen_label_rtx ();
1825	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1826	      rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
1827	      rtx cr = gen_reg_rtx (CCmode);
1828	      rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
1829	      emit_insn (gen_movsi (final_result,
1830				    gen_lowpart (SImode, tmp_reg_src2)));
1831	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
1832	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1833						 fin_ref, pc_rtx);
1834	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1835	      add_reg_br_prob_note (j, profile_probability::unlikely ());
1836	      JUMP_LABEL (j) = final_label;
1837	      LABEL_NUSES (final_label) += 1;
1838	    }
1839	  else
1840	    {
1841	      if (word_mode == DImode)
1842		{
1843		  emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
1844					 tmp_reg_src2));
1845		  emit_insn (gen_movsi (final_result,
1846					gen_lowpart (SImode, tmp_reg_src2)));
1847		}
1848	      else
1849		emit_insn (gen_subsi3 (final_result, tmp_reg_src1, tmp_reg_src2));
1850
1851	      if (final_label)
1852		{
1853		  rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
1854		  rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
1855		  JUMP_LABEL (j) = final_label;
1856		  LABEL_NUSES (final_label) += 1;
1857		  emit_barrier ();
1858		}
1859	    }
1860	}
1861      else
1862	{
1863	  /* Do we need a 64->32 conversion block? We need the 64->32
1864	     conversion even if final_result size == load_mode size because
1865	     the subtract generates one extra bit.  */
1866	  need_6432_conv = true;
1867
1868	  if (remain > 0)
1869	    {
1870	      if (!convert_label)
1871		convert_label = gen_label_rtx ();
1872
1873	      /* Compare to zero and branch to convert_label if not zero.  */
1874	      rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
1875	      if (TARGET_P9_MISC)
1876		{
1877		/* Generate a compare, and convert with a setb later.
1878		   Use cond that is passed in because the caller needs
1879		   to use it for the 64->32 conversion later.  */
1880		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1881					     tmp_reg_src2);
1882		  emit_insn (gen_rtx_SET (cond, cmp));
1883		}
1884	      else
1885		{
1886		  /* Generate a subfc. and use the longer sequence for
1887		     conversion.  Cond is not used outside this
1888		     function in this case.  */
1889		  cond = gen_reg_rtx (CCmode);
1890		  if (TARGET_64BIT)
1891		    emit_insn (gen_subfdi3_carry_dot2 (sub_result, tmp_reg_src2,
1892						       tmp_reg_src1, cond));
1893		  else
1894		    emit_insn (gen_subfsi3_carry_dot2 (sub_result, tmp_reg_src2,
1895						       tmp_reg_src1, cond));
1896		}
1897
1898	      rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
1899	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
1900						 cvt_ref, pc_rtx);
1901	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
1902	      add_reg_br_prob_note (j, profile_probability::likely ());
1903	      JUMP_LABEL (j) = convert_label;
1904	      LABEL_NUSES (convert_label) += 1;
1905	    }
1906	  else
1907	    {
1908	      /* Just do the subtract/compare.  Since this is the last block
1909		 the convert code will be generated immediately following.  */
1910	      if (TARGET_P9_MISC)
1911		{
1912		  rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
1913					     tmp_reg_src2);
1914		  emit_insn (gen_rtx_SET (cond, cmp));
1915		}
1916	      else
1917		if (TARGET_64BIT)
1918		  emit_insn (gen_subfdi3_carry (sub_result, tmp_reg_src2,
1919						tmp_reg_src1));
1920		else
1921		  emit_insn (gen_subfsi3_carry (sub_result, tmp_reg_src2,
1922						tmp_reg_src1));
1923	    }
1924	}
1925
1926      offset += cmp_bytes;
1927      bytes -= cmp_bytes;
1928    }
1929
1930  if (convert_label)
1931    *p_convert_label = convert_label;
1932  if (final_label)
1933    *p_final_label = final_label;
1934  return need_6432_conv;
1935}
1936
1937/* Expand a block compare operation, and return true if successful.
1938   Return false if we should let the compiler generate normal code,
1939   probably a memcmp call.
1940
1941   OPERANDS[0] is the target (result).
1942   OPERANDS[1] is the first source.
1943   OPERANDS[2] is the second source.
1944   OPERANDS[3] is the length.
1945   OPERANDS[4] is the alignment.  */
1946bool
1947expand_block_compare (rtx operands[])
1948{
1949  rtx target = operands[0];
1950  rtx orig_src1 = operands[1];
1951  rtx orig_src2 = operands[2];
1952  rtx bytes_rtx = operands[3];
1953  rtx align_rtx = operands[4];
1954
1955  /* This case is complicated to handle because the subtract
1956     with carry instructions do not generate the 64-bit
1957     carry and so we must emit code to calculate it ourselves.
1958     We choose not to implement this yet.  */
1959  if (TARGET_32BIT && TARGET_POWERPC64)
1960    return false;
1961
1962  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
1963
1964  /* Allow this param to shut off all expansion.  */
1965  if (rs6000_block_compare_inline_limit == 0)
1966    return false;
1967
1968  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
1969     However slow_unaligned_access returns true on P7 even though the
1970     performance of this code is good there.  */
1971  if (!isP7
1972      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
1973	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
1974    return false;
1975
1976  /* Unaligned l*brx traps on P7 so don't do this.  However this should
1977     not affect much because LE isn't really supported on P7 anyway.  */
1978  if (isP7 && !BYTES_BIG_ENDIAN)
1979    return false;
1980
1981  /* If this is not a fixed size compare, try generating loop code and
1982     if that fails just call memcmp.  */
1983  if (!CONST_INT_P (bytes_rtx))
1984    return expand_compare_loop (operands);
1985
1986  /* This must be a fixed size alignment.  */
1987  if (!CONST_INT_P (align_rtx))
1988    return false;
1989
1990  unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
1991
1992  gcc_assert (GET_MODE (target) == SImode);
1993
1994  /* Anything to move?  */
1995  unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
1996  if (bytes == 0)
1997    return true;
1998
1999  /* P7/P8 code uses cond for subfc. but P9 uses
2000     it for cmpld which needs CCUNSmode.  */
2001  rtx cond = NULL;
2002  if (TARGET_P9_MISC)
2003    cond = gen_reg_rtx (CCUNSmode);
2004
2005  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
2006     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2007     at least POWER8.  That way we can rely on overlapping compares to
2008     do the final comparison of less than 16 bytes.  Also I do not
2009     want to deal with making this work for 32 bits.  In addition, we
2010     have to make sure that we have at least P8_VECTOR (we don't allow
2011     P9_VECTOR without P8_VECTOR).  */
2012  int use_vec = (bytes >= 33 && !TARGET_32BIT
2013		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2014
2015  /* We don't want to generate too much code.  The loop code can take
2016     over for lengths greater than 31 bytes.  */
2017  unsigned HOST_WIDE_INT max_bytes = rs6000_block_compare_inline_limit;
2018
2019  /* Don't generate too much code if vsx was disabled.  */
2020  if (!use_vec && max_bytes > 1)
2021    max_bytes = ((max_bytes + 1) / 2) - 1;
2022
2023  if (!IN_RANGE (bytes, 1, max_bytes))
2024    return expand_compare_loop (operands);
2025
2026  /* The code generated for p7 and older is not faster than glibc
2027     memcmp if alignment is small and length is not short, so bail
2028     out to avoid those conditions.  */
2029  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
2030      && ((base_align == 1 && bytes > 16)
2031	  || (base_align == 2 && bytes > 32)))
2032    return false;
2033
2034  rtx final_label = NULL;
2035
2036  if (use_vec)
2037    {
2038      rtx final_move_label = gen_label_rtx ();
2039      rtx s1addr = gen_reg_rtx (Pmode);
2040      rtx s2addr = gen_reg_rtx (Pmode);
2041      rtx off_reg = gen_reg_rtx (Pmode);
2042      rtx cleanup_label = NULL;
2043      rtx vec_result = gen_reg_rtx (V16QImode);
2044      rtx s1data = gen_reg_rtx (V16QImode);
2045      rtx s2data = gen_reg_rtx (V16QImode);
2046      rtx result_reg = gen_reg_rtx (word_mode);
2047      emit_move_insn (result_reg, GEN_INT (0));
2048
2049      expand_cmp_vec_sequence (bytes, orig_src1, orig_src2,
2050			       s1addr, s2addr, off_reg, s1data, s2data,
2051			       vec_result, false,
2052			       &cleanup_label, final_move_label, false);
2053
2054      if (cleanup_label)
2055	emit_label (cleanup_label);
2056
2057      emit_insn (gen_one_cmplv16qi2 (vec_result, vec_result));
2058
2059      emit_final_compare_vec (s1data, s2data, result_reg,
2060			      s1addr, s2addr, orig_src1, orig_src2,
2061			      off_reg, vec_result);
2062
2063      emit_label (final_move_label);
2064      emit_insn (gen_movsi (target,
2065			    gen_lowpart (SImode, result_reg)));
2066    }
2067  else
2068    { /* generate GPR code */
2069
2070      rtx convert_label = NULL;
2071      rtx sub_result = gen_reg_rtx (word_mode);
2072      bool need_6432_conversion =
2073	expand_block_compare_gpr(bytes, base_align,
2074				 orig_src1, orig_src2,
2075				 sub_result, cond, target,
2076				 &convert_label, &final_label);
2077
2078      if (need_6432_conversion)
2079	{
2080	  if (convert_label)
2081	    emit_label (convert_label);
2082	  if (TARGET_P9_MISC)
2083	    emit_insn (gen_setb_unsigned (target, cond));
2084	  else
2085	    generate_6432_conversion(target, sub_result);
2086	}
2087    }
2088
2089  if (final_label)
2090    emit_label (final_label);
2091
2092  return true;
2093}
2094
2095/* Generate page crossing check and branch code to set up for
2096   strncmp when we don't have DI alignment.
2097   STRNCMP_LABEL is the label to branch if there is a page crossing.
2098   SRC_ADDR is the string address to be examined.
2099   BYTES is the max number of bytes to compare.  */
2100static void
2101expand_strncmp_align_check (rtx strncmp_label, rtx src_addr, HOST_WIDE_INT bytes)
2102{
2103  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, strncmp_label);
2104  rtx src_pgoff = gen_reg_rtx (GET_MODE (src_addr));
2105  do_and3 (src_pgoff, src_addr, GEN_INT (0xfff));
2106  rtx cond = gen_reg_rtx (CCmode);
2107  emit_move_insn (cond, gen_rtx_COMPARE (CCmode, src_pgoff,
2108					 GEN_INT (4096 - bytes)));
2109
2110  rtx cmp_rtx = gen_rtx_GE (VOIDmode, cond, const0_rtx);
2111
2112  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2113				     lab_ref, pc_rtx);
2114  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2115  add_reg_br_prob_note (j, profile_probability::unlikely ());
2116  JUMP_LABEL (j) = strncmp_label;
2117  LABEL_NUSES (strncmp_label) += 1;
2118}
2119
2120/* Generate the sequence of compares for strcmp/strncmp using gpr instructions.
2121   BYTES_TO_COMPARE is the number of bytes to be compared.
2122   BASE_ALIGN is the smaller of the alignment of the two strings.
2123   ORIG_SRC1 is the unmodified rtx for the first string.
2124   ORIG_SRC2 is the unmodified rtx for the second string.
2125   TMP_REG_SRC1 is the register for loading the first string.
2126   TMP_REG_SRC2 is the register for loading the second string.
2127   RESULT_REG is the rtx for the result register.
2128   EQUALITY_COMPARE_REST is a flag to indicate we need to make a cleanup call
2129   to strcmp/strncmp if we have equality at the end of the inline comparison.
2130   P_CLEANUP_LABEL is a pointer to rtx for a label we generate if we need code
2131   to clean up and generate the final comparison result.
2132   FINAL_MOVE_LABEL is rtx for a label we can branch to when we can just
2133   set the final result.  */
2134static void
2135expand_strncmp_gpr_sequence (unsigned HOST_WIDE_INT bytes_to_compare,
2136			     unsigned int base_align,
2137			     rtx orig_src1, rtx orig_src2,
2138			     rtx tmp_reg_src1, rtx tmp_reg_src2, rtx result_reg,
2139			     bool equality_compare_rest, rtx *p_cleanup_label,
2140			     rtx final_move_label)
2141{
2142  unsigned int word_mode_size = GET_MODE_SIZE (word_mode);
2143  machine_mode load_mode;
2144  unsigned int load_mode_size;
2145  unsigned HOST_WIDE_INT cmp_bytes = 0;
2146  unsigned HOST_WIDE_INT offset = 0;
2147  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2148  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
2149  gcc_assert (p_cleanup_label != NULL);
2150  rtx cleanup_label = *p_cleanup_label;
2151
2152  while (bytes_to_compare > 0)
2153    {
2154      /* GPR compare sequence:
2155         check each 8B with: ld/ld/cmpb/cmpb/orc./bne
2156
2157         cleanup code at end:
2158         cntlzd        get bit of first zero/diff byte
2159         subfic        convert for rldcl use
2160         rldcl rldcl   extract diff/zero byte
2161         subf          subtract for final result
2162
2163         The last compare can branch around the cleanup code if the
2164         result is zero because the strings are exactly equal.  */
2165
2166      unsigned int align = compute_current_alignment (base_align, offset);
2167      load_mode = select_block_compare_mode (offset, bytes_to_compare, align);
2168      load_mode_size = GET_MODE_SIZE (load_mode);
2169      if (bytes_to_compare >= load_mode_size)
2170	cmp_bytes = load_mode_size;
2171      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
2172	{
2173	  /* Move this load back so it doesn't go past the end.
2174	     P8/P9 can do this efficiently.  */
2175	  unsigned int extra_bytes = load_mode_size - bytes_to_compare;
2176	  cmp_bytes = bytes_to_compare;
2177	  if (extra_bytes < offset)
2178	    {
2179	      offset -= extra_bytes;
2180	      cmp_bytes = load_mode_size;
2181	      bytes_to_compare = cmp_bytes;
2182	    }
2183	}
2184      else
2185	/* P7 and earlier can't do the overlapping load trick fast,
2186	   so this forces a non-overlapping load and a shift to get
2187	   rid of the extra bytes.  */
2188	cmp_bytes = bytes_to_compare;
2189
2190      rtx offset_rtx;
2191      if (BYTES_BIG_ENDIAN || TARGET_AVOID_XFORM)
2192	offset_rtx = GEN_INT (offset);
2193      else
2194	{
2195	  offset_rtx = gen_reg_rtx (Pmode);
2196	  emit_move_insn (offset_rtx, GEN_INT (offset));
2197	}
2198      rtx addr1 = gen_rtx_PLUS (Pmode, src1_addr, offset_rtx);
2199      rtx addr2 = gen_rtx_PLUS (Pmode, src2_addr, offset_rtx);
2200
2201      do_load_for_compare_from_addr (load_mode, tmp_reg_src1, addr1, orig_src1);
2202      do_load_for_compare_from_addr (load_mode, tmp_reg_src2, addr2, orig_src2);
2203
2204      /* We must always left-align the data we read, and
2205	 clear any bytes to the right that are beyond the string.
2206	 Otherwise the cmpb sequence won't produce the correct
2207	 results.  However if there is only one byte left, we
2208	 can just subtract to get the final result so the shifts
2209	 and clears are not needed.  */
2210
2211      unsigned HOST_WIDE_INT remain = bytes_to_compare - cmp_bytes;
2212
2213      /* Loading just a single byte is a special case.  If we are
2214	 loading more than that, we have to check whether we are
2215	 looking at the entire chunk of data.  If not, rotate left and
2216	 clear right so that bytes we aren't supposed to look at are
2217	 zeroed, and the first byte we are supposed to compare is
2218	 leftmost.  */
2219      if (load_mode_size != 1)
2220	{
2221	  if (load_mode_size < word_mode_size)
2222	    {
2223	      /* Rotate left first.  */
2224	      rtx sh = GEN_INT (BITS_PER_UNIT
2225				* (word_mode_size - load_mode_size));
2226	      do_rotl3 (tmp_reg_src1, tmp_reg_src1, sh);
2227	      do_rotl3 (tmp_reg_src2, tmp_reg_src2, sh);
2228	    }
2229
2230	  if (cmp_bytes < word_mode_size)
2231	    {
2232	      /* Now clear right.  This plus the rotate can be
2233		 turned into a rldicr instruction.  */
2234	      HOST_WIDE_INT mb = BITS_PER_UNIT * (word_mode_size - cmp_bytes);
2235	      rtx mask = GEN_INT (HOST_WIDE_INT_M1U << mb);
2236	      do_and3 (tmp_reg_src1, tmp_reg_src1, mask);
2237	      do_and3 (tmp_reg_src2, tmp_reg_src2, mask);
2238	    }
2239	}
2240
2241      /* Cases to handle.  A and B are chunks of the two strings.
2242	 1: Not end of comparison:
2243	 A != B: branch to cleanup code to compute result.
2244	 A == B: check for 0 byte, next block if not found.
2245	 2: End of the inline comparison:
2246	 A != B: branch to cleanup code to compute result.
2247	 A == B: check for 0 byte, call strcmp/strncmp
2248	 3: compared requested N bytes:
2249	 A == B: branch to result 0.
2250	 A != B: cleanup code to compute result.  */
2251
2252      rtx dst_label;
2253      if (remain > 0 || equality_compare_rest)
2254	{
2255	  /* Branch to cleanup code, otherwise fall through to do
2256	     more compares.  */
2257	  if (!cleanup_label)
2258	    cleanup_label = gen_label_rtx ();
2259	  dst_label = cleanup_label;
2260	}
2261      else
2262	/* Branch to end and produce result of 0.  */
2263	dst_label = final_move_label;
2264
2265      if (load_mode_size == 1)
2266	{
2267	  /* Special case for comparing just single byte.  */
2268	  if (equality_compare_rest)
2269	    {
2270	      /* Use subf./bne to branch to final_move_label if the
2271		 byte differs, otherwise fall through to the strncmp
2272		 call.  We must also check for a zero byte here as we
2273		 must not make the library call if this is the end of
2274		 the string.  */
2275
2276	      rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2277	      rtx cond = gen_reg_rtx (CCmode);
2278	      rtx diff_rtx = gen_rtx_MINUS (word_mode,
2279					    tmp_reg_src1, tmp_reg_src2);
2280	      rs6000_emit_dot_insn (result_reg, diff_rtx, 2, cond);
2281	      rtx cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2282
2283	      rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2284						 lab_ref, pc_rtx);
2285	      rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2286	      add_reg_br_prob_note (j, profile_probability::unlikely ());
2287	      JUMP_LABEL (j) = final_move_label;
2288	      LABEL_NUSES (final_move_label) += 1;
2289
2290	      /* Check for zero byte here before fall through to
2291		 library call.  This catches the case where the
2292		 strings are equal and end in a zero byte at this
2293		 position.  */
2294
2295	      rtx cond0 = gen_reg_rtx (CCmode);
2296	      emit_move_insn (cond0, gen_rtx_COMPARE (CCmode, tmp_reg_src1,
2297						      const0_rtx));
2298
2299	      rtx cmp0eq_rtx = gen_rtx_EQ (VOIDmode, cond0, const0_rtx);
2300
2301	      rtx ifelse0 = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp0eq_rtx,
2302						 lab_ref, pc_rtx);
2303	      rtx_insn *j0 = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse0));
2304	      add_reg_br_prob_note (j0, profile_probability::unlikely ());
2305	      JUMP_LABEL (j0) = final_move_label;
2306	      LABEL_NUSES (final_move_label) += 1;
2307	    }
2308	  else
2309	    {
2310	      /* This is the last byte to be compared so we can use
2311		 subf to compute the final result and branch
2312		 unconditionally to final_move_label.  */
2313
2314	      do_sub3 (result_reg, tmp_reg_src1, tmp_reg_src2);
2315
2316	      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_move_label);
2317	      rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2318	      JUMP_LABEL (j) = final_move_label;
2319	      LABEL_NUSES (final_move_label) += 1;
2320	      emit_barrier ();
2321	    }
2322	}
2323      else
2324	{
2325	  rtx cmpb_zero = gen_reg_rtx (word_mode);
2326	  rtx cmpb_diff = gen_reg_rtx (word_mode);
2327	  rtx zero_reg = gen_reg_rtx (word_mode);
2328	  rtx lab_ref = gen_rtx_LABEL_REF (VOIDmode, dst_label);
2329	  rtx cond = gen_reg_rtx (CCmode);
2330
2331	  emit_move_insn (zero_reg, GEN_INT (0));
2332	  do_cmpb3 (cmpb_diff, tmp_reg_src1, tmp_reg_src2);
2333	  do_cmpb3 (cmpb_zero, tmp_reg_src1, zero_reg);
2334	  rtx not_diff = gen_rtx_NOT (word_mode, cmpb_diff);
2335	  rtx orc_rtx = gen_rtx_IOR (word_mode, not_diff, cmpb_zero);
2336
2337	  rs6000_emit_dot_insn (result_reg, orc_rtx, 2, cond);
2338
2339	  rtx cmp_rtx;
2340	  if (remain == 0 && !equality_compare_rest)
2341	    cmp_rtx = gen_rtx_EQ (VOIDmode, cond, const0_rtx);
2342	  else
2343	    cmp_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
2344
2345	  rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, cmp_rtx,
2346					     lab_ref, pc_rtx);
2347	  rtx_insn *j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
2348	  add_reg_br_prob_note (j, profile_probability::unlikely ());
2349	  JUMP_LABEL (j) = dst_label;
2350	  LABEL_NUSES (dst_label) += 1;
2351	}
2352
2353      offset += cmp_bytes;
2354      bytes_to_compare -= cmp_bytes;
2355    }
2356
2357  *p_cleanup_label = cleanup_label;
2358  return;
2359}
2360
2361/* Generate the final sequence that identifies the differing
2362   byte and generates the final result, taking into account
2363   zero bytes:
2364
2365   cntlzd            get bit of first zero/diff byte
2366   addi              convert for rldcl use
2367   rldcl rldcl       extract diff/zero byte
2368   subf              subtract for final result
2369
2370   STR1 is the reg rtx for data from string 1.
2371   STR2 is the reg rtx for data from string 2.
2372   RESULT is the reg rtx for the comparison result.  */
2373
2374static void
2375emit_final_str_compare_gpr (rtx str1, rtx str2, rtx result)
2376{
2377  machine_mode m = GET_MODE (str1);
2378  rtx rot_amt = gen_reg_rtx (m);
2379
2380  rtx rot1_1 = gen_reg_rtx (m);
2381  rtx rot1_2 = gen_reg_rtx (m);
2382  rtx rot2_1 = gen_reg_rtx (m);
2383  rtx rot2_2 = gen_reg_rtx (m);
2384
2385  if (m == SImode)
2386    {
2387      emit_insn (gen_clzsi2 (rot_amt, result));
2388      emit_insn (gen_addsi3 (rot_amt, rot_amt, GEN_INT (8)));
2389      emit_insn (gen_rotlsi3 (rot1_1, str1,
2390			      gen_lowpart (SImode, rot_amt)));
2391      emit_insn (gen_andsi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2392      emit_insn (gen_rotlsi3 (rot2_1, str2,
2393			      gen_lowpart (SImode, rot_amt)));
2394      emit_insn (gen_andsi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2395      emit_insn (gen_subsi3 (result, rot1_2, rot2_2));
2396    }
2397  else if (m == DImode)
2398    {
2399      emit_insn (gen_clzdi2 (rot_amt, result));
2400      emit_insn (gen_adddi3 (rot_amt, rot_amt, GEN_INT (8)));
2401      emit_insn (gen_rotldi3 (rot1_1, str1,
2402			      gen_lowpart (SImode, rot_amt)));
2403      emit_insn (gen_anddi3_mask (rot1_2, rot1_1, GEN_INT (0xff)));
2404      emit_insn (gen_rotldi3 (rot2_1, str2,
2405			      gen_lowpart (SImode, rot_amt)));
2406      emit_insn (gen_anddi3_mask (rot2_2, rot2_1, GEN_INT (0xff)));
2407      emit_insn (gen_subdi3 (result, rot1_2, rot2_2));
2408    }
2409  else
2410    gcc_unreachable ();
2411
2412  return;
2413}
2414
2415/* Expand a string compare operation with length, and return
2416   true if successful.  Return false if we should let the
2417   compiler generate normal code, probably a strncmp call.
2418
2419   OPERANDS[0] is the target (result).
2420   OPERANDS[1] is the first source.
2421   OPERANDS[2] is the second source.
2422   If NO_LENGTH is zero, then:
2423   OPERANDS[3] is the length.
2424   OPERANDS[4] is the alignment in bytes.
2425   If NO_LENGTH is nonzero, then:
2426   OPERANDS[3] is the alignment in bytes.  */
2427bool
2428expand_strn_compare (rtx operands[], int no_length)
2429{
2430  rtx target = operands[0];
2431  rtx orig_src1 = operands[1];
2432  rtx orig_src2 = operands[2];
2433  rtx bytes_rtx, align_rtx;
2434  if (no_length)
2435    {
2436      bytes_rtx = NULL;
2437      align_rtx = operands[3];
2438    }
2439  else
2440    {
2441      bytes_rtx = operands[3];
2442      align_rtx = operands[4];
2443    }
2444
2445  rtx src1_addr = force_reg (Pmode, XEXP (orig_src1, 0));
2446  rtx src2_addr = force_reg (Pmode, XEXP (orig_src2, 0));
2447
2448  /* If we have a length, it must be constant.  This simplifies things
2449     a bit as we don't have to generate code to check if we've exceeded
2450     the length.  Later this could be expanded to handle this case.  */
2451  if (!no_length && !CONST_INT_P (bytes_rtx))
2452    return false;
2453
2454  /* This must be a fixed size alignment.  */
2455  if (!CONST_INT_P (align_rtx))
2456    return false;
2457
2458  unsigned int base_align = UINTVAL (align_rtx);
2459  unsigned int align1 = MEM_ALIGN (orig_src1) / BITS_PER_UNIT;
2460  unsigned int align2 = MEM_ALIGN (orig_src2) / BITS_PER_UNIT;
2461
2462  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
2463  if (targetm.slow_unaligned_access (word_mode, align1)
2464      || targetm.slow_unaligned_access (word_mode, align2))
2465    return false;
2466
2467  gcc_assert (GET_MODE (target) == SImode);
2468
2469  unsigned int required_align = 8;
2470
2471  unsigned HOST_WIDE_INT offset = 0;
2472  unsigned HOST_WIDE_INT bytes; /* N from the strncmp args if available.  */
2473  unsigned HOST_WIDE_INT compare_length; /* How much to compare inline.  */
2474
2475  if (no_length)
2476    bytes = rs6000_string_compare_inline_limit;
2477  else
2478    bytes = UINTVAL (bytes_rtx);
2479
2480  /* Is it OK to use vec/vsx for this.  TARGET_VSX means we have at
2481     least POWER7 but we use TARGET_EFFICIENT_UNALIGNED_VSX which is
2482     at least POWER8.  That way we can rely on overlapping compares to
2483     do the final comparison of less than 16 bytes.  Also I do not
2484     want to deal with making this work for 32 bits.  In addition, we
2485     have to make sure that we have at least P8_VECTOR (we don't allow
2486     P9_VECTOR without P8_VECTOR).  */
2487  int use_vec = (bytes >= 16 && !TARGET_32BIT
2488		 && TARGET_EFFICIENT_UNALIGNED_VSX && TARGET_P8_VECTOR);
2489
2490  if (use_vec)
2491    required_align = 16;
2492
2493  machine_mode load_mode;
2494  rtx tmp_reg_src1, tmp_reg_src2;
2495  if (use_vec)
2496    {
2497      load_mode = V16QImode;
2498      tmp_reg_src1 = gen_reg_rtx (V16QImode);
2499      tmp_reg_src2 = gen_reg_rtx (V16QImode);
2500    }
2501  else
2502    {
2503      load_mode = select_block_compare_mode (0, bytes, base_align);
2504      tmp_reg_src1 = gen_reg_rtx (word_mode);
2505      tmp_reg_src2 = gen_reg_rtx (word_mode);
2506    }
2507
2508  compare_length = rs6000_string_compare_inline_limit;
2509
2510  /* If we have equality at the end of the last compare and we have not
2511     found the end of the string, we need to call strcmp/strncmp to
2512     compare the remainder.  */
2513  bool equality_compare_rest = false;
2514
2515  if (no_length)
2516    {
2517      bytes = compare_length;
2518      equality_compare_rest = true;
2519    }
2520  else
2521    {
2522      if (bytes <= compare_length)
2523	compare_length = bytes;
2524      else
2525	equality_compare_rest = true;
2526    }
2527
2528  rtx result_reg = gen_reg_rtx (word_mode);
2529  rtx final_move_label = gen_label_rtx ();
2530  rtx final_label = gen_label_rtx ();
2531  rtx begin_compare_label = NULL;
2532
2533  if (base_align < required_align)
2534    {
2535      /* Generate code that checks distance to 4k boundary for this case.  */
2536      begin_compare_label = gen_label_rtx ();
2537      rtx strncmp_label = gen_label_rtx ();
2538      rtx jmp;
2539
2540      /* Strncmp for power8 in glibc does this:
2541	 rldicl r8,r3,0,52
2542	 cmpldi cr7,r8,4096-16
2543	 bgt    cr7,L(pagecross) */
2544
2545      /* Make sure that the length we use for the alignment test and
2546         the subsequent code generation are in agreement so we do not
2547         go past the length we tested for a 4k boundary crossing.  */
2548      unsigned HOST_WIDE_INT align_test = compare_length;
2549      if (align_test < required_align)
2550        {
2551          align_test = HOST_WIDE_INT_1U << ceil_log2 (align_test);
2552          base_align = align_test;
2553        }
2554      else
2555        {
2556          align_test = ROUND_UP (align_test, required_align);
2557          base_align = required_align;
2558        }
2559
2560      if (align1 < required_align)
2561        expand_strncmp_align_check (strncmp_label, src1_addr, align_test);
2562      if (align2 < required_align)
2563        expand_strncmp_align_check (strncmp_label, src2_addr, align_test);
2564
2565      /* Now generate the following sequence:
2566	 - branch to begin_compare
2567	 - strncmp_label
2568	 - call to strncmp
2569	 - branch to final_label
2570	 - begin_compare_label */
2571
2572      rtx cmp_ref = gen_rtx_LABEL_REF (VOIDmode, begin_compare_label);
2573      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, cmp_ref));
2574      JUMP_LABEL (jmp) = begin_compare_label;
2575      LABEL_NUSES (begin_compare_label) += 1;
2576      emit_barrier ();
2577
2578      emit_label (strncmp_label);
2579
2580      if (no_length)
2581	{
2582	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2583	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2584				   target, LCT_NORMAL, GET_MODE (target),
2585				   force_reg (Pmode, src1_addr), Pmode,
2586				   force_reg (Pmode, src2_addr), Pmode);
2587	}
2588      else
2589	{
2590	  /* -m32 -mpowerpc64 results in word_mode being DImode even
2591	     though otherwise it is 32-bit.  The length arg to strncmp
2592	     is a size_t which will be the same size as pointers.  */
2593	  rtx len_rtx = gen_reg_rtx (Pmode);
2594	  emit_move_insn (len_rtx, gen_int_mode (bytes, Pmode));
2595
2596	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2597	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2598				   target, LCT_NORMAL, GET_MODE (target),
2599				   force_reg (Pmode, src1_addr), Pmode,
2600				   force_reg (Pmode, src2_addr), Pmode,
2601				   len_rtx, Pmode);
2602	}
2603
2604      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2605      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2606      JUMP_LABEL (jmp) = final_label;
2607      LABEL_NUSES (final_label) += 1;
2608      emit_barrier ();
2609      emit_label (begin_compare_label);
2610    }
2611
2612  rtx cleanup_label = NULL;
2613  rtx s1addr = NULL, s2addr = NULL, off_reg = NULL, vec_result = NULL;
2614
2615  /* Generate a sequence of GPR or VEC/VSX instructions to compare out
2616     to the length specified.  */
2617  if (use_vec)
2618    {
2619      s1addr = gen_reg_rtx (Pmode);
2620      s2addr = gen_reg_rtx (Pmode);
2621      off_reg = gen_reg_rtx (Pmode);
2622      vec_result = gen_reg_rtx (load_mode);
2623      emit_move_insn (result_reg, GEN_INT (0));
2624      expand_cmp_vec_sequence (compare_length,
2625			       orig_src1, orig_src2,
2626			       s1addr, s2addr, off_reg,
2627			       tmp_reg_src1, tmp_reg_src2,
2628			       vec_result,
2629			       equality_compare_rest,
2630			       &cleanup_label, final_move_label, true);
2631    }
2632  else
2633    expand_strncmp_gpr_sequence (compare_length, base_align,
2634				 orig_src1, orig_src2,
2635				 tmp_reg_src1, tmp_reg_src2,
2636				 result_reg,
2637				 equality_compare_rest,
2638				 &cleanup_label, final_move_label);
2639
2640  offset = compare_length;
2641
2642  if (equality_compare_rest)
2643    {
2644      /* Update pointers past what has been compared already.  */
2645      rtx src1 = force_reg (Pmode,
2646			    gen_rtx_PLUS (Pmode, src1_addr, GEN_INT (offset)));
2647      rtx src2 = force_reg (Pmode,
2648			    gen_rtx_PLUS (Pmode, src2_addr, GEN_INT (offset)));
2649
2650      /* Construct call to strcmp/strncmp to compare the rest of the string.  */
2651      if (no_length)
2652	{
2653	  tree fun = builtin_decl_explicit (BUILT_IN_STRCMP);
2654	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2655				   target, LCT_NORMAL, GET_MODE (target),
2656				   src1, Pmode, src2, Pmode);
2657	}
2658      else
2659	{
2660	  rtx len_rtx = gen_reg_rtx (Pmode);
2661	  emit_move_insn (len_rtx, gen_int_mode (bytes - compare_length, Pmode));
2662	  tree fun = builtin_decl_explicit (BUILT_IN_STRNCMP);
2663	  emit_library_call_value (XEXP (DECL_RTL (fun), 0),
2664				   target, LCT_NORMAL, GET_MODE (target),
2665				   src1, Pmode, src2, Pmode, len_rtx, Pmode);
2666	}
2667
2668      rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
2669      rtx jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, fin_ref));
2670      JUMP_LABEL (jmp) = final_label;
2671      LABEL_NUSES (final_label) += 1;
2672      emit_barrier ();
2673    }
2674
2675  if (cleanup_label)
2676    emit_label (cleanup_label);
2677
2678  if (use_vec)
2679    emit_final_compare_vec (tmp_reg_src1, tmp_reg_src2, result_reg,
2680			    s1addr, s2addr, orig_src1, orig_src2,
2681			    off_reg, vec_result);
2682  else
2683    emit_final_str_compare_gpr (tmp_reg_src1, tmp_reg_src2, result_reg);
2684
2685  emit_label (final_move_label);
2686  emit_insn (gen_movsi (target,
2687			gen_lowpart (SImode, result_reg)));
2688  emit_label (final_label);
2689  return true;
2690}
2691
2692/* Generate loads and stores for a move of v4si mode using lvx/stvx.
2693   This uses altivec_{l,st}vx_<mode>_internal which use unspecs to
2694   keep combine from changing what instruction gets used.
2695
2696   DEST is the destination for the data.
2697   SRC is the source of the data for the move.  */
2698
2699static rtx
2700gen_lvx_v4si_move (rtx dest, rtx src)
2701{
2702  gcc_assert (MEM_P (dest) ^ MEM_P (src));
2703  gcc_assert (GET_MODE (dest) == V4SImode && GET_MODE (src) == V4SImode);
2704
2705  if (MEM_P (dest))
2706    return gen_altivec_stvx_v4si_internal (dest, src);
2707  else
2708    return gen_altivec_lvx_v4si_internal (dest, src);
2709}
2710
2711/* Expand a block move operation, and return 1 if successful.  Return 0
2712   if we should let the compiler generate normal code.
2713
2714   operands[0] is the destination
2715   operands[1] is the source
2716   operands[2] is the length
2717   operands[3] is the alignment */
2718
2719#define MAX_MOVE_REG 4
2720
2721int
2722expand_block_move (rtx operands[], bool might_overlap)
2723{
2724  rtx orig_dest = operands[0];
2725  rtx orig_src	= operands[1];
2726  rtx bytes_rtx	= operands[2];
2727  rtx align_rtx = operands[3];
2728  int constp	= CONST_INT_P (bytes_rtx);
2729  int align;
2730  int bytes;
2731  int offset;
2732  int move_bytes;
2733  rtx loads[MAX_MOVE_REG];
2734  rtx stores[MAX_MOVE_REG];
2735  int num_reg = 0;
2736
2737  /* If this is not a fixed size move, just call memcpy */
2738  if (! constp)
2739    return 0;
2740
2741  /* This must be a fixed size alignment */
2742  gcc_assert (CONST_INT_P (align_rtx));
2743  align = INTVAL (align_rtx) * BITS_PER_UNIT;
2744
2745  /* Anything to move? */
2746  bytes = INTVAL (bytes_rtx);
2747  if (bytes <= 0)
2748    return 1;
2749
2750  if (bytes > rs6000_block_move_inline_limit)
2751    return 0;
2752
2753  for (offset = 0; bytes > 0; offset += move_bytes, bytes -= move_bytes)
2754    {
2755      union {
2756	rtx (*movmemsi) (rtx, rtx, rtx, rtx);
2757	rtx (*mov) (rtx, rtx);
2758      } gen_func;
2759      machine_mode mode = BLKmode;
2760      rtx src, dest;
2761
2762      /* Altivec first, since it will be faster than a string move
2763	 when it applies, and usually not significantly larger.  */
2764      if (TARGET_ALTIVEC && bytes >= 16 && align >= 128)
2765	{
2766	  move_bytes = 16;
2767	  mode = V4SImode;
2768	  gen_func.mov = gen_lvx_v4si_move;
2769	}
2770      else if (bytes >= 8 && TARGET_POWERPC64
2771	       && (align >= 64 || !STRICT_ALIGNMENT))
2772	{
2773	  move_bytes = 8;
2774	  mode = DImode;
2775	  gen_func.mov = gen_movdi;
2776	  if (offset == 0 && align < 64)
2777	    {
2778	      rtx addr;
2779
2780	      /* If the address form is reg+offset with offset not a
2781		 multiple of four, reload into reg indirect form here
2782		 rather than waiting for reload.  This way we get one
2783		 reload, not one per load and/or store.  */
2784	      addr = XEXP (orig_dest, 0);
2785	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2786		  && CONST_INT_P (XEXP (addr, 1))
2787		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2788		{
2789		  addr = copy_addr_to_reg (addr);
2790		  orig_dest = replace_equiv_address (orig_dest, addr);
2791		}
2792	      addr = XEXP (orig_src, 0);
2793	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
2794		  && CONST_INT_P (XEXP (addr, 1))
2795		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
2796		{
2797		  addr = copy_addr_to_reg (addr);
2798		  orig_src = replace_equiv_address (orig_src, addr);
2799		}
2800	    }
2801	}
2802      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
2803	{			/* move 4 bytes */
2804	  move_bytes = 4;
2805	  mode = SImode;
2806	  gen_func.mov = gen_movsi;
2807	}
2808      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
2809	{			/* move 2 bytes */
2810	  move_bytes = 2;
2811	  mode = HImode;
2812	  gen_func.mov = gen_movhi;
2813	}
2814      else /* move 1 byte at a time */
2815	{
2816	  move_bytes = 1;
2817	  mode = QImode;
2818	  gen_func.mov = gen_movqi;
2819	}
2820
2821      /* Mode is always set to something other than BLKmode by one of the
2822	 cases of the if statement above.  */
2823      gcc_assert (mode != BLKmode);
2824
2825      src = adjust_address (orig_src, mode, offset);
2826      dest = adjust_address (orig_dest, mode, offset);
2827
2828      rtx tmp_reg = gen_reg_rtx (mode);
2829
2830      loads[num_reg]    = (*gen_func.mov) (tmp_reg, src);
2831      stores[num_reg++] = (*gen_func.mov) (dest, tmp_reg);
2832
2833      /* If we didn't succeed in doing it in one pass, we can't do it in the
2834	 might_overlap case.  Bail out and return failure.  */
2835      if (might_overlap && num_reg >= MAX_MOVE_REG
2836	  && bytes > move_bytes)
2837	return 0;
2838
2839      /* Emit loads and stores saved up.  */
2840      if (num_reg >= MAX_MOVE_REG || bytes == move_bytes)
2841	{
2842	  int i;
2843	  for (i = 0; i < num_reg; i++)
2844	    emit_insn (loads[i]);
2845	  for (i = 0; i < num_reg; i++)
2846	    emit_insn (stores[i]);
2847	  num_reg = 0;
2848	}
2849
2850    }
2851
2852  return 1;
2853}
2854