1/* Subroutines used to remove unnecessary doubleword swaps
2   for p8 little-endian VSX code.
3   Copyright (C) 1991-2022 Free Software Foundation, Inc.
4
5   This file is part of GCC.
6
7   GCC is free software; you can redistribute it and/or modify it
8   under the terms of the GNU General Public License as published
9   by the Free Software Foundation; either version 3, or (at your
10   option) any later version.
11
12   GCC is distributed in the hope that it will be useful, but WITHOUT
13   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15   License for more details.
16
17   You should have received a copy of the GNU General Public License
18   along with GCC; see the file COPYING3.  If not see
19   <http://www.gnu.org/licenses/>.  */
20
21#define IN_TARGET_CODE 1
22
23#include "config.h"
24#include "system.h"
25#include "coretypes.h"
26#include "backend.h"
27#include "rtl.h"
28#include "tree.h"
29#include "memmodel.h"
30#include "df.h"
31#include "tm_p.h"
32#include "ira.h"
33#include "print-tree.h"
34#include "varasm.h"
35#include "explow.h"
36#include "expr.h"
37#include "output.h"
38#include "tree-pass.h"
39#include "rtx-vector-builder.h"
40
41/* Analyze vector computations and remove unnecessary doubleword
42   swaps (xxswapdi instructions).  This pass is performed only
43   for little-endian VSX code generation.
44
45   For this specific case, loads and stores of 4x32 and 2x64 vectors
46   are inefficient.  These are implemented using the lvx2dx and
47   stvx2dx instructions, which invert the order of doublewords in
48   a vector register.  Thus the code generation inserts an xxswapdi
49   after each such load, and prior to each such store.  (For spill
50   code after register assignment, an additional xxswapdi is inserted
51   following each store in order to return a hard register to its
52   unpermuted value.)
53
54   The extra xxswapdi instructions reduce performance.  This can be
55   particularly bad for vectorized code.  The purpose of this pass
56   is to reduce the number of xxswapdi instructions required for
57   correctness.
58
59   The primary insight is that much code that operates on vectors
60   does not care about the relative order of elements in a register,
61   so long as the correct memory order is preserved.  If we have
62   a computation where all input values are provided by lvxd2x/xxswapdi
63   sequences, all outputs are stored using xxswapdi/stvxd2x sequences,
64   and all intermediate computations are pure SIMD (independent of
65   element order), then all the xxswapdi's associated with the loads
66   and stores may be removed.
67
68   This pass uses some of the infrastructure and logical ideas from
69   the "web" pass in web.cc.  We create maximal webs of computations
70   fitting the description above using union-find.  Each such web is
71   then optimized by removing its unnecessary xxswapdi instructions.
72
73   The pass is placed prior to global optimization so that we can
74   perform the optimization in the safest and simplest way possible;
75   that is, by replacing each xxswapdi insn with a register copy insn.
76   Subsequent forward propagation will remove copies where possible.
77
78   There are some operations sensitive to element order for which we
79   can still allow the operation, provided we modify those operations.
80   These include CONST_VECTORs, for which we must swap the first and
81   second halves of the constant vector; and SUBREGs, for which we
82   must adjust the byte offset to account for the swapped doublewords.
83   A remaining opportunity would be non-immediate-form splats, for
84   which we should adjust the selected lane of the input.  We should
85   also make code generation adjustments for sum-across operations,
86   since this is a common vectorizer reduction.
87
88   Because we run prior to the first split, we can see loads and stores
89   here that match *vsx_le_perm_{load,store}_<mode>.  These are vanilla
90   vector loads and stores that have not yet been split into a permuting
91   load/store and a swap.  (One way this can happen is with a builtin
92   call to vec_vsx_{ld,st}.)  We can handle these as well, but rather
93   than deleting a swap, we convert the load/store into a permuting
94   load/store (which effectively removes the swap).  */
95
96/* Notes on Permutes
97
98   We do not currently handle computations that contain permutes.  There
99   is a general transformation that can be performed correctly, but it
100   may introduce more expensive code than it replaces.  To handle these
101   would require a cost model to determine when to perform the optimization.
102   This commentary records how this could be done if desired.
103
104   The most general permute is something like this (example for V16QI):
105
106   (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
107                     (parallel [(const_int a0) (const_int a1)
108                                 ...
109                                (const_int a14) (const_int a15)]))
110
111   where a0,...,a15 are in [0,31] and select elements from op1 and op2
112   to produce in the result.
113
114   Regardless of mode, we can convert the PARALLEL to a mask of 16
115   byte-element selectors.  Let's call this M, with M[i] representing
116   the ith byte-element selector value.  Then if we swap doublewords
117   throughout the computation, we can get correct behavior by replacing
118   M with M' as follows:
119
120    M'[i] = { (M[i]+8)%16      : M[i] in [0,15]
121            { ((M[i]+8)%16)+16 : M[i] in [16,31]
122
123   This seems promising at first, since we are just replacing one mask
124   with another.  But certain masks are preferable to others.  If M
125   is a mask that matches a vmrghh pattern, for example, M' certainly
126   will not.  Instead of a single vmrghh, we would generate a load of
127   M' and a vperm.  So we would need to know how many xxswapd's we can
128   remove as a result of this transformation to determine if it's
129   profitable; and preferably the logic would need to be aware of all
130   the special preferable masks.
131
132   Another form of permute is an UNSPEC_VPERM, in which the mask is
133   already in a register.  In some cases, this mask may be a constant
134   that we can discover with ud-chains, in which case the above
135   transformation is ok.  However, the common usage here is for the
136   mask to be produced by an UNSPEC_LVSL, in which case the mask
137   cannot be known at compile time.  In such a case we would have to
138   generate several instructions to compute M' as above at run time,
139   and a cost model is needed again.
140
141   However, when the mask M for an UNSPEC_VPERM is loaded from the
142   constant pool, we can replace M with M' as above at no cost
143   beyond adding a constant pool entry.  */
144
145/* This is based on the union-find logic in web.cc.  web_entry_base is
146   defined in df.h.  */
147class swap_web_entry : public web_entry_base
148{
149 public:
150  /* Pointer to the insn.  */
151  rtx_insn *insn;
152  /* Set if insn contains a mention of a vector register.  All other
153     fields are undefined if this field is unset.  */
154  unsigned int is_relevant : 1;
155  /* Set if insn is a load.  */
156  unsigned int is_load : 1;
157  /* Set if insn is a store.  */
158  unsigned int is_store : 1;
159  /* Set if insn is a doubleword swap.  This can either be a register swap
160     or a permuting load or store (test is_load and is_store for this).  */
161  unsigned int is_swap : 1;
162  /* Set if the insn has a live-in use of a parameter register.  */
163  unsigned int is_live_in : 1;
164  /* Set if the insn has a live-out def of a return register.  */
165  unsigned int is_live_out : 1;
166  /* Set if the insn contains a subreg reference of a vector register.  */
167  unsigned int contains_subreg : 1;
168  /* Set if the insn contains a 128-bit integer operand.  */
169  unsigned int is_128_int : 1;
170  /* Set if this is a call-insn.  */
171  unsigned int is_call : 1;
172  /* Set if this insn does not perform a vector operation for which
173     element order matters, or if we know how to fix it up if it does.
174     Undefined if is_swap is set.  */
175  unsigned int is_swappable : 1;
176  /* A nonzero value indicates what kind of special handling for this
177     insn is required if doublewords are swapped.  Undefined if
178     is_swappable is not set.  */
179  unsigned int special_handling : 4;
180  /* Set if the web represented by this entry cannot be optimized.  */
181  unsigned int web_not_optimizable : 1;
182  /* Set if this insn should be deleted.  */
183  unsigned int will_delete : 1;
184};
185
186enum special_handling_values {
187  SH_NONE = 0,
188  SH_CONST_VECTOR,
189  SH_SUBREG,
190  SH_NOSWAP_LD,
191  SH_NOSWAP_ST,
192  SH_EXTRACT,
193  SH_SPLAT,
194  SH_XXPERMDI,
195  SH_CONCAT,
196  SH_VPERM
197};
198
199/* Union INSN with all insns containing definitions that reach USE.
200   Detect whether USE is live-in to the current function.  */
201static void
202union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
203{
204  struct df_link *link = DF_REF_CHAIN (use);
205
206  if (!link)
207    insn_entry[INSN_UID (insn)].is_live_in = 1;
208
209  while (link)
210    {
211      if (DF_REF_IS_ARTIFICIAL (link->ref))
212	insn_entry[INSN_UID (insn)].is_live_in = 1;
213
214      if (DF_REF_INSN_INFO (link->ref))
215	{
216	  rtx def_insn = DF_REF_INSN (link->ref);
217	  (void)unionfind_union (insn_entry + INSN_UID (insn),
218				 insn_entry + INSN_UID (def_insn));
219	}
220
221      link = link->next;
222    }
223}
224
225/* Union INSN with all insns containing uses reached from DEF.
226   Detect whether DEF is live-out from the current function.  */
227static void
228union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
229{
230  struct df_link *link = DF_REF_CHAIN (def);
231
232  if (!link)
233    insn_entry[INSN_UID (insn)].is_live_out = 1;
234
235  while (link)
236    {
237      /* This could be an eh use or some other artificial use;
238	 we treat these all the same (killing the optimization).  */
239      if (DF_REF_IS_ARTIFICIAL (link->ref))
240	insn_entry[INSN_UID (insn)].is_live_out = 1;
241
242      if (DF_REF_INSN_INFO (link->ref))
243	{
244	  rtx use_insn = DF_REF_INSN (link->ref);
245	  (void)unionfind_union (insn_entry + INSN_UID (insn),
246				 insn_entry + INSN_UID (use_insn));
247	}
248
249      link = link->next;
250    }
251}
252
253/* Return 1 iff PAT (a SINGLE_SET) is a rotate 64 bit expression; else return
254   0.  */
255
256static bool
257pattern_is_rotate64 (rtx pat)
258{
259  rtx rot = SET_SRC (pat);
260
261  if (GET_CODE (rot) == ROTATE && CONST_INT_P (XEXP (rot, 1))
262      && INTVAL (XEXP (rot, 1)) == 64)
263    return true;
264
265  return false;
266}
267
268/* Return 1 iff INSN is a load insn, including permuting loads that
269   represent an lvxd2x instruction; else return 0.  */
270static unsigned int
271insn_is_load_p (rtx insn)
272{
273  rtx body = PATTERN (insn);
274
275  if (GET_CODE (body) == SET)
276    {
277      if (MEM_P (SET_SRC (body)))
278	return 1;
279
280      if (GET_CODE (SET_SRC (body)) == VEC_SELECT
281	  && MEM_P (XEXP (SET_SRC (body), 0)))
282	return 1;
283
284      if (pattern_is_rotate64 (body) && MEM_P (XEXP (SET_SRC (body), 0)))
285	return 1;
286
287      return 0;
288    }
289
290  if (GET_CODE (body) != PARALLEL)
291    return 0;
292
293  rtx set = XVECEXP (body, 0, 0);
294
295  if (GET_CODE (set) == SET && MEM_P (SET_SRC (set)))
296    return 1;
297
298  return 0;
299}
300
301/* Return 1 iff INSN is a store insn, including permuting stores that
302   represent an stvxd2x instruction; else return 0.  */
303static unsigned int
304insn_is_store_p (rtx insn)
305{
306  rtx body = PATTERN (insn);
307  if (GET_CODE (body) == SET && MEM_P (SET_DEST (body)))
308    return 1;
309  if (GET_CODE (body) != PARALLEL)
310    return 0;
311  rtx set = XVECEXP (body, 0, 0);
312  if (GET_CODE (set) == SET && MEM_P (SET_DEST (set)))
313    return 1;
314  return 0;
315}
316
317/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
318   a permuting load, or a permuting store.  */
319static unsigned int
320insn_is_swap_p (rtx insn)
321{
322  rtx body = PATTERN (insn);
323  if (GET_CODE (body) != SET)
324    return 0;
325  rtx rhs = SET_SRC (body);
326  if (pattern_is_rotate64 (body))
327    return 1;
328  if (GET_CODE (rhs) != VEC_SELECT)
329    return 0;
330  rtx parallel = XEXP (rhs, 1);
331  if (GET_CODE (parallel) != PARALLEL)
332    return 0;
333  unsigned int len = XVECLEN (parallel, 0);
334  if (len != 2 && len != 4 && len != 8 && len != 16)
335    return 0;
336  for (unsigned int i = 0; i < len / 2; ++i)
337    {
338      rtx op = XVECEXP (parallel, 0, i);
339      if (!CONST_INT_P (op) || INTVAL (op) != len / 2 + i)
340	return 0;
341    }
342  for (unsigned int i = len / 2; i < len; ++i)
343    {
344      rtx op = XVECEXP (parallel, 0, i);
345      if (!CONST_INT_P (op) || INTVAL (op) != i - len / 2)
346	return 0;
347    }
348  return 1;
349}
350
351/* Return true iff EXPR represents the sum of two registers.  */
352bool
353rs6000_sum_of_two_registers_p (const_rtx expr)
354{
355  if (GET_CODE (expr) == PLUS)
356    {
357      const_rtx operand1 = XEXP (expr, 0);
358      const_rtx operand2 = XEXP (expr, 1);
359      return (REG_P (operand1) && REG_P (operand2));
360    }
361  return false;
362}
363
364/* Return true iff EXPR represents an address expression that masks off
365   the low-order 4 bits in the style of an lvx or stvx rtl pattern.  */
366bool
367rs6000_quadword_masked_address_p (const_rtx expr)
368{
369  if (GET_CODE (expr) == AND)
370    {
371      const_rtx operand1 = XEXP (expr, 0);
372      const_rtx operand2 = XEXP (expr, 1);
373      if ((REG_P (operand1) || rs6000_sum_of_two_registers_p (operand1))
374	  && CONST_SCALAR_INT_P (operand2) && INTVAL (operand2) == -16)
375	return true;
376    }
377  return false;
378}
379
380/* Return TRUE if INSN represents a swap of a swapped load from memory
381   and the memory address is quad-word aligned.  */
382static bool
383quad_aligned_load_p (swap_web_entry *insn_entry, rtx_insn *insn)
384{
385  unsigned uid = INSN_UID (insn);
386  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
387    return false;
388
389  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
390
391  /* Since insn is known to represent a swap instruction, we know it
392     "uses" only one input variable.  */
393  df_ref use = DF_INSN_INFO_USES (insn_info);
394
395  /* Figure out where this input variable is defined.  */
396  struct df_link *def_link = DF_REF_CHAIN (use);
397
398  /* If there is no definition or the definition is artificial or there are
399     multiple definitions, punt.  */
400  if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
401      || def_link->next)
402    return false;
403
404  rtx def_insn = DF_REF_INSN (def_link->ref);
405  unsigned uid2 = INSN_UID (def_insn);
406  /* We're looking for a load-with-swap insn.  If this is not that,
407     return false.  */
408  if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
409    return false;
410
411  /* If the source of the rtl def is not a set from memory, return
412     false.  */
413  rtx body = PATTERN (def_insn);
414  if (GET_CODE (body) != SET
415      || !(GET_CODE (SET_SRC (body)) == VEC_SELECT
416	   || pattern_is_rotate64 (body))
417      || !MEM_P (XEXP (SET_SRC (body), 0)))
418    return false;
419
420  rtx mem = XEXP (SET_SRC (body), 0);
421  rtx base_reg = XEXP (mem, 0);
422  return ((REG_P (base_reg) || rs6000_sum_of_two_registers_p (base_reg))
423	  && MEM_ALIGN (mem) >= 128) ? true : false;
424}
425
426/* Return TRUE if INSN represents a store-with-swap of a swapped value
427   and the memory address is quad-word aligned.  */
428static bool
429quad_aligned_store_p (swap_web_entry *insn_entry, rtx_insn *insn)
430{
431  unsigned uid = INSN_UID (insn);
432  if (!insn_entry[uid].is_swap || !insn_entry[uid].is_store)
433    return false;
434
435  rtx body = PATTERN (insn);
436  rtx dest_address = XEXP (SET_DEST (body), 0);
437  rtx swap_reg = XEXP (SET_SRC (body), 0);
438
439  /* If the base address for the memory expression is not represented
440     by a single register and is not the sum of two registers, punt.  */
441  if (!REG_P (dest_address) && !rs6000_sum_of_two_registers_p (dest_address))
442    return false;
443
444  /* Confirm that the value to be stored is produced by a swap
445     instruction.  */
446  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
447  df_ref use;
448  FOR_EACH_INSN_INFO_USE (use, insn_info)
449    {
450      struct df_link *def_link = DF_REF_CHAIN (use);
451
452      /* If this is not the definition of the candidate swap register,
453	 then skip it.  I am interested in a different definition.  */
454      if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
455	continue;
456
457      /* If there is no def or the def is artifical or there are
458	 multiple defs, punt.  */
459      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
460	  || def_link->next)
461	return false;
462
463      rtx def_insn = DF_REF_INSN (def_link->ref);
464      unsigned uid2 = INSN_UID (def_insn);
465
466      /* If this source value is not a simple swap, return false */
467      if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load
468	  || insn_entry[uid2].is_store)
469	return false;
470
471      /* I've processed the use that I care about, so break out of
472	 this loop.  */
473      break;
474    }
475
476  /* At this point, we know the source data comes from a swap.  The
477     remaining question is whether the memory address is aligned.  */
478  rtx set = single_set (insn);
479  if (set)
480    {
481      rtx dest = SET_DEST (set);
482      if (MEM_P (dest))
483	return (MEM_ALIGN (dest) >= 128);
484    }
485  return false;
486}
487
488/* Return 1 iff UID, known to reference a swap, is both fed by a load
489   and a feeder of a store.  */
490static unsigned int
491swap_feeds_both_load_and_store (swap_web_entry *insn_entry)
492{
493  rtx insn = insn_entry->insn;
494  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
495  df_ref def, use;
496  struct df_link *link = 0;
497  rtx_insn *load = 0, *store = 0;
498  bool fed_by_load = 0;
499  bool feeds_store = 0;
500
501  FOR_EACH_INSN_INFO_USE (use, insn_info)
502    {
503      link = DF_REF_CHAIN (use);
504      load = DF_REF_INSN (link->ref);
505      if (insn_is_load_p (load) && insn_is_swap_p (load))
506	fed_by_load = 1;
507    }
508
509  FOR_EACH_INSN_INFO_DEF (def, insn_info)
510    {
511      link = DF_REF_CHAIN (def);
512      store = DF_REF_INSN (link->ref);
513      if (insn_is_store_p (store) && insn_is_swap_p (store))
514	feeds_store = 1;
515    }
516
517  return fed_by_load && feeds_store;
518}
519
520/* Return TRUE if insn is a swap fed by a load from the constant pool.  */
521static bool
522const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
523{
524  unsigned uid = INSN_UID (insn);
525  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
526    return false;
527
528  const_rtx tocrel_base;
529
530  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
531  df_ref use;
532
533  /* Iterate over the definitions that are used by this insn.  Since
534     this is known to be a swap insn, expect only one used definnition.  */
535  FOR_EACH_INSN_INFO_USE (use, insn_info)
536    {
537      struct df_link *def_link = DF_REF_CHAIN (use);
538
539      /* If there is no def or the def is artificial or there are
540	 multiple defs, punt.  */
541      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
542	  || def_link->next)
543	return false;
544
545      rtx def_insn = DF_REF_INSN (def_link->ref);
546      unsigned uid2 = INSN_UID (def_insn);
547      /* If this is not a load or is not a swap, return false.  */
548      if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
549	return false;
550
551      /* If the source of the rtl def is not a set from memory, return
552	 false.  */
553      rtx body = PATTERN (def_insn);
554      if (GET_CODE (body) != SET
555	  || !(GET_CODE (SET_SRC (body)) == VEC_SELECT
556	       || pattern_is_rotate64 (body))
557	  || !MEM_P (XEXP (SET_SRC (body), 0)))
558	return false;
559
560      rtx mem = XEXP (SET_SRC (body), 0);
561      rtx base_reg = XEXP (mem, 0);
562      /* If the base address for the memory expression is not
563	 represented by a register, punt.  */
564      if (!REG_P (base_reg))
565	return false;
566
567      df_ref base_use;
568      insn_info = DF_INSN_INFO_GET (def_insn);
569      FOR_EACH_INSN_INFO_USE (base_use, insn_info)
570	{
571	  /* If base_use does not represent base_reg, look for another
572	     use.  */
573	  if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
574	    continue;
575
576	  struct df_link *base_def_link = DF_REF_CHAIN (base_use);
577	  if (!base_def_link || base_def_link->next)
578	    return false;
579
580	  /* Constants held on the stack are not "true" constants
581	     because their values are not part of the static load
582	     image.  If this constant's base reference is a stack
583	     or frame pointer, it is seen as an artificial
584	     reference.  */
585	  if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
586	    return false;
587
588	  rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
589	  rtx tocrel_body = PATTERN (tocrel_insn);
590	  rtx base, offset;
591	  if (GET_CODE (tocrel_body) != SET)
592	    return false;
593	  /* There is an extra level of indirection for small/large
594	     code models.  */
595	  rtx tocrel_expr = SET_SRC (tocrel_body);
596	  if (MEM_P (tocrel_expr))
597	    tocrel_expr = XEXP (tocrel_expr, 0);
598	  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
599	    return false;
600	  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
601
602	  if (!SYMBOL_REF_P (base) || !CONSTANT_POOL_ADDRESS_P (base))
603	    return false;
604	  else
605	    {
606	      /* FIXME: The conditions under which
607	          (SYMBOL_REF_P (const_vector)
608	           && !CONSTANT_POOL_ADDRESS_P (const_vector))
609	         are not well understood.  This code prevents
610	         an internal compiler error which will occur in
611	         replace_swapped_load_constant () if we were to return
612	         true.  Some day, we should figure out how to properly
613	         handle this condition in
614	         replace_swapped_load_constant () and then we can
615	         remove this special test.  */
616	      rtx const_vector = get_pool_constant (base);
617	      if (SYMBOL_REF_P (const_vector)
618		  && CONSTANT_POOL_ADDRESS_P (const_vector))
619		const_vector = get_pool_constant (const_vector);
620	      if (GET_CODE (const_vector) != CONST_VECTOR)
621		return false;
622	    }
623	}
624    }
625  return true;
626}
627
628/* Return TRUE iff OP matches a V2DF reduction pattern.  See the
629   definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md.  */
630static bool
631v2df_reduction_p (rtx op)
632{
633  if (GET_MODE (op) != V2DFmode)
634    return false;
635
636  enum rtx_code code = GET_CODE (op);
637  if (code != PLUS && code != SMIN && code != SMAX)
638    return false;
639
640  rtx concat = XEXP (op, 0);
641  if (GET_CODE (concat) != VEC_CONCAT)
642    return false;
643
644  rtx select0 = XEXP (concat, 0);
645  rtx select1 = XEXP (concat, 1);
646  if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
647    return false;
648
649  rtx reg0 = XEXP (select0, 0);
650  rtx reg1 = XEXP (select1, 0);
651  if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
652    return false;
653
654  rtx parallel0 = XEXP (select0, 1);
655  rtx parallel1 = XEXP (select1, 1);
656  if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
657    return false;
658
659  if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
660      || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
661    return false;
662
663  return true;
664}
665
666/* Return 1 iff OP is an operand that will not be affected by having
667   vector doublewords swapped in memory.  */
668static unsigned int
669rtx_is_swappable_p (rtx op, unsigned int *special)
670{
671  enum rtx_code code = GET_CODE (op);
672  int i, j;
673  rtx parallel;
674
675  switch (code)
676    {
677    case LABEL_REF:
678    case SYMBOL_REF:
679    case CLOBBER:
680    case REG:
681      return 1;
682
683    case VEC_CONCAT:
684    case ASM_INPUT:
685    case ASM_OPERANDS:
686      return 0;
687
688    case CONST_VECTOR:
689      {
690	*special = SH_CONST_VECTOR;
691	return 1;
692      }
693
694    case VEC_DUPLICATE:
695      /* Opportunity: If XEXP (op, 0) has the same mode as the result,
696	 and XEXP (op, 1) is a PARALLEL with a single QImode const int,
697	 it represents a vector splat for which we can do special
698	 handling.  */
699      if (CONST_INT_P (XEXP (op, 0)))
700	return 1;
701      else if (REG_P (XEXP (op, 0))
702	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
703	/* This catches V2DF and V2DI splat, at a minimum.  */
704	return 1;
705      else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
706	       && REG_P (XEXP (XEXP (op, 0), 0))
707	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
708	/* This catches splat of a truncated value.  */
709	return 1;
710      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
711	/* If the duplicated item is from a select, defer to the select
712	   processing to see if we can change the lane for the splat.  */
713	return rtx_is_swappable_p (XEXP (op, 0), special);
714      else
715	return 0;
716
717    case VEC_SELECT:
718      /* A vec_extract operation is ok if we change the lane.  */
719      if (REG_P (XEXP (op, 0))
720	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
721	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
722	  && XVECLEN (parallel, 0) == 1
723	  && CONST_INT_P (XVECEXP (parallel, 0, 0)))
724	{
725	  *special = SH_EXTRACT;
726	  return 1;
727	}
728      /* An XXPERMDI is ok if we adjust the lanes.  Note that if the
729	 XXPERMDI is a swap operation, it will be identified by
730	 insn_is_swap_p and therefore we won't get here.  */
731      else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
732	       && (GET_MODE (XEXP (op, 0)) == V4DFmode
733		   || GET_MODE (XEXP (op, 0)) == V4DImode)
734	       && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
735	       && XVECLEN (parallel, 0) == 2
736	       && CONST_INT_P (XVECEXP (parallel, 0, 0))
737	       && CONST_INT_P (XVECEXP (parallel, 0, 1)))
738	{
739	  *special = SH_XXPERMDI;
740	  return 1;
741	}
742      else if (v2df_reduction_p (op))
743	return 1;
744      else
745	return 0;
746
747    case UNSPEC:
748      {
749	/* Various operations are unsafe for this optimization, at least
750	   without significant additional work.  Permutes are obviously
751	   problematic, as both the permute control vector and the ordering
752	   of the target values are invalidated by doubleword swapping.
753	   Vector pack and unpack modify the number of vector lanes.
754	   Merge-high/low will not operate correctly on swapped operands.
755	   Vector shifts across element boundaries are clearly uncool,
756	   as are vector select and concatenate operations.  Vector
757	   sum-across instructions define one operand with a specific
758	   order-dependent element, so additional fixup code would be
759	   needed to make those work.  Vector set and non-immediate-form
760	   vector splat are element-order sensitive.  A few of these
761	   cases might be workable with special handling if required.
762	   Adding cost modeling would be appropriate in some cases.  */
763	int val = XINT (op, 1);
764	switch (val)
765	  {
766	  default:
767	    break;
768	  case UNSPEC_VBPERMQ:
769	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
770	  case UNSPEC_VPACK_SIGN_UNS_SAT:
771	  case UNSPEC_VPACK_UNS_UNS_MOD:
772	  case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
773	  case UNSPEC_VPACK_UNS_UNS_SAT:
774	  case UNSPEC_VPERM:
775	  case UNSPEC_VPERM_UNS:
776	  case UNSPEC_VPERMHI:
777	  case UNSPEC_VPERMSI:
778	  case UNSPEC_VPERMXOR:
779	  case UNSPEC_VPKPX:
780	  case UNSPEC_VSLDOI:
781	  case UNSPEC_VSLO:
782	  case UNSPEC_VSRO:
783	  case UNSPEC_VSUM2SWS:
784	  case UNSPEC_VSUM4S:
785	  case UNSPEC_VSUM4UBS:
786	  case UNSPEC_VSUMSWS:
787	  case UNSPEC_VSUMSWS_DIRECT:
788	  case UNSPEC_VSX_CONCAT:
789	  case UNSPEC_VSX_CVDPSPN:
790	  case UNSPEC_VSX_CVSPDP:
791	  case UNSPEC_VSX_CVSPDPN:
792	  case UNSPEC_VSX_EXTRACT:
793	  case UNSPEC_VSX_SET:
794	  case UNSPEC_VSX_SLDWI:
795	  case UNSPEC_VSX_VSLO:
796	  case UNSPEC_VUNPACK_HI_SIGN:
797	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
798	  case UNSPEC_VUNPACK_LO_SIGN:
799	  case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
800	  case UNSPEC_VUPKHPX:
801	  case UNSPEC_VUPKHS_V4SF:
802	  case UNSPEC_VUPKHU_V4SF:
803	  case UNSPEC_VUPKLPX:
804	  case UNSPEC_VUPKLS_V4SF:
805	  case UNSPEC_VUPKLU_V4SF:
806	    return 0;
807	  case UNSPEC_VSPLT_DIRECT:
808	  case UNSPEC_VSX_XXSPLTD:
809	    *special = SH_SPLAT;
810	    return 1;
811	  case UNSPEC_REDUC_PLUS:
812	  case UNSPEC_REDUC:
813	    return 1;
814	  case UNSPEC_VPMSUM:
815	    /* vpmsumd is not swappable, but vpmsum[bhw] are.  */
816	    if (GET_MODE (op) == V2DImode)
817	      return 0;
818	    break;
819	  }
820      }
821
822    default:
823      break;
824    }
825
826  const char *fmt = GET_RTX_FORMAT (code);
827  int ok = 1;
828
829  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
830    if (fmt[i] == 'e' || fmt[i] == 'u')
831      {
832	unsigned int special_op = SH_NONE;
833	ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
834	if (special_op == SH_NONE)
835	  continue;
836	/* Ensure we never have two kinds of special handling
837	   for the same insn.  */
838	if (*special != SH_NONE && *special != special_op)
839	  return 0;
840	*special = special_op;
841      }
842    else if (fmt[i] == 'E')
843      for (j = 0; j < XVECLEN (op, i); ++j)
844	{
845	  unsigned int special_op = SH_NONE;
846	  ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
847	  if (special_op == SH_NONE)
848	    continue;
849	  /* Ensure we never have two kinds of special handling
850	     for the same insn.  */
851	  if (*special != SH_NONE && *special != special_op)
852	    return 0;
853	  *special = special_op;
854	}
855
856  return ok;
857}
858
859/* Return 1 iff INSN is an operand that will not be affected by
860   having vector doublewords swapped in memory (in which case
861   *SPECIAL is unchanged), or that can be modified to be correct
862   if vector doublewords are swapped in memory (in which case
863   *SPECIAL is changed to a value indicating how).  */
864static unsigned int
865insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
866		     unsigned int *special)
867{
868  /* Calls are always bad.  */
869  if (GET_CODE (insn) == CALL_INSN)
870    return 0;
871
872  /* Loads and stores seen here are not permuting, but we can still
873     fix them up by converting them to permuting ones.  Exceptions:
874     UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
875     body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
876     for the SET source.  Also we must now make an exception for lvx
877     and stvx when they are not in the UNSPEC_LVX/STVX form (with the
878     explicit "& -16") since this leads to unrecognizable insns.  */
879  rtx body = PATTERN (insn);
880  int i = INSN_UID (insn);
881
882  if (insn_entry[i].is_load)
883    {
884      if (GET_CODE (body) == SET)
885	{
886	  rtx rhs = SET_SRC (body);
887	  /* Even without a swap, the RHS might be a vec_select for, say,
888	     a byte-reversing load.  */
889	  if (!MEM_P (rhs))
890	    return 0;
891	  if (GET_CODE (XEXP (rhs, 0)) == AND)
892	    return 0;
893
894	  *special = SH_NOSWAP_LD;
895	  return 1;
896	}
897      else
898	return 0;
899    }
900
901  if (insn_entry[i].is_store)
902    {
903      if (GET_CODE (body) == SET
904	  && GET_CODE (SET_SRC (body)) != UNSPEC
905	  && GET_CODE (SET_SRC (body)) != VEC_SELECT)
906	{
907	  rtx lhs = SET_DEST (body);
908	  /* Even without a swap, the RHS might be a vec_select for, say,
909	     a byte-reversing store.  */
910	  if (!MEM_P (lhs))
911	    return 0;
912	  if (GET_CODE (XEXP (lhs, 0)) == AND)
913	    return 0;
914
915	  *special = SH_NOSWAP_ST;
916	  return 1;
917	}
918      else
919	return 0;
920    }
921
922  /* A convert to single precision can be left as is provided that
923     all of its uses are in xxspltw instructions that splat BE element
924     zero.  */
925  if (GET_CODE (body) == SET
926      && GET_CODE (SET_SRC (body)) == UNSPEC
927      && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
928    {
929      df_ref def;
930      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
931
932      FOR_EACH_INSN_INFO_DEF (def, insn_info)
933	{
934	  struct df_link *link = DF_REF_CHAIN (def);
935	  if (!link)
936	    return 0;
937
938	  for (; link; link = link->next) {
939	    rtx use_insn = DF_REF_INSN (link->ref);
940	    rtx use_body = PATTERN (use_insn);
941	    if (GET_CODE (use_body) != SET
942		|| GET_CODE (SET_SRC (use_body)) != UNSPEC
943		|| XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
944		|| XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
945	      return 0;
946	  }
947	}
948
949      return 1;
950    }
951
952  /* A concatenation of two doublewords is ok if we reverse the
953     order of the inputs.  */
954  if (GET_CODE (body) == SET
955      && GET_CODE (SET_SRC (body)) == VEC_CONCAT
956      && (GET_MODE (SET_SRC (body)) == V2DFmode
957	  || GET_MODE (SET_SRC (body)) == V2DImode))
958    {
959      *special = SH_CONCAT;
960      return 1;
961    }
962
963  /* V2DF reductions are always swappable.  */
964  if (GET_CODE (body) == PARALLEL)
965    {
966      rtx expr = XVECEXP (body, 0, 0);
967      if (GET_CODE (expr) == SET
968	  && v2df_reduction_p (SET_SRC (expr)))
969	return 1;
970    }
971
972  /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
973     constant pool.  */
974  if (GET_CODE (body) == SET
975      && GET_CODE (SET_SRC (body)) == UNSPEC
976      && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
977      && XVECLEN (SET_SRC (body), 0) == 3
978      && REG_P (XVECEXP (SET_SRC (body), 0, 2)))
979    {
980      rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
981      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
982      df_ref use;
983      FOR_EACH_INSN_INFO_USE (use, insn_info)
984	if (rtx_equal_p (DF_REF_REG (use), mask_reg))
985	  {
986	    struct df_link *def_link = DF_REF_CHAIN (use);
987	    /* Punt if multiple definitions for this reg.  */
988	    if (def_link && !def_link->next &&
989		const_load_sequence_p (insn_entry,
990				       DF_REF_INSN (def_link->ref)))
991	      {
992		*special = SH_VPERM;
993		return 1;
994	      }
995	  }
996    }
997
998  /* Otherwise check the operands for vector lane violations.  */
999  return rtx_is_swappable_p (body, special);
1000}
1001
1002enum chain_purpose { FOR_LOADS, FOR_STORES };
1003
1004/* Return true if the UD or DU chain headed by LINK is non-empty,
1005   and every entry on the chain references an insn that is a
1006   register swap.  Furthermore, if PURPOSE is FOR_LOADS, each such
1007   register swap must have only permuting loads as reaching defs.
1008   If PURPOSE is FOR_STORES, each such register swap must have only
1009   register swaps or permuting stores as reached uses.  */
1010static bool
1011chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
1012			   enum chain_purpose purpose)
1013{
1014  if (!link)
1015    return false;
1016
1017  for (; link; link = link->next)
1018    {
1019      if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
1020	continue;
1021
1022      if (DF_REF_IS_ARTIFICIAL (link->ref))
1023	return false;
1024
1025      rtx reached_insn = DF_REF_INSN (link->ref);
1026      unsigned uid = INSN_UID (reached_insn);
1027      struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);
1028
1029      if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
1030	  || insn_entry[uid].is_store)
1031	return false;
1032
1033      if (purpose == FOR_LOADS)
1034	{
1035	  df_ref use;
1036	  FOR_EACH_INSN_INFO_USE (use, insn_info)
1037	    {
1038	      struct df_link *swap_link = DF_REF_CHAIN (use);
1039
1040	      while (swap_link)
1041		{
1042		  if (DF_REF_IS_ARTIFICIAL (link->ref))
1043		    return false;
1044
1045		  rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
1046		  unsigned uid2 = INSN_UID (swap_def_insn);
1047
1048		  /* Only permuting loads are allowed.  */
1049		  if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
1050		    return false;
1051
1052		  swap_link = swap_link->next;
1053		}
1054	    }
1055	}
1056      else if (purpose == FOR_STORES)
1057	{
1058	  df_ref def;
1059	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
1060	    {
1061	      struct df_link *swap_link = DF_REF_CHAIN (def);
1062
1063	      while (swap_link)
1064		{
1065		  if (DF_REF_IS_ARTIFICIAL (link->ref))
1066		    return false;
1067
1068		  rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
1069		  unsigned uid2 = INSN_UID (swap_use_insn);
1070
1071		  /* Permuting stores or register swaps are allowed.  */
1072		  if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
1073		    return false;
1074
1075		  swap_link = swap_link->next;
1076		}
1077	    }
1078	}
1079    }
1080
1081  return true;
1082}
1083
1084/* Mark the xxswapdi instructions associated with permuting loads and
1085   stores for removal.  Note that we only flag them for deletion here,
1086   as there is a possibility of a swap being reached from multiple
1087   loads, etc.  */
1088static void
1089mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
1090{
1091  rtx insn = insn_entry[i].insn;
1092  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
1093
1094  if (insn_entry[i].is_load)
1095    {
1096      df_ref def;
1097      FOR_EACH_INSN_INFO_DEF (def, insn_info)
1098	{
1099	  struct df_link *link = DF_REF_CHAIN (def);
1100
1101	  /* We know by now that these are swaps, so we can delete
1102	     them confidently.  */
1103	  while (link)
1104	    {
1105	      rtx use_insn = DF_REF_INSN (link->ref);
1106	      insn_entry[INSN_UID (use_insn)].will_delete = 1;
1107	      link = link->next;
1108	    }
1109	}
1110    }
1111  else if (insn_entry[i].is_store)
1112    {
1113      df_ref use;
1114      FOR_EACH_INSN_INFO_USE (use, insn_info)
1115	{
1116	  /* Ignore uses for addressability.  */
1117	  machine_mode mode = GET_MODE (DF_REF_REG (use));
1118	  if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
1119	    continue;
1120
1121	  struct df_link *link = DF_REF_CHAIN (use);
1122
1123	  /* We know by now that these are swaps, so we can delete
1124	     them confidently.  */
1125	  while (link)
1126	    {
1127	      rtx def_insn = DF_REF_INSN (link->ref);
1128	      insn_entry[INSN_UID (def_insn)].will_delete = 1;
1129	      link = link->next;
1130	    }
1131	}
1132    }
1133}
1134
1135/* *OP_PTR is either a CONST_VECTOR or an expression containing one.
1136   Swap the first half of the vector with the second in the first
1137   case.  Recurse to find it in the second.  */
1138static void
1139swap_const_vector_halves (rtx *op_ptr)
1140{
1141  int i;
1142  rtx op = *op_ptr;
1143  enum rtx_code code = GET_CODE (op);
1144  if (GET_CODE (op) == CONST_VECTOR)
1145    {
1146      int units = GET_MODE_NUNITS (GET_MODE (op));
1147      rtx_vector_builder builder (GET_MODE (op), units, 1);
1148      for (i = 0; i < units / 2; ++i)
1149	builder.quick_push (CONST_VECTOR_ELT (op, i + units / 2));
1150      for (i = 0; i < units / 2; ++i)
1151	builder.quick_push (CONST_VECTOR_ELT (op, i));
1152      *op_ptr = builder.build ();
1153    }
1154  else
1155    {
1156      int j;
1157      const char *fmt = GET_RTX_FORMAT (code);
1158      for (i = 0; i < GET_RTX_LENGTH (code); ++i)
1159	if (fmt[i] == 'e' || fmt[i] == 'u')
1160	  swap_const_vector_halves (&XEXP (op, i));
1161	else if (fmt[i] == 'E')
1162	  for (j = 0; j < XVECLEN (op, i); ++j)
1163	    swap_const_vector_halves (&XVECEXP (op, i, j));
1164    }
1165}
1166
1167/* Find all subregs of a vector expression that perform a narrowing,
1168   and adjust the subreg index to account for doubleword swapping.  */
1169static void
1170adjust_subreg_index (rtx op)
1171{
1172  enum rtx_code code = GET_CODE (op);
1173  if (code == SUBREG
1174      && (GET_MODE_SIZE (GET_MODE (op))
1175	  < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
1176    {
1177      unsigned int index = SUBREG_BYTE (op);
1178      if (index < 8)
1179	index += 8;
1180      else
1181	index -= 8;
1182      SUBREG_BYTE (op) = index;
1183    }
1184
1185  const char *fmt = GET_RTX_FORMAT (code);
1186  int i,j;
1187  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
1188    if (fmt[i] == 'e' || fmt[i] == 'u')
1189      adjust_subreg_index (XEXP (op, i));
1190    else if (fmt[i] == 'E')
1191      for (j = 0; j < XVECLEN (op, i); ++j)
1192	adjust_subreg_index (XVECEXP (op, i, j));
1193}
1194
1195/* Convert the non-permuting load INSN to a permuting one.  */
1196static void
1197permute_load (rtx_insn *insn)
1198{
1199  rtx body = PATTERN (insn);
1200  rtx mem_op = SET_SRC (body);
1201  rtx tgt_reg = SET_DEST (body);
1202  machine_mode mode = GET_MODE (tgt_reg);
1203  int n_elts = GET_MODE_NUNITS (mode);
1204  int half_elts = n_elts / 2;
1205  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
1206  int i, j;
1207  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
1208    XVECEXP (par, 0, i) = GEN_INT (j);
1209  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
1210    XVECEXP (par, 0, i) = GEN_INT (j);
1211  rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
1212  SET_SRC (body) = sel;
1213  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1214  df_insn_rescan (insn);
1215
1216  if (dump_file)
1217    fprintf (dump_file, "Replacing load %d with permuted load\n",
1218	     INSN_UID (insn));
1219}
1220
1221/* Convert the non-permuting store INSN to a permuting one.  */
1222static void
1223permute_store (rtx_insn *insn)
1224{
1225  rtx body = PATTERN (insn);
1226  rtx src_reg = SET_SRC (body);
1227  machine_mode mode = GET_MODE (src_reg);
1228  int n_elts = GET_MODE_NUNITS (mode);
1229  int half_elts = n_elts / 2;
1230  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
1231  int i, j;
1232  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
1233    XVECEXP (par, 0, i) = GEN_INT (j);
1234  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
1235    XVECEXP (par, 0, i) = GEN_INT (j);
1236  rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
1237  SET_SRC (body) = sel;
1238  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1239  df_insn_rescan (insn);
1240
1241  if (dump_file)
1242    fprintf (dump_file, "Replacing store %d with permuted store\n",
1243	     INSN_UID (insn));
1244}
1245
1246/* Given OP that contains a vector extract operation, adjust the index
1247   of the extracted lane to account for the doubleword swap.  */
1248static void
1249adjust_extract (rtx_insn *insn)
1250{
1251  rtx pattern = PATTERN (insn);
1252  if (GET_CODE (pattern) == PARALLEL)
1253    pattern = XVECEXP (pattern, 0, 0);
1254  rtx src = SET_SRC (pattern);
1255  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
1256     account for that.  */
1257  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
1258  rtx par = XEXP (sel, 1);
1259  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
1260  int lane = INTVAL (XVECEXP (par, 0, 0));
1261  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
1262  XVECEXP (par, 0, 0) = GEN_INT (lane);
1263  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1264  df_insn_rescan (insn);
1265
1266  if (dump_file)
1267    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
1268}
1269
1270/* Given OP that contains a vector direct-splat operation, adjust the index
1271   of the source lane to account for the doubleword swap.  */
1272static void
1273adjust_splat (rtx_insn *insn)
1274{
1275  rtx body = PATTERN (insn);
1276  rtx unspec = XEXP (body, 1);
1277  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
1278  int lane = INTVAL (XVECEXP (unspec, 0, 1));
1279  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
1280  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
1281  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1282  df_insn_rescan (insn);
1283
1284  if (dump_file)
1285    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
1286}
1287
1288/* Given OP that contains an XXPERMDI operation (that is not a doubleword
1289   swap), reverse the order of the source operands and adjust the indices
1290   of the source lanes to account for doubleword reversal.  */
1291static void
1292adjust_xxpermdi (rtx_insn *insn)
1293{
1294  rtx set = PATTERN (insn);
1295  rtx select = XEXP (set, 1);
1296  rtx concat = XEXP (select, 0);
1297  rtx src0 = XEXP (concat, 0);
1298  XEXP (concat, 0) = XEXP (concat, 1);
1299  XEXP (concat, 1) = src0;
1300  rtx parallel = XEXP (select, 1);
1301  int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
1302  int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
1303  int new_lane0 = 3 - lane1;
1304  int new_lane1 = 3 - lane0;
1305  XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
1306  XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
1307  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1308  df_insn_rescan (insn);
1309
1310  if (dump_file)
1311    fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
1312}
1313
1314/* Given OP that contains a VEC_CONCAT operation of two doublewords,
1315   reverse the order of those inputs.  */
1316static void
1317adjust_concat (rtx_insn *insn)
1318{
1319  rtx set = PATTERN (insn);
1320  rtx concat = XEXP (set, 1);
1321  rtx src0 = XEXP (concat, 0);
1322  XEXP (concat, 0) = XEXP (concat, 1);
1323  XEXP (concat, 1) = src0;
1324  INSN_CODE (insn) = -1; /* Force re-recognition.  */
1325  df_insn_rescan (insn);
1326
1327  if (dump_file)
1328    fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
1329}
1330
1331/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
1332   constant pool to reflect swapped doublewords.  */
1333static void
1334adjust_vperm (rtx_insn *insn)
1335{
1336  /* We previously determined that the UNSPEC_VPERM was fed by a
1337     swap of a swapping load of a TOC-relative constant pool symbol.
1338     Find the MEM in the swapping load and replace it with a MEM for
1339     the adjusted mask constant.  */
1340  rtx set = PATTERN (insn);
1341  rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);
1342
1343  /* Find the swap.  */
1344  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
1345  df_ref use;
1346  rtx_insn *swap_insn = 0;
1347  FOR_EACH_INSN_INFO_USE (use, insn_info)
1348    if (rtx_equal_p (DF_REF_REG (use), mask_reg))
1349      {
1350	struct df_link *def_link = DF_REF_CHAIN (use);
1351	gcc_assert (def_link && !def_link->next);
1352	swap_insn = DF_REF_INSN (def_link->ref);
1353	break;
1354      }
1355  gcc_assert (swap_insn);
1356
1357  /* Find the load.  */
1358  insn_info = DF_INSN_INFO_GET (swap_insn);
1359  rtx_insn *load_insn = 0;
1360  FOR_EACH_INSN_INFO_USE (use, insn_info)
1361    {
1362      struct df_link *def_link = DF_REF_CHAIN (use);
1363      gcc_assert (def_link && !def_link->next);
1364      load_insn = DF_REF_INSN (def_link->ref);
1365      break;
1366    }
1367  gcc_assert (load_insn);
1368
1369  /* Find the TOC-relative symbol access.  */
1370  insn_info = DF_INSN_INFO_GET (load_insn);
1371  rtx_insn *tocrel_insn = 0;
1372  FOR_EACH_INSN_INFO_USE (use, insn_info)
1373    {
1374      struct df_link *def_link = DF_REF_CHAIN (use);
1375      gcc_assert (def_link && !def_link->next);
1376      tocrel_insn = DF_REF_INSN (def_link->ref);
1377      break;
1378    }
1379  gcc_assert (tocrel_insn);
1380
1381  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
1382     to set tocrel_base; otherwise it would be unnecessary as we've
1383     already established it will return true.  */
1384  rtx base, offset;
1385  const_rtx tocrel_base;
1386  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
1387  /* There is an extra level of indirection for small/large code models.  */
1388  if (MEM_P (tocrel_expr))
1389    tocrel_expr = XEXP (tocrel_expr, 0);
1390  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
1391    gcc_unreachable ();
1392  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
1393  rtx const_vector = get_pool_constant (base);
1394  /* With the extra indirection, get_pool_constant will produce the
1395     real constant from the reg_equal expression, so get the real
1396     constant.  */
1397  if (SYMBOL_REF_P (const_vector))
1398    const_vector = get_pool_constant (const_vector);
1399  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
1400
1401  /* Create an adjusted mask from the initial mask.  */
1402  unsigned int new_mask[16], i, val;
1403  for (i = 0; i < 16; ++i) {
1404    val = INTVAL (XVECEXP (const_vector, 0, i));
1405    if (val < 16)
1406      new_mask[i] = (val + 8) % 16;
1407    else
1408      new_mask[i] = ((val + 8) % 16) + 16;
1409  }
1410
1411  /* Create a new CONST_VECTOR and a MEM that references it.  */
1412  rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
1413  for (i = 0; i < 16; ++i)
1414    XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
1415  rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
1416  rtx new_mem = force_const_mem (V16QImode, new_const_vector);
1417  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
1418     can't recognize.  Force the SYMBOL_REF into a register.  */
1419  if (!REG_P (XEXP (new_mem, 0))) {
1420    rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
1421    XEXP (new_mem, 0) = base_reg;
1422    /* Move the newly created insn ahead of the load insn.  */
1423    rtx_insn *force_insn = get_last_insn ();
1424    remove_insn (force_insn);
1425    rtx_insn *before_load_insn = PREV_INSN (load_insn);
1426    add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn));
1427    df_insn_rescan (before_load_insn);
1428    df_insn_rescan (force_insn);
1429  }
1430
1431  /* Replace the MEM in the load instruction and rescan it.  */
1432  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
1433  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
1434  df_insn_rescan (load_insn);
1435
1436  if (dump_file)
1437    fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
1438}
1439
1440/* The insn described by INSN_ENTRY[I] can be swapped, but only
1441   with special handling.  Take care of that here.  */
1442static void
1443handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
1444{
1445  rtx_insn *insn = insn_entry[i].insn;
1446  rtx body = PATTERN (insn);
1447
1448  switch (insn_entry[i].special_handling)
1449    {
1450    default:
1451      gcc_unreachable ();
1452    case SH_CONST_VECTOR:
1453      {
1454	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
1455	gcc_assert (GET_CODE (body) == SET);
1456	swap_const_vector_halves (&SET_SRC (body));
1457	if (dump_file)
1458	  fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
1459	break;
1460      }
1461    case SH_SUBREG:
1462      /* A subreg of the same size is already safe.  For subregs that
1463	 select a smaller portion of a reg, adjust the index for
1464	 swapped doublewords.  */
1465      adjust_subreg_index (body);
1466      if (dump_file)
1467	fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
1468      break;
1469    case SH_NOSWAP_LD:
1470      /* Convert a non-permuting load to a permuting one.  */
1471      permute_load (insn);
1472      break;
1473    case SH_NOSWAP_ST:
1474      /* Convert a non-permuting store to a permuting one.  */
1475      permute_store (insn);
1476      break;
1477    case SH_EXTRACT:
1478      /* Change the lane on an extract operation.  */
1479      adjust_extract (insn);
1480      break;
1481    case SH_SPLAT:
1482      /* Change the lane on a direct-splat operation.  */
1483      adjust_splat (insn);
1484      break;
1485    case SH_XXPERMDI:
1486      /* Change the lanes on an XXPERMDI operation.  */
1487      adjust_xxpermdi (insn);
1488      break;
1489    case SH_CONCAT:
1490      /* Reverse the order of a concatenation operation.  */
1491      adjust_concat (insn);
1492      break;
1493    case SH_VPERM:
1494      /* Change the mask loaded from the constant pool for a VPERM.  */
1495      adjust_vperm (insn);
1496      break;
1497    }
1498}
1499
1500/* Find the insn from the Ith table entry, which is known to be a
1501   register swap Y = SWAP(X).  Replace it with a copy Y = X.  */
1502static void
1503replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
1504{
1505  rtx_insn *insn = insn_entry[i].insn;
1506  rtx body = PATTERN (insn);
1507  rtx src_reg = XEXP (SET_SRC (body), 0);
1508  rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
1509  rtx_insn *new_insn = emit_insn_before (copy, insn);
1510  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
1511  df_insn_rescan (new_insn);
1512
1513  if (dump_file)
1514    {
1515      unsigned int new_uid = INSN_UID (new_insn);
1516      fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
1517    }
1518
1519  df_insn_delete (insn);
1520  remove_insn (insn);
1521  insn->set_deleted ();
1522}
1523
1524/* INSN is known to contain a SUBREG, which we can normally handle,
1525   but if the SUBREG itself contains a MULT then we need to leave it alone
1526   to avoid turning a mult_hipart into a mult_lopart, for example.  */
1527static bool
1528has_part_mult (rtx_insn *insn)
1529{
1530  rtx body = PATTERN (insn);
1531  if (GET_CODE (body) != SET)
1532    return false;
1533  rtx src = SET_SRC (body);
1534  if (GET_CODE (src) != SUBREG)
1535    return false;
1536  rtx inner = XEXP (src, 0);
1537  return (GET_CODE (inner) == MULT);
1538}
1539
1540/* Make NEW_MEM_EXP's attributes and flags resemble those of
1541   ORIGINAL_MEM_EXP.  */
1542static void
1543mimic_memory_attributes_and_flags (rtx new_mem_exp, const_rtx original_mem_exp)
1544{
1545  RTX_FLAG (new_mem_exp, jump) = RTX_FLAG (original_mem_exp, jump);
1546  RTX_FLAG (new_mem_exp, call) = RTX_FLAG (original_mem_exp, call);
1547  RTX_FLAG (new_mem_exp, unchanging) = RTX_FLAG (original_mem_exp, unchanging);
1548  RTX_FLAG (new_mem_exp, volatil) = RTX_FLAG (original_mem_exp, volatil);
1549  RTX_FLAG (new_mem_exp, frame_related) =
1550    RTX_FLAG (original_mem_exp, frame_related);
1551
1552  /* The following fields may not be used with MEM subexpressions */
1553  RTX_FLAG (new_mem_exp, in_struct) = RTX_FLAG (original_mem_exp, in_struct);
1554  RTX_FLAG (new_mem_exp, return_val) = RTX_FLAG (original_mem_exp, return_val);
1555
1556  struct mem_attrs original_attrs = *get_mem_attrs(original_mem_exp);
1557
1558  alias_set_type set = original_attrs.alias;
1559  set_mem_alias_set (new_mem_exp, set);
1560
1561  addr_space_t addrspace = original_attrs.addrspace;
1562  set_mem_addr_space (new_mem_exp, addrspace);
1563
1564  unsigned int align = original_attrs.align;
1565  set_mem_align (new_mem_exp, align);
1566
1567  tree expr = original_attrs.expr;
1568  set_mem_expr (new_mem_exp, expr);
1569
1570  if (original_attrs.offset_known_p)
1571    {
1572      HOST_WIDE_INT offset = original_attrs.offset;
1573      set_mem_offset (new_mem_exp, offset);
1574    }
1575  else
1576    clear_mem_offset (new_mem_exp);
1577
1578  if (original_attrs.size_known_p)
1579    {
1580      HOST_WIDE_INT size = original_attrs.size;
1581      set_mem_size (new_mem_exp, size);
1582    }
1583  else
1584    clear_mem_size (new_mem_exp);
1585}
1586
1587/* Generate an rtx expression to represent use of the stvx insn to store
1588   the value represented by register SRC_EXP into the memory at address
1589   DEST_EXP, with vector mode MODE.  */
1590rtx
1591rs6000_gen_stvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
1592{
1593  rtx stvx;
1594
1595  if (mode == V16QImode)
1596    stvx = gen_altivec_stvx_v16qi (src_exp, dest_exp);
1597  else if (mode == V8HImode)
1598    stvx = gen_altivec_stvx_v8hi (src_exp, dest_exp);
1599#ifdef HAVE_V8HFmode
1600  else if (mode == V8HFmode)
1601    stvx = gen_altivec_stvx_v8hf (src_exp, dest_exp);
1602#endif
1603  else if (mode == V4SImode)
1604    stvx = gen_altivec_stvx_v4si (src_exp, dest_exp);
1605  else if (mode == V4SFmode)
1606    stvx = gen_altivec_stvx_v4sf (src_exp, dest_exp);
1607  else if (mode == V2DImode)
1608    stvx = gen_altivec_stvx_v2di (src_exp, dest_exp);
1609  else if (mode == V2DFmode)
1610    stvx = gen_altivec_stvx_v2df (src_exp, dest_exp);
1611  else if (mode == V1TImode)
1612    stvx = gen_altivec_stvx_v1ti (src_exp, dest_exp);
1613  else
1614    /* KFmode, TFmode, other modes not expected in this context.  */
1615    gcc_unreachable ();
1616
1617  rtx new_mem_exp = SET_DEST (PATTERN (stvx));
1618  mimic_memory_attributes_and_flags (new_mem_exp, dest_exp);
1619  return stvx;
1620}
1621
1622/* Given that STORE_INSN represents an aligned store-with-swap of a
1623   swapped value, replace the store with an aligned store (without
1624   swap) and replace the swap with a copy insn.  */
1625static void
1626replace_swapped_aligned_store (swap_web_entry *insn_entry,
1627			       rtx_insn *store_insn)
1628{
1629  unsigned uid = INSN_UID (store_insn);
1630  gcc_assert (insn_entry[uid].is_swap && insn_entry[uid].is_store);
1631
1632  rtx body = PATTERN (store_insn);
1633  rtx dest_address = XEXP (SET_DEST (body), 0);
1634  rtx swap_reg = XEXP (SET_SRC (body), 0);
1635  gcc_assert (REG_P (dest_address)
1636	      || rs6000_sum_of_two_registers_p (dest_address));
1637
1638  /* Find the swap instruction that provides the value to be stored by
1639   * this store-with-swap instruction. */
1640  struct df_insn_info *insn_info = DF_INSN_INFO_GET (store_insn);
1641  df_ref use;
1642  rtx_insn *swap_insn = NULL;
1643  unsigned uid2 = 0;
1644  FOR_EACH_INSN_INFO_USE (use, insn_info)
1645    {
1646      struct df_link *def_link = DF_REF_CHAIN (use);
1647
1648      /* if this is not the definition of the candidate swap register,
1649	 then skip it.  I am only interested in the swap insnd.  */
1650      if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
1651	continue;
1652
1653      /* If there is no def or the def is artifical or there are
1654	 multiple defs, we should not be here.  */
1655      gcc_assert (def_link && def_link->ref && !def_link->next
1656		  && !DF_REF_IS_ARTIFICIAL (def_link->ref));
1657
1658      swap_insn = DF_REF_INSN (def_link->ref);
1659      uid2 = INSN_UID (swap_insn);
1660
1661      /* If this source value is not a simple swap, we should not be here.  */
1662      gcc_assert (insn_entry[uid2].is_swap && !insn_entry[uid2].is_load
1663		  && !insn_entry[uid2].is_store);
1664
1665      /* We've processed the use we care about, so break out of
1666	 this loop.  */
1667      break;
1668    }
1669
1670  /* At this point, swap_insn and uid2 represent the swap instruction
1671     that feeds the store.  */
1672  gcc_assert (swap_insn);
1673  rtx set = single_set (store_insn);
1674  gcc_assert (set);
1675  rtx dest_exp = SET_DEST (set);
1676  rtx src_exp = XEXP (SET_SRC (body), 0);
1677  enum machine_mode mode = GET_MODE (dest_exp);
1678  gcc_assert (MEM_P (dest_exp));
1679  gcc_assert (MEM_ALIGN (dest_exp) >= 128);
1680
1681  /* Replace the copy with a new insn.  */
1682  rtx stvx;
1683  stvx = rs6000_gen_stvx (mode, dest_exp, src_exp);
1684
1685  rtx_insn *new_insn = emit_insn_before (stvx, store_insn);
1686  rtx new_body = PATTERN (new_insn);
1687
1688  gcc_assert ((GET_CODE (new_body) == SET)
1689	      && MEM_P (SET_DEST (new_body)));
1690
1691  basic_block bb = BLOCK_FOR_INSN (store_insn);
1692  set_block_for_insn (new_insn, bb);
1693  /* Handle REG_EH_REGION note.  */
1694  if (cfun->can_throw_non_call_exceptions && BB_END (bb) == store_insn)
1695    {
1696      rtx note = find_reg_note (store_insn, REG_EH_REGION, NULL_RTX);
1697      if (note)
1698	add_reg_note (new_insn, REG_EH_REGION, XEXP (note, 0));
1699    }
1700  df_insn_rescan (new_insn);
1701
1702  df_insn_delete (store_insn);
1703  remove_insn (store_insn);
1704  store_insn->set_deleted ();
1705
1706  /* Replace the swap with a copy.  */
1707  uid2 = INSN_UID (swap_insn);
1708  mark_swaps_for_removal (insn_entry, uid2);
1709  replace_swap_with_copy (insn_entry, uid2);
1710}
1711
1712/* Generate an rtx expression to represent use of the lvx insn to load
1713   from memory SRC_EXP into register DEST_EXP with vector mode MODE. */
1714rtx
1715rs6000_gen_lvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
1716{
1717  rtx lvx;
1718
1719  if (mode == V16QImode)
1720    lvx = gen_altivec_lvx_v16qi (dest_exp, src_exp);
1721  else if (mode == V8HImode)
1722    lvx = gen_altivec_lvx_v8hi (dest_exp, src_exp);
1723#ifdef HAVE_V8HFmode
1724  else if (mode == V8HFmode)
1725    lvx = gen_altivec_lvx_v8hf (dest_exp, src_exp);
1726#endif
1727  else if (mode == V4SImode)
1728    lvx = gen_altivec_lvx_v4si (dest_exp, src_exp);
1729  else if (mode == V4SFmode)
1730    lvx = gen_altivec_lvx_v4sf (dest_exp, src_exp);
1731  else if (mode == V2DImode)
1732    lvx = gen_altivec_lvx_v2di (dest_exp, src_exp);
1733  else if (mode == V2DFmode)
1734    lvx = gen_altivec_lvx_v2df (dest_exp, src_exp);
1735  else if (mode == V1TImode)
1736    lvx = gen_altivec_lvx_v1ti (dest_exp, src_exp);
1737  else
1738    /* KFmode, TFmode, other modes not expected in this context.  */
1739    gcc_unreachable ();
1740
1741  rtx new_mem_exp = SET_SRC (PATTERN (lvx));
1742  mimic_memory_attributes_and_flags (new_mem_exp, src_exp);
1743
1744  return lvx;
1745}
1746
1747/* Given that SWAP_INSN represents a swap of an aligned
1748   load-with-swap, replace the load with an aligned load (without
1749   swap) and replace the swap with a copy insn.  */
1750static void
1751replace_swapped_aligned_load (swap_web_entry *insn_entry, rtx swap_insn)
1752{
1753  /* Find the load.  */
1754  unsigned uid = INSN_UID (swap_insn);
1755  /* Only call this if quad_aligned_load_p (swap_insn).  */
1756  gcc_assert (insn_entry[uid].is_swap && !insn_entry[uid].is_load);
1757  struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
1758
1759  /* Since insn is known to represent a swap instruction, we know it
1760     "uses" only one input variable.  */
1761  df_ref use = DF_INSN_INFO_USES (insn_info);
1762
1763  /* Figure out where this input variable is defined.  */
1764  struct df_link *def_link = DF_REF_CHAIN (use);
1765  gcc_assert (def_link && !def_link->next);
1766  gcc_assert (def_link && def_link->ref &&
1767	      !DF_REF_IS_ARTIFICIAL (def_link->ref) && !def_link->next);
1768
1769  rtx_insn *def_insn = DF_REF_INSN (def_link->ref);
1770  unsigned uid2 = INSN_UID (def_insn);
1771
1772  /* We're expecting a load-with-swap insn.  */
1773  gcc_assert (insn_entry[uid2].is_load && insn_entry[uid2].is_swap);
1774
1775  /* We expect this to be a set to memory, with source representing a
1776     swap (indicated by code VEC_SELECT).  */
1777  rtx body = PATTERN (def_insn);
1778  gcc_assert ((GET_CODE (body) == SET)
1779	      && (GET_CODE (SET_SRC (body)) == VEC_SELECT
1780		  || pattern_is_rotate64 (body))
1781	      && MEM_P (XEXP (SET_SRC (body), 0)));
1782
1783  rtx src_exp = XEXP (SET_SRC (body), 0);
1784  enum machine_mode mode = GET_MODE (src_exp);
1785  rtx lvx = rs6000_gen_lvx (mode, SET_DEST (body), src_exp);
1786
1787  rtx_insn *new_insn = emit_insn_before (lvx, def_insn);
1788  rtx new_body = PATTERN (new_insn);
1789
1790  gcc_assert ((GET_CODE (new_body) == SET)
1791	      && MEM_P (SET_SRC (new_body)));
1792
1793  basic_block bb = BLOCK_FOR_INSN (def_insn);
1794  set_block_for_insn (new_insn, bb);
1795  /* Handle REG_EH_REGION note.  */
1796  if (cfun->can_throw_non_call_exceptions && BB_END (bb) == def_insn)
1797    {
1798      rtx note = find_reg_note (def_insn, REG_EH_REGION, NULL_RTX);
1799      if (note)
1800	add_reg_note (new_insn, REG_EH_REGION, XEXP (note, 0));
1801    }
1802  df_insn_rescan (new_insn);
1803
1804  df_insn_delete (def_insn);
1805  remove_insn (def_insn);
1806  def_insn->set_deleted ();
1807
1808  /* Replace the swap with a copy.  */
1809  mark_swaps_for_removal (insn_entry, uid);
1810  replace_swap_with_copy (insn_entry, uid);
1811}
1812
1813/* Given that SWAP_INSN represents a swap of a load of a constant
1814   vector value, replace with a single instruction that loads a
1815   swapped variant of the original constant.
1816
1817   The "natural" representation of a byte array in memory is the same
1818   for big endian and little endian.
1819
1820   unsigned char byte_array[] =
1821     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f };
1822
1823   However, when loaded into a vector register, the representation
1824   depends on endian conventions.
1825
1826   In big-endian mode, the register holds:
1827
1828     MSB                                            LSB
1829     [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
1830
1831   In little-endian mode, the register holds:
1832
1833     MSB                                            LSB
1834     [ f, e, d, c, b, a, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]
1835
1836   Word arrays require different handling.  Consider the word array:
1837
1838   unsigned int word_array[] =
1839     { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
1840
1841   The in-memory representation depends on endian configuration.  The
1842   equivalent array, declared as a byte array, in memory would be:
1843
1844   unsigned char big_endian_word_array_data[] =
1845     { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f }
1846
1847   unsigned char little_endian_word_array_data[] =
1848     { 3, 2, 1, 0, 7, 6, 5, 4, b, a, 9, 8, f, e, d, c }
1849
1850   In big-endian mode, the register holds:
1851
1852     MSB                                            LSB
1853     [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
1854
1855   In little-endian mode, the register holds:
1856
1857     MSB                                            LSB
1858     [ c, d, e, f, 8, 9, a, b, 4, 5, 6, 7, 0, 1, 2, 3 ]
1859
1860
1861  Similar transformations apply to the vector of half-word and vector
1862  of double-word representations.
1863
1864  For now, don't handle vectors of quad-precision values.  Just return.
1865  A better solution is to fix the code generator to emit lvx/stvx for
1866  those.  */
1867static void
1868replace_swapped_load_constant (swap_web_entry *insn_entry, rtx swap_insn)
1869{
1870  /* Find the load.  */
1871  struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
1872  rtx_insn *load_insn;
1873  df_ref use  = DF_INSN_INFO_USES (insn_info);
1874  struct df_link *def_link = DF_REF_CHAIN (use);
1875  gcc_assert (def_link && !def_link->next);
1876
1877  load_insn = DF_REF_INSN (def_link->ref);
1878  gcc_assert (load_insn);
1879
1880  /* Find the TOC-relative symbol access.  */
1881  insn_info = DF_INSN_INFO_GET (load_insn);
1882  use = DF_INSN_INFO_USES (insn_info);
1883
1884  def_link = DF_REF_CHAIN (use);
1885  gcc_assert (def_link && !def_link->next);
1886
1887  rtx_insn *tocrel_insn = DF_REF_INSN (def_link->ref);
1888  gcc_assert (tocrel_insn);
1889
1890  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
1891     to set tocrel_base; otherwise it would be unnecessary as we've
1892     already established it will return true.  */
1893  rtx base, offset;
1894  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
1895  const_rtx tocrel_base;
1896
1897  /* There is an extra level of indirection for small/large code models.  */
1898  if (MEM_P (tocrel_expr))
1899    tocrel_expr = XEXP (tocrel_expr, 0);
1900
1901  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
1902    gcc_unreachable ();
1903
1904  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
1905  rtx const_vector = get_pool_constant (base);
1906
1907  /* With the extra indirection, get_pool_constant will produce the
1908     real constant from the reg_equal expression, so get the real
1909     constant.  */
1910  if (SYMBOL_REF_P (const_vector))
1911    const_vector = get_pool_constant (const_vector);
1912  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
1913
1914  rtx new_mem;
1915  enum machine_mode mode = GET_MODE (const_vector);
1916
1917  /* Create an adjusted constant from the original constant.  */
1918  if (mode == V1TImode)
1919    /* Leave this code as is.  */
1920    return;
1921  else if (mode == V16QImode)
1922    {
1923      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (16));
1924      int i;
1925
1926      for (i = 0; i < 16; i++)
1927	XVECEXP (vals, 0, ((i+8) % 16)) = XVECEXP (const_vector, 0, i);
1928      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1929      new_mem = force_const_mem (mode, new_const_vector);
1930    }
1931  else if ((mode == V8HImode)
1932#ifdef HAVE_V8HFmode
1933	   || (mode == V8HFmode)
1934#endif
1935	   )
1936    {
1937      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (8));
1938      int i;
1939
1940      for (i = 0; i < 8; i++)
1941	XVECEXP (vals, 0, ((i+4) % 8)) = XVECEXP (const_vector, 0, i);
1942      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1943      new_mem = force_const_mem (mode, new_const_vector);
1944    }
1945  else if ((mode == V4SImode) || (mode == V4SFmode))
1946    {
1947      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (4));
1948      int i;
1949
1950      for (i = 0; i < 4; i++)
1951	XVECEXP (vals, 0, ((i+2) % 4)) = XVECEXP (const_vector, 0, i);
1952      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1953      new_mem = force_const_mem (mode, new_const_vector);
1954    }
1955  else if ((mode == V2DImode) || (mode == V2DFmode))
1956    {
1957      rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (2));
1958      int i;
1959
1960      for (i = 0; i < 2; i++)
1961	XVECEXP (vals, 0, ((i+1) % 2)) = XVECEXP (const_vector, 0, i);
1962      rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1963      new_mem = force_const_mem (mode, new_const_vector);
1964    }
1965  else
1966    {
1967      /* We do not expect other modes to be constant-load-swapped.  */
1968      gcc_unreachable ();
1969    }
1970
1971  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
1972     can't recognize.  Force the SYMBOL_REF into a register.  */
1973  if (!REG_P (XEXP (new_mem, 0))) {
1974    rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
1975    XEXP (new_mem, 0) = base_reg;
1976
1977    /* Move the newly created insn ahead of the load insn.  */
1978    /* The last insn is the insn that forced new_mem into a register.  */
1979    rtx_insn *force_insn = get_last_insn ();
1980    /* Remove this insn from the end of the instruction sequence.  */
1981    remove_insn (force_insn);
1982    rtx_insn *before_load_insn = PREV_INSN (load_insn);
1983
1984    /* And insert this insn back into the sequence before the previous
1985       load insn so this new expression will be available when the
1986       existing load is modified to load the swapped constant.  */
1987    add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn));
1988    df_insn_rescan (before_load_insn);
1989    df_insn_rescan (force_insn);
1990  }
1991
1992  /* Replace the MEM in the load instruction and rescan it.  */
1993  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
1994  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
1995  df_insn_rescan (load_insn);
1996
1997  unsigned int uid = INSN_UID (swap_insn);
1998  mark_swaps_for_removal (insn_entry, uid);
1999  replace_swap_with_copy (insn_entry, uid);
2000}
2001
2002/* Dump the swap table to DUMP_FILE.  */
2003static void
2004dump_swap_insn_table (swap_web_entry *insn_entry)
2005{
2006  int e = get_max_uid ();
2007  fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
2008
2009  for (int i = 0; i < e; ++i)
2010    if (insn_entry[i].is_relevant)
2011      {
2012	swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
2013	fprintf (dump_file, "%6d %6d  ", i,
2014		 pred_entry && pred_entry->insn
2015		 ? INSN_UID (pred_entry->insn) : 0);
2016	if (insn_entry[i].is_load)
2017	  fputs ("load ", dump_file);
2018	if (insn_entry[i].is_store)
2019	  fputs ("store ", dump_file);
2020	if (insn_entry[i].is_swap)
2021	  fputs ("swap ", dump_file);
2022	if (insn_entry[i].is_live_in)
2023	  fputs ("live-in ", dump_file);
2024	if (insn_entry[i].is_live_out)
2025	  fputs ("live-out ", dump_file);
2026	if (insn_entry[i].contains_subreg)
2027	  fputs ("subreg ", dump_file);
2028	if (insn_entry[i].is_128_int)
2029	  fputs ("int128 ", dump_file);
2030	if (insn_entry[i].is_call)
2031	  fputs ("call ", dump_file);
2032	if (insn_entry[i].is_swappable)
2033	  {
2034	    fputs ("swappable ", dump_file);
2035	    if (insn_entry[i].special_handling == SH_CONST_VECTOR)
2036	      fputs ("special:constvec ", dump_file);
2037	    else if (insn_entry[i].special_handling == SH_SUBREG)
2038	      fputs ("special:subreg ", dump_file);
2039	    else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
2040	      fputs ("special:load ", dump_file);
2041	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
2042	      fputs ("special:store ", dump_file);
2043	    else if (insn_entry[i].special_handling == SH_EXTRACT)
2044	      fputs ("special:extract ", dump_file);
2045	    else if (insn_entry[i].special_handling == SH_SPLAT)
2046	      fputs ("special:splat ", dump_file);
2047	    else if (insn_entry[i].special_handling == SH_XXPERMDI)
2048	      fputs ("special:xxpermdi ", dump_file);
2049	    else if (insn_entry[i].special_handling == SH_CONCAT)
2050	      fputs ("special:concat ", dump_file);
2051	    else if (insn_entry[i].special_handling == SH_VPERM)
2052	      fputs ("special:vperm ", dump_file);
2053	  }
2054	if (insn_entry[i].web_not_optimizable)
2055	  fputs ("unoptimizable ", dump_file);
2056	if (insn_entry[i].will_delete)
2057	  fputs ("delete ", dump_file);
2058	fputs ("\n", dump_file);
2059      }
2060  fputs ("\n", dump_file);
2061}
2062
2063/* Return RTX with its address canonicalized to (reg) or (+ reg reg).
2064   Here RTX is an (& addr (const_int -16)).  Always return a new copy
2065   to avoid problems with combine.  */
2066static rtx
2067alignment_with_canonical_addr (rtx align)
2068{
2069  rtx canon;
2070  rtx addr = XEXP (align, 0);
2071
2072  if (REG_P (addr))
2073    canon = addr;
2074
2075  else if (GET_CODE (addr) == PLUS)
2076    {
2077      rtx addrop0 = XEXP (addr, 0);
2078      rtx addrop1 = XEXP (addr, 1);
2079
2080      if (!REG_P (addrop0))
2081	addrop0 = force_reg (GET_MODE (addrop0), addrop0);
2082
2083      if (!REG_P (addrop1))
2084	addrop1 = force_reg (GET_MODE (addrop1), addrop1);
2085
2086      canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
2087    }
2088
2089  else
2090    canon = force_reg (GET_MODE (addr), addr);
2091
2092  return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
2093}
2094
2095/* Check whether an rtx is an alignment mask, and if so, return
2096   a fully-expanded rtx for the masking operation.  */
2097static rtx
2098alignment_mask (rtx_insn *insn)
2099{
2100  rtx body = PATTERN (insn);
2101
2102  if (GET_CODE (body) != SET
2103      || GET_CODE (SET_SRC (body)) != AND
2104      || !REG_P (XEXP (SET_SRC (body), 0)))
2105    return 0;
2106
2107  rtx mask = XEXP (SET_SRC (body), 1);
2108
2109  if (CONST_INT_P (mask))
2110    {
2111      if (INTVAL (mask) == -16)
2112	return alignment_with_canonical_addr (SET_SRC (body));
2113      else
2114	return 0;
2115    }
2116
2117  if (!REG_P (mask))
2118    return 0;
2119
2120  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2121  df_ref use;
2122  rtx real_mask = 0;
2123
2124  FOR_EACH_INSN_INFO_USE (use, insn_info)
2125    {
2126      if (!rtx_equal_p (DF_REF_REG (use), mask))
2127	continue;
2128
2129      struct df_link *def_link = DF_REF_CHAIN (use);
2130      if (!def_link || def_link->next)
2131	return 0;
2132
2133      rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
2134      rtx const_body = PATTERN (const_insn);
2135      if (GET_CODE (const_body) != SET)
2136	return 0;
2137
2138      real_mask = SET_SRC (const_body);
2139
2140      if (!CONST_INT_P (real_mask)
2141	  || INTVAL (real_mask) != -16)
2142	return 0;
2143    }
2144
2145  if (real_mask == 0)
2146    return 0;
2147
2148  return alignment_with_canonical_addr (SET_SRC (body));
2149}
2150
2151/* Given INSN that's a load or store based at BASE_REG, check if
2152   all of its feeding computations align its address on a 16-byte
2153   boundary.  If so, return true and add all definition insns into
2154   AND_INSNS and their corresponding fully-expanded rtxes for the
2155   masking operations into AND_OPS.  */
2156
2157static bool
2158find_alignment_op (rtx_insn *insn, rtx base_reg, vec<rtx_insn *> *and_insns,
2159		   vec<rtx> *and_ops)
2160{
2161  df_ref base_use;
2162  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2163  rtx and_operation = 0;
2164
2165  FOR_EACH_INSN_INFO_USE (base_use, insn_info)
2166    {
2167      if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
2168	continue;
2169
2170      struct df_link *base_def_link = DF_REF_CHAIN (base_use);
2171      if (!base_def_link)
2172	return false;
2173
2174      while (base_def_link)
2175	{
2176	  /* With stack-protector code enabled, and possibly in other
2177	     circumstances, there may not be an associated insn for
2178	     the def.  */
2179	  if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
2180	    return false;
2181
2182	  rtx_insn *and_insn = DF_REF_INSN (base_def_link->ref);
2183	  and_operation = alignment_mask (and_insn);
2184
2185	  /* Stop if we find any one which doesn't align.  */
2186	  if (!and_operation)
2187	    return false;
2188
2189	  and_insns->safe_push (and_insn);
2190	  and_ops->safe_push (and_operation);
2191	  base_def_link = base_def_link->next;
2192	}
2193    }
2194
2195  return and_operation;
2196}
2197
2198struct del_info { bool replace; rtx_insn *replace_insn; };
2199
2200/* If INSN is the load for an lvx pattern, put it in canonical form.  */
2201static void
2202recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
2203{
2204  rtx body = PATTERN (insn);
2205  gcc_assert (GET_CODE (body) == SET
2206	      && (GET_CODE (SET_SRC (body)) == VEC_SELECT
2207		  || pattern_is_rotate64 (body))
2208	      && MEM_P (XEXP (SET_SRC (body), 0)));
2209
2210  rtx mem = XEXP (SET_SRC (body), 0);
2211  rtx base_reg = XEXP (mem, 0);
2212
2213  auto_vec<rtx_insn *> and_insns;
2214  auto_vec<rtx> and_ops;
2215  bool is_any_def_and
2216    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
2217
2218  if (is_any_def_and)
2219    {
2220      gcc_assert (and_insns.length () == and_ops.length ());
2221      df_ref def;
2222      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2223      FOR_EACH_INSN_INFO_DEF (def, insn_info)
2224	{
2225	  struct df_link *link = DF_REF_CHAIN (def);
2226	  if (!link || link->next)
2227	    break;
2228
2229	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
2230	  if (!insn_is_swap_p (swap_insn)
2231	      || insn_is_load_p (swap_insn)
2232	      || insn_is_store_p (swap_insn))
2233	    break;
2234
2235	  /* Expected lvx pattern found.  Change the swap to
2236	     a copy, and propagate the AND operation into the
2237	     load.  */
2238	  to_delete[INSN_UID (swap_insn)].replace = true;
2239	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
2240
2241	  rtx new_reg = 0;
2242	  rtx and_mask = 0;
2243	  for (unsigned i = 0; i < and_insns.length (); i++)
2244	    {
2245	      /* However, first we must be sure that we make the
2246		 base register from the AND operation available
2247		 in case the register has been overwritten.  Copy
2248		 the base register to a new pseudo and use that
2249		 as the base register of the AND operation in
2250		 the new LVX instruction.  */
2251	      rtx_insn *and_insn = and_insns[i];
2252	      rtx and_op = and_ops[i];
2253	      rtx and_base = XEXP (and_op, 0);
2254	      if (!new_reg)
2255		{
2256		  new_reg = gen_reg_rtx (GET_MODE (and_base));
2257		  and_mask = XEXP (and_op, 1);
2258		}
2259	      rtx copy = gen_rtx_SET (new_reg, and_base);
2260	      rtx_insn *new_insn = emit_insn_after (copy, and_insn);
2261	      set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
2262	      df_insn_rescan (new_insn);
2263	    }
2264
2265	  XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
2266	  SET_SRC (body) = mem;
2267	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
2268	  df_insn_rescan (insn);
2269
2270	  if (dump_file)
2271	    fprintf (dump_file, "lvx opportunity found at %d\n",
2272		     INSN_UID (insn));
2273	}
2274    }
2275}
2276
2277/* If INSN is the store for an stvx pattern, put it in canonical form.  */
2278static void
2279recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
2280{
2281  rtx body = PATTERN (insn);
2282  gcc_assert (GET_CODE (body) == SET
2283	      && MEM_P (SET_DEST (body))
2284	      && (GET_CODE (SET_SRC (body)) == VEC_SELECT
2285		  || pattern_is_rotate64 (body)));
2286  rtx mem = SET_DEST (body);
2287  rtx base_reg = XEXP (mem, 0);
2288
2289  auto_vec<rtx_insn *> and_insns;
2290  auto_vec<rtx> and_ops;
2291  bool is_any_def_and
2292    = find_alignment_op (insn, base_reg, &and_insns, &and_ops);
2293
2294  if (is_any_def_and)
2295    {
2296      gcc_assert (and_insns.length () == and_ops.length ());
2297      rtx src_reg = XEXP (SET_SRC (body), 0);
2298      df_ref src_use;
2299      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2300      FOR_EACH_INSN_INFO_USE (src_use, insn_info)
2301	{
2302	  if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
2303	    continue;
2304
2305	  struct df_link *link = DF_REF_CHAIN (src_use);
2306	  if (!link || link->next)
2307	    break;
2308
2309	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
2310	  if (!insn_is_swap_p (swap_insn)
2311	      || insn_is_load_p (swap_insn)
2312	      || insn_is_store_p (swap_insn))
2313	    break;
2314
2315	  /* Expected stvx pattern found.  Change the swap to
2316	     a copy, and propagate the AND operation into the
2317	     store.  */
2318	  to_delete[INSN_UID (swap_insn)].replace = true;
2319	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
2320
2321	  rtx new_reg = 0;
2322	  rtx and_mask = 0;
2323	  for (unsigned i = 0; i < and_insns.length (); i++)
2324	    {
2325	      /* However, first we must be sure that we make the
2326		 base register from the AND operation available
2327		 in case the register has been overwritten.  Copy
2328		 the base register to a new pseudo and use that
2329		 as the base register of the AND operation in
2330		 the new STVX instruction.  */
2331	      rtx_insn *and_insn = and_insns[i];
2332	      rtx and_op = and_ops[i];
2333	      rtx and_base = XEXP (and_op, 0);
2334	      if (!new_reg)
2335		{
2336		  new_reg = gen_reg_rtx (GET_MODE (and_base));
2337		  and_mask = XEXP (and_op, 1);
2338		}
2339	      rtx copy = gen_rtx_SET (new_reg, and_base);
2340	      rtx_insn *new_insn = emit_insn_after (copy, and_insn);
2341	      set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
2342	      df_insn_rescan (new_insn);
2343	    }
2344
2345	  XEXP (mem, 0) = gen_rtx_AND (GET_MODE (new_reg), new_reg, and_mask);
2346	  SET_SRC (body) = src_reg;
2347	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
2348	  df_insn_rescan (insn);
2349
2350	  if (dump_file)
2351	    fprintf (dump_file, "stvx opportunity found at %d\n",
2352		     INSN_UID (insn));
2353	}
2354    }
2355}
2356
2357/* Look for patterns created from builtin lvx and stvx calls, and
2358   canonicalize them to be properly recognized as such.  */
2359static void
2360recombine_lvx_stvx_patterns (function *fun)
2361{
2362  int i;
2363  basic_block bb;
2364  rtx_insn *insn;
2365
2366  int num_insns = get_max_uid ();
2367  del_info *to_delete = XCNEWVEC (del_info, num_insns);
2368
2369  FOR_ALL_BB_FN (bb, fun)
2370    FOR_BB_INSNS (bb, insn)
2371    {
2372      if (!NONDEBUG_INSN_P (insn))
2373	continue;
2374
2375      if (insn_is_load_p (insn) && insn_is_swap_p (insn))
2376	recombine_lvx_pattern (insn, to_delete);
2377      else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
2378	recombine_stvx_pattern (insn, to_delete);
2379    }
2380
2381  /* Turning swaps into copies is delayed until now, to avoid problems
2382     with deleting instructions during the insn walk.  */
2383  for (i = 0; i < num_insns; i++)
2384    if (to_delete[i].replace)
2385      {
2386	rtx swap_body = PATTERN (to_delete[i].replace_insn);
2387	rtx src_reg = XEXP (SET_SRC (swap_body), 0);
2388	rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
2389	rtx_insn *new_insn = emit_insn_before (copy,
2390					       to_delete[i].replace_insn);
2391	set_block_for_insn (new_insn,
2392			    BLOCK_FOR_INSN (to_delete[i].replace_insn));
2393	df_insn_rescan (new_insn);
2394	df_insn_delete (to_delete[i].replace_insn);
2395	remove_insn (to_delete[i].replace_insn);
2396	to_delete[i].replace_insn->set_deleted ();
2397      }
2398
2399  free (to_delete);
2400}
2401
2402/* Main entry point for this pass.  */
2403unsigned int
2404rs6000_analyze_swaps (function *fun)
2405{
2406  swap_web_entry *insn_entry;
2407  basic_block bb;
2408  rtx_insn *insn, *curr_insn = 0;
2409
2410  /* Dataflow analysis for use-def chains.  */
2411  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2412  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2413  df_analyze ();
2414  df_set_flags (DF_DEFER_INSN_RESCAN);
2415
2416  /* Pre-pass to recombine lvx and stvx patterns so we don't lose info.  */
2417  recombine_lvx_stvx_patterns (fun);
2418
2419  /* Rebuild ud- and du-chains.  */
2420  df_remove_problem (df_chain);
2421  df_process_deferred_rescans ();
2422  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2423  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2424  df_analyze ();
2425  df_set_flags (DF_DEFER_INSN_RESCAN);
2426
2427  /* Allocate structure to represent webs of insns.  */
2428  insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2429
2430  /* Walk the insns to gather basic data.  */
2431  FOR_ALL_BB_FN (bb, fun)
2432    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2433    {
2434      unsigned int uid = INSN_UID (insn);
2435      if (NONDEBUG_INSN_P (insn))
2436	{
2437	  insn_entry[uid].insn = insn;
2438
2439	  if (GET_CODE (insn) == CALL_INSN)
2440	    insn_entry[uid].is_call = 1;
2441
2442	  /* Walk the uses and defs to see if we mention vector regs.
2443	     Record any constraints on optimization of such mentions.  */
2444	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2445	  df_ref mention;
2446	  FOR_EACH_INSN_INFO_USE (mention, insn_info)
2447	    {
2448	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
2449	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
2450
2451	      /* If a use gets its value from a call insn, it will be
2452		 a hard register and will look like (reg:V4SI 3 3).
2453		 The df analysis creates two mentions for GPR3 and GPR4,
2454		 both DImode.  We must recognize this and treat it as a
2455		 vector mention to ensure the call is unioned with this
2456		 use.  */
2457	      if (mode == DImode && DF_REF_INSN_INFO (mention))
2458		{
2459		  rtx feeder = DF_REF_INSN (mention);
2460		  /* FIXME:  It is pretty hard to get from the df mention
2461		     to the mode of the use in the insn.  We arbitrarily
2462		     pick a vector mode here, even though the use might
2463		     be a real DImode.  We can be too conservative
2464		     (create a web larger than necessary) because of
2465		     this, so consider eventually fixing this.  */
2466		  if (GET_CODE (feeder) == CALL_INSN)
2467		    mode = V4SImode;
2468		}
2469
2470	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
2471		{
2472		  insn_entry[uid].is_relevant = 1;
2473		  if (mode == TImode || mode == V1TImode
2474		      || FLOAT128_VECTOR_P (mode))
2475		    insn_entry[uid].is_128_int = 1;
2476		  if (DF_REF_INSN_INFO (mention))
2477		    insn_entry[uid].contains_subreg
2478		      = !rtx_equal_p (DF_REF_REG (mention),
2479				      DF_REF_REAL_REG (mention));
2480		  union_defs (insn_entry, insn, mention);
2481		}
2482	    }
2483	  FOR_EACH_INSN_INFO_DEF (mention, insn_info)
2484	    {
2485	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
2486	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
2487
2488	      /* If we're loading up a hard vector register for a call,
2489		 it looks like (set (reg:V4SI 9 9) (...)).  The df
2490		 analysis creates two mentions for GPR9 and GPR10, both
2491		 DImode.  So relying on the mode from the mentions
2492		 isn't sufficient to ensure we union the call into the
2493		 web with the parameter setup code.  */
2494	      if (mode == DImode && GET_CODE (insn) == SET
2495		  && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (insn))))
2496		mode = GET_MODE (SET_DEST (insn));
2497
2498	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
2499		{
2500		  insn_entry[uid].is_relevant = 1;
2501		  if (mode == TImode || mode == V1TImode
2502		      || FLOAT128_VECTOR_P (mode))
2503		    insn_entry[uid].is_128_int = 1;
2504		  if (DF_REF_INSN_INFO (mention))
2505		    insn_entry[uid].contains_subreg
2506		      = !rtx_equal_p (DF_REF_REG (mention),
2507				      DF_REF_REAL_REG (mention));
2508		  /* REG_FUNCTION_VALUE_P is not valid for subregs. */
2509		  else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
2510		    insn_entry[uid].is_live_out = 1;
2511		  union_uses (insn_entry, insn, mention);
2512		}
2513	    }
2514
2515	  if (insn_entry[uid].is_relevant)
2516	    {
2517	      /* Determine if this is a load or store.  */
2518	      insn_entry[uid].is_load = insn_is_load_p (insn);
2519	      insn_entry[uid].is_store = insn_is_store_p (insn);
2520
2521	      /* Determine if this is a doubleword swap.  If not,
2522		 determine whether it can legally be swapped.  */
2523	      if (insn_is_swap_p (insn))
2524		insn_entry[uid].is_swap = 1;
2525	      else
2526		{
2527		  unsigned int special = SH_NONE;
2528		  insn_entry[uid].is_swappable
2529		    = insn_is_swappable_p (insn_entry, insn, &special);
2530		  if (special != SH_NONE && insn_entry[uid].contains_subreg)
2531		    insn_entry[uid].is_swappable = 0;
2532		  else if (special != SH_NONE)
2533		    insn_entry[uid].special_handling = special;
2534		  else if (insn_entry[uid].contains_subreg
2535			   && has_part_mult (insn))
2536		    insn_entry[uid].is_swappable = 0;
2537		  else if (insn_entry[uid].contains_subreg)
2538		    insn_entry[uid].special_handling = SH_SUBREG;
2539		}
2540	    }
2541	}
2542    }
2543
2544  if (dump_file)
2545    {
2546      fprintf (dump_file, "\nSwap insn entry table when first built\n");
2547      dump_swap_insn_table (insn_entry);
2548    }
2549
2550  /* Record unoptimizable webs.  */
2551  unsigned e = get_max_uid (), i;
2552  for (i = 0; i < e; ++i)
2553    {
2554      if (!insn_entry[i].is_relevant)
2555	continue;
2556
2557      swap_web_entry *root
2558	= (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
2559
2560      if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
2561	  || (insn_entry[i].contains_subreg
2562	      && insn_entry[i].special_handling != SH_SUBREG)
2563	  || insn_entry[i].is_128_int || insn_entry[i].is_call
2564	  || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
2565	root->web_not_optimizable = 1;
2566
2567      /* If we have loads or stores that aren't permuting then the
2568	 optimization isn't appropriate.  */
2569      else if ((insn_entry[i].is_load || insn_entry[i].is_store)
2570	  && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
2571	root->web_not_optimizable = 1;
2572
2573      /* If we have a swap that is both fed by a permuting load
2574	 and a feeder of a permuting store, then the optimization
2575	 isn't appropriate.  (Consider vec_xl followed by vec_xst_be.)  */
2576      else if (insn_entry[i].is_swap && !insn_entry[i].is_load
2577	       && !insn_entry[i].is_store
2578	       && swap_feeds_both_load_and_store (&insn_entry[i]))
2579	root->web_not_optimizable = 1;
2580
2581      /* If we have permuting loads or stores that are not accompanied
2582	 by a register swap, the optimization isn't appropriate.  */
2583      else if (insn_entry[i].is_load && insn_entry[i].is_swap)
2584	{
2585	  rtx insn = insn_entry[i].insn;
2586	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2587	  df_ref def;
2588
2589	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
2590	    {
2591	      struct df_link *link = DF_REF_CHAIN (def);
2592
2593	      if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
2594		{
2595		  root->web_not_optimizable = 1;
2596		  break;
2597		}
2598	    }
2599	}
2600      else if (insn_entry[i].is_store && insn_entry[i].is_swap)
2601	{
2602	  rtx insn = insn_entry[i].insn;
2603	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2604	  df_ref use;
2605
2606	  FOR_EACH_INSN_INFO_USE (use, insn_info)
2607	    {
2608	      struct df_link *link = DF_REF_CHAIN (use);
2609
2610	      if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
2611		{
2612		  root->web_not_optimizable = 1;
2613		  break;
2614		}
2615	    }
2616	}
2617    }
2618
2619  if (dump_file)
2620    {
2621      fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
2622      dump_swap_insn_table (insn_entry);
2623    }
2624
2625  /* For each load and store in an optimizable web (which implies
2626     the loads and stores are permuting), find the associated
2627     register swaps and mark them for removal.  Due to various
2628     optimizations we may mark the same swap more than once.  Also
2629     perform special handling for swappable insns that require it.  */
2630  for (i = 0; i < e; ++i)
2631    if ((insn_entry[i].is_load || insn_entry[i].is_store)
2632	&& insn_entry[i].is_swap)
2633      {
2634	swap_web_entry* root_entry
2635	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
2636	if (!root_entry->web_not_optimizable)
2637	  mark_swaps_for_removal (insn_entry, i);
2638      }
2639    else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
2640      {
2641	swap_web_entry* root_entry
2642	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
2643	if (!root_entry->web_not_optimizable)
2644	  handle_special_swappables (insn_entry, i);
2645      }
2646
2647  /* Now delete the swaps marked for removal.  */
2648  for (i = 0; i < e; ++i)
2649    if (insn_entry[i].will_delete)
2650      replace_swap_with_copy (insn_entry, i);
2651
2652  /* Clean up.  */
2653  free (insn_entry);
2654
2655  /* Use a second pass over rtl to detect that certain vector values
2656     fetched from or stored to memory on quad-word aligned addresses
2657     can use lvx/stvx without swaps.  */
2658
2659  /* First, rebuild ud chains.  */
2660  df_remove_problem (df_chain);
2661  df_process_deferred_rescans ();
2662  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2663  df_chain_add_problem (DF_UD_CHAIN);
2664  df_analyze ();
2665
2666  swap_web_entry *pass2_insn_entry;
2667  pass2_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2668
2669  /* Walk the insns to gather basic data.  */
2670  FOR_ALL_BB_FN (bb, fun)
2671    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2672    {
2673      unsigned int uid = INSN_UID (insn);
2674      if (NONDEBUG_INSN_P (insn))
2675	{
2676	  pass2_insn_entry[uid].insn = insn;
2677
2678	  pass2_insn_entry[uid].is_relevant = 1;
2679	  pass2_insn_entry[uid].is_load = insn_is_load_p (insn);
2680	  pass2_insn_entry[uid].is_store = insn_is_store_p (insn);
2681
2682	  /* Determine if this is a doubleword swap.  If not,
2683	     determine whether it can legally be swapped.  */
2684	  if (insn_is_swap_p (insn))
2685	    pass2_insn_entry[uid].is_swap = 1;
2686	}
2687    }
2688
2689  e = get_max_uid ();
2690  for (unsigned i = 0; i < e; ++i)
2691    if (pass2_insn_entry[i].is_swap && !pass2_insn_entry[i].is_load
2692	&& !pass2_insn_entry[i].is_store)
2693      {
2694	/* Replace swap of aligned load-swap with aligned unswapped
2695	   load.  */
2696	rtx_insn *rtx_insn = pass2_insn_entry[i].insn;
2697	if (quad_aligned_load_p (pass2_insn_entry, rtx_insn))
2698	  replace_swapped_aligned_load (pass2_insn_entry, rtx_insn);
2699      }
2700    else if (pass2_insn_entry[i].is_swap && pass2_insn_entry[i].is_store)
2701      {
2702	/* Replace aligned store-swap of swapped value with aligned
2703	   unswapped store.  */
2704	rtx_insn *rtx_insn = pass2_insn_entry[i].insn;
2705	if (quad_aligned_store_p (pass2_insn_entry, rtx_insn))
2706	  replace_swapped_aligned_store (pass2_insn_entry, rtx_insn);
2707      }
2708
2709  /* Clean up.  */
2710  free (pass2_insn_entry);
2711
2712  /* Use a third pass over rtl to replace swap(load(vector constant))
2713     with load(swapped vector constant).  */
2714
2715  /* First, rebuild ud chains.  */
2716  df_remove_problem (df_chain);
2717  df_process_deferred_rescans ();
2718  df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2719  df_chain_add_problem (DF_UD_CHAIN);
2720  df_analyze ();
2721
2722  swap_web_entry *pass3_insn_entry;
2723  pass3_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2724
2725  /* Walk the insns to gather basic data.  */
2726  FOR_ALL_BB_FN (bb, fun)
2727    FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2728    {
2729      unsigned int uid = INSN_UID (insn);
2730      if (NONDEBUG_INSN_P (insn))
2731	{
2732	  pass3_insn_entry[uid].insn = insn;
2733
2734	  pass3_insn_entry[uid].is_relevant = 1;
2735	  pass3_insn_entry[uid].is_load = insn_is_load_p (insn);
2736	  pass3_insn_entry[uid].is_store = insn_is_store_p (insn);
2737
2738	  /* Determine if this is a doubleword swap.  If not,
2739	     determine whether it can legally be swapped.  */
2740	  if (insn_is_swap_p (insn))
2741	    pass3_insn_entry[uid].is_swap = 1;
2742	}
2743    }
2744
2745  e = get_max_uid ();
2746  for (unsigned i = 0; i < e; ++i)
2747    if (pass3_insn_entry[i].is_swap && !pass3_insn_entry[i].is_load
2748	&& !pass3_insn_entry[i].is_store)
2749      {
2750	insn = pass3_insn_entry[i].insn;
2751	if (const_load_sequence_p (pass3_insn_entry, insn))
2752	  replace_swapped_load_constant (pass3_insn_entry, insn);
2753      }
2754
2755  /* Clean up.  */
2756  free (pass3_insn_entry);
2757  return 0;
2758}
2759
2760const pass_data pass_data_analyze_swaps =
2761{
2762  RTL_PASS, /* type */
2763  "swaps", /* name */
2764  OPTGROUP_NONE, /* optinfo_flags */
2765  TV_NONE, /* tv_id */
2766  0, /* properties_required */
2767  0, /* properties_provided */
2768  0, /* properties_destroyed */
2769  0, /* todo_flags_start */
2770  TODO_df_finish, /* todo_flags_finish */
2771};
2772
2773class pass_analyze_swaps : public rtl_opt_pass
2774{
2775public:
2776  pass_analyze_swaps(gcc::context *ctxt)
2777    : rtl_opt_pass(pass_data_analyze_swaps, ctxt)
2778  {}
2779
2780  /* opt_pass methods: */
2781  virtual bool gate (function *)
2782    {
2783      return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
2784	      && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
2785    }
2786
2787  virtual unsigned int execute (function *fun)
2788    {
2789      return rs6000_analyze_swaps (fun);
2790    }
2791
2792  opt_pass *clone ()
2793    {
2794      return new pass_analyze_swaps (m_ctxt);
2795    }
2796
2797}; // class pass_analyze_swaps
2798
2799rtl_opt_pass *
2800make_pass_analyze_swaps (gcc::context *ctxt)
2801{
2802  return new pass_analyze_swaps (ctxt);
2803}
2804
2805