/* Subroutines used to remove unnecessary doubleword swaps
   for p8 little-endian VSX code.
   Copyright (C) 1991-2020 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "memmodel.h"
#include "df.h"
#include "tm_p.h"
#include "ira.h"
#include "print-tree.h"
#include "varasm.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "tree-pass.h"
#include "rtx-vector-builder.h"

/* Analyze vector computations and remove unnecessary doubleword
   swaps (xxswapdi instructions).  This pass is performed only
   for little-endian VSX code generation.

   For this specific case, loads and stores of 4x32 and 2x64 vectors
   are inefficient.  These are implemented using the lxvd2x and
   stxvd2x instructions, which invert the order of doublewords in
   a vector register.  Thus the code generation inserts an xxswapdi
   after each such load, and prior to each such store.  (For spill
   code after register assignment, an additional xxswapdi is inserted
   following each store in order to return a hard register to its
   unpermuted value.)

   The extra xxswapdi instructions reduce performance.  This can be
   particularly bad for vectorized code.  The purpose of this pass
   is to reduce the number of xxswapdi instructions required for
   correctness.

   The primary insight is that much code that operates on vectors
   does not care about the relative order of elements in a register,
   so long as the correct memory order is preserved.  If we have
   a computation where all input values are provided by lxvd2x/xxswapdi
   sequences, all outputs are stored using xxswapdi/stxvd2x sequences,
   and all intermediate computations are pure SIMD (independent of
   element order), then all the xxswapdi's associated with the loads
   and stores may be removed.

   This pass uses some of the infrastructure and logical ideas from
   the "web" pass in web.c.  We create maximal webs of computations
   fitting the description above using union-find.  Each such web is
   then optimized by removing its unnecessary xxswapdi instructions.

   The pass is placed prior to global optimization so that we can
   perform the optimization in the safest and simplest way possible;
   that is, by replacing each xxswapdi insn with a register copy insn.
   Subsequent forward propagation will remove copies where possible.

   There are some operations sensitive to element order for which we
   can still allow the operation, provided we modify those operations.
   These include CONST_VECTORs, for which we must swap the first and
   second halves of the constant vector; and SUBREGs, for which we
   must adjust the byte offset to account for the swapped doublewords.
   A remaining opportunity would be non-immediate-form splats, for
   which we should adjust the selected lane of the input.  We should
   also make code generation adjustments for sum-across operations,
   since this is a common vectorizer reduction.

   Because we run prior to the first split, we can see loads and stores
   here that match *vsx_le_perm_{load,store}_<mode>.  These are vanilla
   vector loads and stores that have not yet been split into a permuting
   load/store and a swap.  (One way this can happen is with a builtin
   call to vec_vsx_{ld,st}.)  We can handle these as well, but rather
   than deleting a swap, we convert the load/store into a permuting
   load/store (which effectively removes the swap).  */
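
/* Example (illustrative only; register numbers and operands are
   hypothetical):  a naive little-endian copy of a V4SI value through
   memory might first be emitted as

       lxvd2x   0,0,9      # load; doublewords arrive reversed
       xxswapdi 0,0        # restore element order
       xxswapdi 0,0        # undo element order for the store
       stxvd2x  0,0,10     # store; doublewords leave reversed

   Since the intervening computation (here, none) is insensitive to
   element order, this pass removes both xxswapdi instructions and
   lets the lxvd2x/stxvd2x pair operate on consistently swapped
   data.  */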

/* Notes on Permutes

   We do not currently handle computations that contain permutes.  There
   is a general transformation that can be performed correctly, but it
   may introduce more expensive code than it replaces.  To handle these
   would require a cost model to determine when to perform the optimization.
   This commentary records how this could be done if desired.

   The most general permute is something like this (example for V16QI):

   (vec_select:V16QI (vec_concat:V32QI (op1:V16QI) (op2:V16QI))
                     (parallel [(const_int a0) (const_int a1)
                                ...
                                (const_int a14) (const_int a15)]))

   where a0,...,a15 are in [0,31] and select the elements from op1
   and op2 that are to be placed in the result.

   Regardless of mode, we can convert the PARALLEL to a mask of 16
   byte-element selectors.  Let's call this M, with M[i] representing
   the ith byte-element selector value.  Then if we swap doublewords
   throughout the computation, we can get correct behavior by replacing
   M with M' as follows:

    M'[i] = { (M[i]+8)%16      : M[i] in [0,15]
            { ((M[i]+8)%16)+16 : M[i] in [16,31]

   This seems promising at first, since we are just replacing one mask
   with another.  But certain masks are preferable to others.  If M
   is a mask that matches a vmrghh pattern, for example, M' certainly
   will not.  Instead of a single vmrghh, we would generate a load of
   M' and a vperm.  So we would need to know how many xxswapdi's we can
   remove as a result of this transformation to determine if it's
   profitable; and preferably the logic would need to be aware of all
   the special preferable masks.

   Another form of permute is an UNSPEC_VPERM, in which the mask is
   already in a register.  In some cases, this mask may be a constant
   that we can discover with ud-chains, in which case the above
   transformation is ok.  However, the common usage here is for the
   mask to be produced by an UNSPEC_LVSL, in which case the mask
   cannot be known at compile time.  In such a case we would have to
   generate several instructions to compute M' as above at run time,
   and a cost model is needed again.

   However, when the mask M for an UNSPEC_VPERM is loaded from the
   constant pool, we can replace M with M' as above at no cost
   beyond adding a constant pool entry.  */
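
/* Worked example of the M -> M' transformation above (the mask
   values are hypothetical):  suppose M is the interleave mask

     [0 17 2 19 4 21 6 23 8 25 10 27 12 29 14 31].

   Applying the formula to a few entries:

     M[0]  = 0  -> (0+8)%16        = 8
     M[1]  = 17 -> ((17+8)%16)+16  = 25
     M[8]  = 8  -> (8+8)%16        = 0
     M[15] = 31 -> ((31+8)%16)+16  = 23

   yielding M' = [8 25 10 27 12 29 14 31 0 17 2 19 4 21 6 23], which
   performs the same selection with the doublewords of each input
   exchanged.  */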

/* This is based on the union-find logic in web.c.  web_entry_base is
   defined in df.h.  */
class swap_web_entry : public web_entry_base
{
 public:
  /* Pointer to the insn.  */
  rtx_insn *insn;
  /* Set if insn contains a mention of a vector register.  All other
     fields are undefined if this field is unset.  */
  unsigned int is_relevant : 1;
  /* Set if insn is a load.  */
  unsigned int is_load : 1;
  /* Set if insn is a store.  */
  unsigned int is_store : 1;
  /* Set if insn is a doubleword swap.  This can either be a register swap
     or a permuting load or store (test is_load and is_store for this).  */
  unsigned int is_swap : 1;
  /* Set if the insn has a live-in use of a parameter register.  */
  unsigned int is_live_in : 1;
  /* Set if the insn has a live-out def of a return register.  */
  unsigned int is_live_out : 1;
  /* Set if the insn contains a subreg reference of a vector register.  */
  unsigned int contains_subreg : 1;
  /* Set if the insn contains a 128-bit integer operand.  */
  unsigned int is_128_int : 1;
  /* Set if this is a call-insn.  */
  unsigned int is_call : 1;
  /* Set if this insn does not perform a vector operation for which
     element order matters, or if we know how to fix it up if it does.
     Undefined if is_swap is set.  */
  unsigned int is_swappable : 1;
  /* A nonzero value indicates what kind of special handling for this
     insn is required if doublewords are swapped.  Undefined if
     is_swappable is not set.  */
  unsigned int special_handling : 4;
  /* Set if the web represented by this entry cannot be optimized.  */
  unsigned int web_not_optimizable : 1;
  /* Set if this insn should be deleted.  */
  unsigned int will_delete : 1;
};

enum special_handling_values {
  SH_NONE = 0,
  SH_CONST_VECTOR,
  SH_SUBREG,
  SH_NOSWAP_LD,
  SH_NOSWAP_ST,
  SH_EXTRACT,
  SH_SPLAT,
  SH_XXPERMDI,
  SH_CONCAT,
  SH_VPERM
};

/* Union INSN with all insns containing definitions that reach USE.
   Detect whether USE is live-in to the current function.  */
static void
union_defs (swap_web_entry *insn_entry, rtx insn, df_ref use)
{
  struct df_link *link = DF_REF_CHAIN (use);

  if (!link)
    insn_entry[INSN_UID (insn)].is_live_in = 1;

  while (link)
    {
      if (DF_REF_IS_ARTIFICIAL (link->ref))
	insn_entry[INSN_UID (insn)].is_live_in = 1;

      if (DF_REF_INSN_INFO (link->ref))
	{
	  rtx def_insn = DF_REF_INSN (link->ref);
	  (void)unionfind_union (insn_entry + INSN_UID (insn),
				 insn_entry + INSN_UID (def_insn));
	}

      link = link->next;
    }
}

/* Union INSN with all insns containing uses reached from DEF.
   Detect whether DEF is live-out from the current function.  */
static void
union_uses (swap_web_entry *insn_entry, rtx insn, df_ref def)
{
  struct df_link *link = DF_REF_CHAIN (def);

  if (!link)
    insn_entry[INSN_UID (insn)].is_live_out = 1;

  while (link)
    {
      /* This could be an eh use or some other artificial use;
	 we treat these all the same (killing the optimization).  */
      if (DF_REF_IS_ARTIFICIAL (link->ref))
	insn_entry[INSN_UID (insn)].is_live_out = 1;

      if (DF_REF_INSN_INFO (link->ref))
	{
	  rtx use_insn = DF_REF_INSN (link->ref);
	  (void)unionfind_union (insn_entry + INSN_UID (insn),
				 insn_entry + INSN_UID (use_insn));
	}

      link = link->next;
    }
}

/* Return 1 iff INSN is a load insn, including permuting loads that
   represent an lxvd2x instruction; else return 0.  */
static unsigned int
insn_is_load_p (rtx insn)
{
  rtx body = PATTERN (insn);

  if (GET_CODE (body) == SET)
    {
      if (MEM_P (SET_SRC (body)))
	return 1;

      if (GET_CODE (SET_SRC (body)) == VEC_SELECT
	  && MEM_P (XEXP (SET_SRC (body), 0)))
	return 1;

      return 0;
    }

  if (GET_CODE (body) != PARALLEL)
    return 0;

  rtx set = XVECEXP (body, 0, 0);

  if (GET_CODE (set) == SET && MEM_P (SET_SRC (set)))
    return 1;

  return 0;
}

/* Return 1 iff INSN is a store insn, including permuting stores that
   represent an stxvd2x instruction; else return 0.  */
static unsigned int
insn_is_store_p (rtx insn)
{
  rtx body = PATTERN (insn);
  if (GET_CODE (body) == SET && MEM_P (SET_DEST (body)))
    return 1;
  if (GET_CODE (body) != PARALLEL)
    return 0;
  rtx set = XVECEXP (body, 0, 0);
  if (GET_CODE (set) == SET && MEM_P (SET_DEST (set)))
    return 1;
  return 0;
}

/* Return 1 iff INSN swaps doublewords.  This may be a reg-reg swap,
   a permuting load, or a permuting store.  */
static unsigned int
insn_is_swap_p (rtx insn)
{
  rtx body = PATTERN (insn);
  if (GET_CODE (body) != SET)
    return 0;
  rtx rhs = SET_SRC (body);
  if (GET_CODE (rhs) != VEC_SELECT)
    return 0;
  rtx parallel = XEXP (rhs, 1);
  if (GET_CODE (parallel) != PARALLEL)
    return 0;
  unsigned int len = XVECLEN (parallel, 0);
  if (len != 2 && len != 4 && len != 8 && len != 16)
    return 0;
  for (unsigned int i = 0; i < len / 2; ++i)
    {
      rtx op = XVECEXP (parallel, 0, i);
      if (!CONST_INT_P (op) || INTVAL (op) != len / 2 + i)
	return 0;
    }
  for (unsigned int i = len / 2; i < len; ++i)
    {
      rtx op = XVECEXP (parallel, 0, i);
      if (!CONST_INT_P (op) || INTVAL (op) != i - len / 2)
	return 0;
    }
  return 1;
}
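
/* For example (illustrative only), a V4SI doubleword swap matched by
   the routine above looks like

     (set (reg:V4SI 100)
	  (vec_select:V4SI (reg:V4SI 101)
			   (parallel [(const_int 2) (const_int 3)
				      (const_int 0) (const_int 1)])))

   i.e. the first half of the selector counts up from len/2 and the
   second half counts up from 0.  */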

/* Return true iff EXPR represents the sum of two registers.  */
bool
rs6000_sum_of_two_registers_p (const_rtx expr)
{
  if (GET_CODE (expr) == PLUS)
    {
      const_rtx operand1 = XEXP (expr, 0);
      const_rtx operand2 = XEXP (expr, 1);
      return (REG_P (operand1) && REG_P (operand2));
    }
  return false;
}

/* Return true iff EXPR represents an address expression that masks off
   the low-order 4 bits in the style of an lvx or stvx rtl pattern.  */
bool
rs6000_quadword_masked_address_p (const_rtx expr)
{
  if (GET_CODE (expr) == AND)
    {
      const_rtx operand1 = XEXP (expr, 0);
      const_rtx operand2 = XEXP (expr, 1);
      if ((REG_P (operand1) || rs6000_sum_of_two_registers_p (operand1))
	  && CONST_SCALAR_INT_P (operand2) && INTVAL (operand2) == -16)
	return true;
    }
  return false;
}
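
/* For example (illustrative only), the lvx/stvx patterns compute
   their effective address as

     (and:DI (plus:DI (reg:DI rA) (reg:DI rB)) (const_int -16))

   which clears the low-order 4 bits and so guarantees 16-byte
   alignment; the routine above also accepts a single base register
   in place of the PLUS.  */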

/* Return TRUE if INSN represents a swap of a swapped load from memory
   and the memory address is quad-word aligned.  */
static bool
quad_aligned_load_p (swap_web_entry *insn_entry, rtx_insn *insn)
{
  unsigned uid = INSN_UID (insn);
  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
    return false;

  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);

  /* Since insn is known to represent a swap instruction, we know it
     "uses" only one input variable.  */
  df_ref use = DF_INSN_INFO_USES (insn_info);

  /* Figure out where this input variable is defined.  */
  struct df_link *def_link = DF_REF_CHAIN (use);

  /* If there is no definition or the definition is artificial or there are
     multiple definitions, punt.  */
  if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
      || def_link->next)
    return false;

  rtx def_insn = DF_REF_INSN (def_link->ref);
  unsigned uid2 = INSN_UID (def_insn);
  /* We're looking for a load-with-swap insn.  If this is not that,
     return false.  */
  if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
    return false;

  /* If the source of the rtl def is not a set from memory, return
     false.  */
  rtx body = PATTERN (def_insn);
  if (GET_CODE (body) != SET
      || GET_CODE (SET_SRC (body)) != VEC_SELECT
      || !MEM_P (XEXP (SET_SRC (body), 0)))
    return false;

  rtx mem = XEXP (SET_SRC (body), 0);
  rtx base_reg = XEXP (mem, 0);
  return ((REG_P (base_reg) || rs6000_sum_of_two_registers_p (base_reg))
	  && MEM_ALIGN (mem) >= 128) ? true : false;
}

/* Return TRUE if INSN represents a store-with-swap of a swapped value
   and the memory address is quad-word aligned.  */
static bool
quad_aligned_store_p (swap_web_entry *insn_entry, rtx_insn *insn)
{
  unsigned uid = INSN_UID (insn);
  if (!insn_entry[uid].is_swap || !insn_entry[uid].is_store)
    return false;

  rtx body = PATTERN (insn);
  rtx dest_address = XEXP (SET_DEST (body), 0);
  rtx swap_reg = XEXP (SET_SRC (body), 0);

  /* If the base address for the memory expression is not represented
     by a single register and is not the sum of two registers, punt.  */
  if (!REG_P (dest_address) && !rs6000_sum_of_two_registers_p (dest_address))
    return false;

  /* Confirm that the value to be stored is produced by a swap
     instruction.  */
  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
  df_ref use;
  FOR_EACH_INSN_INFO_USE (use, insn_info)
    {
      struct df_link *def_link = DF_REF_CHAIN (use);

      /* If this is not the definition of the candidate swap register,
	 then skip it.  I am interested in a different definition.  */
      if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
	continue;

      /* If there is no def or the def is artificial or there are
	 multiple defs, punt.  */
      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
	  || def_link->next)
	return false;

      rtx def_insn = DF_REF_INSN (def_link->ref);
      unsigned uid2 = INSN_UID (def_insn);

      /* If this source value is not a simple swap, return false.  */
      if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load
	  || insn_entry[uid2].is_store)
	return false;

      /* I've processed the use that I care about, so break out of
	 this loop.  */
      break;
    }

  /* At this point, we know the source data comes from a swap.  The
     remaining question is whether the memory address is aligned.  */
  rtx set = single_set (insn);
  if (set)
    {
      rtx dest = SET_DEST (set);
      if (MEM_P (dest))
	return (MEM_ALIGN (dest) >= 128);
    }
  return false;
}

/* Return 1 iff INSN_ENTRY, known to reference a swap, is both fed by
   a load and a feeder of a store.  */
static unsigned int
swap_feeds_both_load_and_store (swap_web_entry *insn_entry)
{
  rtx insn = insn_entry->insn;
  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
  df_ref def, use;
  struct df_link *link = 0;
  rtx_insn *load = 0, *store = 0;
  bool fed_by_load = 0;
  bool feeds_store = 0;

  FOR_EACH_INSN_INFO_USE (use, insn_info)
    {
      link = DF_REF_CHAIN (use);
      load = DF_REF_INSN (link->ref);
      if (insn_is_load_p (load) && insn_is_swap_p (load))
	fed_by_load = 1;
    }

  FOR_EACH_INSN_INFO_DEF (def, insn_info)
    {
      link = DF_REF_CHAIN (def);
      store = DF_REF_INSN (link->ref);
      if (insn_is_store_p (store) && insn_is_swap_p (store))
	feeds_store = 1;
    }

  return fed_by_load && feeds_store;
}

/* Return TRUE if insn is a swap fed by a load from the constant pool.  */
static bool
const_load_sequence_p (swap_web_entry *insn_entry, rtx insn)
{
  unsigned uid = INSN_UID (insn);
  if (!insn_entry[uid].is_swap || insn_entry[uid].is_load)
    return false;

  const_rtx tocrel_base;

  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
  df_ref use;

  /* Iterate over the definitions that are used by this insn.  Since
     this is known to be a swap insn, expect only one used definition.  */
  FOR_EACH_INSN_INFO_USE (use, insn_info)
    {
      struct df_link *def_link = DF_REF_CHAIN (use);

      /* If there is no def or the def is artificial or there are
	 multiple defs, punt.  */
      if (!def_link || !def_link->ref || DF_REF_IS_ARTIFICIAL (def_link->ref)
	  || def_link->next)
	return false;

      rtx def_insn = DF_REF_INSN (def_link->ref);
      unsigned uid2 = INSN_UID (def_insn);
      /* If this is not a load or is not a swap, return false.  */
      if (!insn_entry[uid2].is_load || !insn_entry[uid2].is_swap)
	return false;

      /* If the source of the rtl def is not a set from memory, return
	 false.  */
      rtx body = PATTERN (def_insn);
      if (GET_CODE (body) != SET
	  || GET_CODE (SET_SRC (body)) != VEC_SELECT
	  || !MEM_P (XEXP (SET_SRC (body), 0)))
	return false;

      rtx mem = XEXP (SET_SRC (body), 0);
      rtx base_reg = XEXP (mem, 0);
      /* If the base address for the memory expression is not
	 represented by a register, punt.  */
      if (!REG_P (base_reg))
	return false;

      df_ref base_use;
      insn_info = DF_INSN_INFO_GET (def_insn);
      FOR_EACH_INSN_INFO_USE (base_use, insn_info)
	{
	  /* If base_use does not represent base_reg, look for another
	     use.  */
	  if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
	    continue;

	  struct df_link *base_def_link = DF_REF_CHAIN (base_use);
	  if (!base_def_link || base_def_link->next)
	    return false;

	  /* Constants held on the stack are not "true" constants
	     because their values are not part of the static load
	     image.  If this constant's base reference is a stack
	     or frame pointer, it is seen as an artificial
	     reference.  */
	  if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
	    return false;

	  rtx tocrel_insn = DF_REF_INSN (base_def_link->ref);
	  rtx tocrel_body = PATTERN (tocrel_insn);
	  rtx base, offset;
	  if (GET_CODE (tocrel_body) != SET)
	    return false;
	  /* There is an extra level of indirection for small/large
	     code models.  */
	  rtx tocrel_expr = SET_SRC (tocrel_body);
	  if (MEM_P (tocrel_expr))
	    tocrel_expr = XEXP (tocrel_expr, 0);
	  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
	    return false;
	  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);

	  if (!SYMBOL_REF_P (base) || !CONSTANT_POOL_ADDRESS_P (base))
	    return false;
	  else
	    {
	      /* FIXME: The conditions under which
	          (SYMBOL_REF_P (const_vector)
	           && !CONSTANT_POOL_ADDRESS_P (const_vector))
	         are not well understood.  This code prevents
	         an internal compiler error which will occur in
	         replace_swapped_load_constant () if we were to return
	         true.  Some day, we should figure out how to properly
	         handle this condition in
	         replace_swapped_load_constant () and then we can
	         remove this special test.  */
	      rtx const_vector = get_pool_constant (base);
	      if (SYMBOL_REF_P (const_vector)
		  && CONSTANT_POOL_ADDRESS_P (const_vector))
		const_vector = get_pool_constant (const_vector);
	      if (GET_CODE (const_vector) != CONST_VECTOR)
		return false;
	    }
	}
    }
  return true;
}

/* Return TRUE iff OP matches a V2DF reduction pattern.  See the
   definition of vsx_reduc_<VEC_reduc_name>_v2df in vsx.md.  */
static bool
v2df_reduction_p (rtx op)
{
  if (GET_MODE (op) != V2DFmode)
    return false;

  enum rtx_code code = GET_CODE (op);
  if (code != PLUS && code != SMIN && code != SMAX)
    return false;

  rtx concat = XEXP (op, 0);
  if (GET_CODE (concat) != VEC_CONCAT)
    return false;

  rtx select0 = XEXP (concat, 0);
  rtx select1 = XEXP (concat, 1);
  if (GET_CODE (select0) != VEC_SELECT || GET_CODE (select1) != VEC_SELECT)
    return false;

  rtx reg0 = XEXP (select0, 0);
  rtx reg1 = XEXP (select1, 0);
  if (!rtx_equal_p (reg0, reg1) || !REG_P (reg0))
    return false;

  rtx parallel0 = XEXP (select0, 1);
  rtx parallel1 = XEXP (select1, 1);
  if (GET_CODE (parallel0) != PARALLEL || GET_CODE (parallel1) != PARALLEL)
    return false;

  if (!rtx_equal_p (XVECEXP (parallel0, 0, 0), const1_rtx)
      || !rtx_equal_p (XVECEXP (parallel1, 0, 0), const0_rtx))
    return false;

  return true;
}
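
/* For example (a sketch; see vsx.md for the authoritative pattern),
   a V2DF add reduction accepted above has roughly this shape:

     (plus:V2DF
       (vec_concat:V2DF
	 (vec_select:DF (reg:V2DF x) (parallel [(const_int 1)]))
	 (vec_select:DF (reg:V2DF x) (parallel [(const_int 0)])))
       ...)

   Both lanes of the same register participate symmetrically, so the
   result is unaffected by a doubleword swap of the input.  */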

/* Return 1 iff OP is an operand that will not be affected by having
   vector doublewords swapped in memory.  */
static unsigned int
rtx_is_swappable_p (rtx op, unsigned int *special)
{
  enum rtx_code code = GET_CODE (op);
  int i, j;
  rtx parallel;

  switch (code)
    {
    case LABEL_REF:
    case SYMBOL_REF:
    case CLOBBER:
    case REG:
      return 1;

    case VEC_CONCAT:
    case ASM_INPUT:
    case ASM_OPERANDS:
      return 0;

    case CONST_VECTOR:
      {
	*special = SH_CONST_VECTOR;
	return 1;
      }

    case VEC_DUPLICATE:
      /* Opportunity: If XEXP (op, 0) has the same mode as the result,
	 and XEXP (op, 1) is a PARALLEL with a single QImode const int,
	 it represents a vector splat for which we can do special
	 handling.  */
      if (CONST_INT_P (XEXP (op, 0)))
	return 1;
      else if (REG_P (XEXP (op, 0))
	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
	/* This catches V2DF and V2DI splat, at a minimum.  */
	return 1;
      else if (GET_CODE (XEXP (op, 0)) == TRUNCATE
	       && REG_P (XEXP (XEXP (op, 0), 0))
	       && GET_MODE_INNER (GET_MODE (op)) == GET_MODE (XEXP (op, 0)))
	/* This catches splat of a truncated value.  */
	return 1;
      else if (GET_CODE (XEXP (op, 0)) == VEC_SELECT)
	/* If the duplicated item is from a select, defer to the select
	   processing to see if we can change the lane for the splat.  */
	return rtx_is_swappable_p (XEXP (op, 0), special);
      else
	return 0;

    case VEC_SELECT:
      /* A vec_extract operation is ok if we change the lane.  */
      if (REG_P (XEXP (op, 0))
	  && GET_MODE_INNER (GET_MODE (XEXP (op, 0))) == GET_MODE (op)
	  && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
	  && XVECLEN (parallel, 0) == 1
	  && CONST_INT_P (XVECEXP (parallel, 0, 0)))
	{
	  *special = SH_EXTRACT;
	  return 1;
	}
      /* An XXPERMDI is ok if we adjust the lanes.  Note that if the
	 XXPERMDI is a swap operation, it will be identified by
	 insn_is_swap_p and therefore we won't get here.  */
      else if (GET_CODE (XEXP (op, 0)) == VEC_CONCAT
	       && (GET_MODE (XEXP (op, 0)) == V4DFmode
		   || GET_MODE (XEXP (op, 0)) == V4DImode)
	       && GET_CODE ((parallel = XEXP (op, 1))) == PARALLEL
	       && XVECLEN (parallel, 0) == 2
	       && CONST_INT_P (XVECEXP (parallel, 0, 0))
	       && CONST_INT_P (XVECEXP (parallel, 0, 1)))
	{
	  *special = SH_XXPERMDI;
	  return 1;
	}
      else if (v2df_reduction_p (op))
	return 1;
      else
	return 0;

    case UNSPEC:
      {
	/* Various operations are unsafe for this optimization, at least
	   without significant additional work.  Permutes are obviously
	   problematic, as both the permute control vector and the ordering
	   of the target values are invalidated by doubleword swapping.
	   Vector pack and unpack modify the number of vector lanes.
	   Merge-high/low will not operate correctly on swapped operands.
	   Vector shifts across element boundaries are clearly uncool,
	   as are vector select and concatenate operations.  Vector
	   sum-across instructions define one operand with a specific
	   order-dependent element, so additional fixup code would be
	   needed to make those work.  Vector set and non-immediate-form
	   vector splat are element-order sensitive.  A few of these
	   cases might be workable with special handling if required.
	   Adding cost modeling would be appropriate in some cases.  */
	int val = XINT (op, 1);
	switch (val)
	  {
	  default:
	    break;
	  case UNSPEC_VBPERMQ:
	  case UNSPEC_VMRGH_DIRECT:
	  case UNSPEC_VMRGL_DIRECT:
	  case UNSPEC_VPACK_SIGN_SIGN_SAT:
	  case UNSPEC_VPACK_SIGN_UNS_SAT:
	  case UNSPEC_VPACK_UNS_UNS_MOD:
	  case UNSPEC_VPACK_UNS_UNS_MOD_DIRECT:
	  case UNSPEC_VPACK_UNS_UNS_SAT:
	  case UNSPEC_VPERM:
	  case UNSPEC_VPERM_UNS:
	  case UNSPEC_VPERMHI:
	  case UNSPEC_VPERMSI:
	  case UNSPEC_VPERMXOR:
	  case UNSPEC_VPKPX:
	  case UNSPEC_VSLDOI:
	  case UNSPEC_VSLO:
	  case UNSPEC_VSRO:
	  case UNSPEC_VSUM2SWS:
	  case UNSPEC_VSUM4S:
	  case UNSPEC_VSUM4UBS:
	  case UNSPEC_VSUMSWS:
	  case UNSPEC_VSUMSWS_DIRECT:
	  case UNSPEC_VSX_CONCAT:
	  case UNSPEC_VSX_CVDPSPN:
	  case UNSPEC_VSX_CVSPDP:
	  case UNSPEC_VSX_CVSPDPN:
	  case UNSPEC_VSX_EXTRACT:
	  case UNSPEC_VSX_SET:
	  case UNSPEC_VSX_SLDWI:
	  case UNSPEC_VSX_VSLO:
	  case UNSPEC_VUNPACK_HI_SIGN:
	  case UNSPEC_VUNPACK_HI_SIGN_DIRECT:
	  case UNSPEC_VUNPACK_LO_SIGN:
	  case UNSPEC_VUNPACK_LO_SIGN_DIRECT:
	  case UNSPEC_VUPKHPX:
	  case UNSPEC_VUPKHS_V4SF:
	  case UNSPEC_VUPKHU_V4SF:
	  case UNSPEC_VUPKLPX:
	  case UNSPEC_VUPKLS_V4SF:
	  case UNSPEC_VUPKLU_V4SF:
	    return 0;
	  case UNSPEC_VSPLT_DIRECT:
	  case UNSPEC_VSX_XXSPLTD:
	    *special = SH_SPLAT;
	    return 1;
	  case UNSPEC_REDUC_PLUS:
	  case UNSPEC_REDUC:
	    return 1;
	  case UNSPEC_VPMSUM:
	    /* vpmsumd is not swappable, but vpmsum[bhw] are.  */
	    if (GET_MODE (op) == V2DImode)
	      return 0;
	    break;
	  }
      }

    default:
      break;
    }

  const char *fmt = GET_RTX_FORMAT (code);
  int ok = 1;

  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
    if (fmt[i] == 'e' || fmt[i] == 'u')
      {
	unsigned int special_op = SH_NONE;
	ok &= rtx_is_swappable_p (XEXP (op, i), &special_op);
	if (special_op == SH_NONE)
	  continue;
	/* Ensure we never have two kinds of special handling
	   for the same insn.  */
	if (*special != SH_NONE && *special != special_op)
	  return 0;
	*special = special_op;
      }
    else if (fmt[i] == 'E')
      for (j = 0; j < XVECLEN (op, i); ++j)
	{
	  unsigned int special_op = SH_NONE;
	  ok &= rtx_is_swappable_p (XVECEXP (op, i, j), &special_op);
	  if (special_op == SH_NONE)
	    continue;
	  /* Ensure we never have two kinds of special handling
	     for the same insn.  */
	  if (*special != SH_NONE && *special != special_op)
	    return 0;
	  *special = special_op;
	}

  return ok;
}

/* Return 1 iff INSN is an insn that will not be affected by
   having vector doublewords swapped in memory (in which case
   *SPECIAL is unchanged), or that can be modified to be correct
   if vector doublewords are swapped in memory (in which case
   *SPECIAL is changed to a value indicating how).  */
static unsigned int
insn_is_swappable_p (swap_web_entry *insn_entry, rtx insn,
		     unsigned int *special)
{
  /* Calls are always bad.  */
  if (GET_CODE (insn) == CALL_INSN)
    return 0;

  /* Loads and stores seen here are not permuting, but we can still
     fix them up by converting them to permuting ones.  Exceptions:
     UNSPEC_LVE, UNSPEC_LVX, and UNSPEC_STVX, which have a PARALLEL
     body instead of a SET; and UNSPEC_STVE, which has an UNSPEC
     for the SET source.  Also we must now make an exception for lvx
     and stvx when they are not in the UNSPEC_LVX/STVX form (with the
     explicit "& -16") since this leads to unrecognizable insns.  */
  rtx body = PATTERN (insn);
  int i = INSN_UID (insn);

  if (insn_entry[i].is_load)
    {
      if (GET_CODE (body) == SET)
	{
	  rtx rhs = SET_SRC (body);
	  /* Even without a swap, the RHS might be a vec_select for, say,
	     a byte-reversing load.  */
	  if (!MEM_P (rhs))
	    return 0;
	  if (GET_CODE (XEXP (rhs, 0)) == AND)
	    return 0;

	  *special = SH_NOSWAP_LD;
	  return 1;
	}
      else
	return 0;
    }

  if (insn_entry[i].is_store)
    {
      if (GET_CODE (body) == SET
	  && GET_CODE (SET_SRC (body)) != UNSPEC
	  && GET_CODE (SET_SRC (body)) != VEC_SELECT)
	{
	  rtx lhs = SET_DEST (body);
	  /* Even without a swap, the RHS might be a vec_select for, say,
	     a byte-reversing store.  */
	  if (!MEM_P (lhs))
	    return 0;
	  if (GET_CODE (XEXP (lhs, 0)) == AND)
	    return 0;

	  *special = SH_NOSWAP_ST;
	  return 1;
	}
      else
	return 0;
    }

  /* A convert to single precision can be left as is provided that
     all of its uses are in xxspltw instructions that splat BE element
     zero.  */
  if (GET_CODE (body) == SET
      && GET_CODE (SET_SRC (body)) == UNSPEC
      && XINT (SET_SRC (body), 1) == UNSPEC_VSX_CVDPSPN)
    {
      df_ref def;
      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);

      FOR_EACH_INSN_INFO_DEF (def, insn_info)
	{
	  struct df_link *link = DF_REF_CHAIN (def);
	  if (!link)
	    return 0;

	  for (; link; link = link->next) {
	    rtx use_insn = DF_REF_INSN (link->ref);
	    rtx use_body = PATTERN (use_insn);
	    if (GET_CODE (use_body) != SET
		|| GET_CODE (SET_SRC (use_body)) != UNSPEC
		|| XINT (SET_SRC (use_body), 1) != UNSPEC_VSX_XXSPLTW
		|| XVECEXP (SET_SRC (use_body), 0, 1) != const0_rtx)
	      return 0;
	  }
	}

      return 1;
    }

  /* A concatenation of two doublewords is ok if we reverse the
     order of the inputs.  */
  if (GET_CODE (body) == SET
      && GET_CODE (SET_SRC (body)) == VEC_CONCAT
      && (GET_MODE (SET_SRC (body)) == V2DFmode
	  || GET_MODE (SET_SRC (body)) == V2DImode))
    {
      *special = SH_CONCAT;
      return 1;
    }

  /* V2DF reductions are always swappable.  */
  if (GET_CODE (body) == PARALLEL)
    {
      rtx expr = XVECEXP (body, 0, 0);
      if (GET_CODE (expr) == SET
	  && v2df_reduction_p (SET_SRC (expr)))
	return 1;
    }

  /* An UNSPEC_VPERM is ok if the mask operand is loaded from the
     constant pool.  */
  if (GET_CODE (body) == SET
      && GET_CODE (SET_SRC (body)) == UNSPEC
      && XINT (SET_SRC (body), 1) == UNSPEC_VPERM
      && XVECLEN (SET_SRC (body), 0) == 3
      && REG_P (XVECEXP (SET_SRC (body), 0, 2)))
    {
      rtx mask_reg = XVECEXP (SET_SRC (body), 0, 2);
      struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
      df_ref use;
      FOR_EACH_INSN_INFO_USE (use, insn_info)
	if (rtx_equal_p (DF_REF_REG (use), mask_reg))
	  {
	    struct df_link *def_link = DF_REF_CHAIN (use);
	    /* Punt if multiple definitions for this reg.  */
	    if (def_link && !def_link->next &&
		const_load_sequence_p (insn_entry,
				       DF_REF_INSN (def_link->ref)))
	      {
		*special = SH_VPERM;
		return 1;
	      }
	  }
    }

  /* Otherwise check the operands for vector lane violations.  */
  return rtx_is_swappable_p (body, special);
}

enum chain_purpose { FOR_LOADS, FOR_STORES };

/* Return true if the UD or DU chain headed by LINK is non-empty,
   and every entry on the chain references an insn that is a
   register swap.  Furthermore, if PURPOSE is FOR_LOADS, each such
   register swap must have only permuting loads as reaching defs.
   If PURPOSE is FOR_STORES, each such register swap must have only
   register swaps or permuting stores as reached uses.  */
static bool
chain_contains_only_swaps (swap_web_entry *insn_entry, struct df_link *link,
			   enum chain_purpose purpose)
{
  if (!link)
    return false;

  for (; link; link = link->next)
    {
      if (!ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (DF_REF_REG (link->ref))))
	continue;

      if (DF_REF_IS_ARTIFICIAL (link->ref))
	return false;

      rtx reached_insn = DF_REF_INSN (link->ref);
      unsigned uid = INSN_UID (reached_insn);
      struct df_insn_info *insn_info = DF_INSN_INFO_GET (reached_insn);

      if (!insn_entry[uid].is_swap || insn_entry[uid].is_load
	  || insn_entry[uid].is_store)
	return false;

      if (purpose == FOR_LOADS)
	{
	  df_ref use;
	  FOR_EACH_INSN_INFO_USE (use, insn_info)
	    {
	      struct df_link *swap_link = DF_REF_CHAIN (use);

	      while (swap_link)
		{
		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
		    return false;

		  rtx swap_def_insn = DF_REF_INSN (swap_link->ref);
		  unsigned uid2 = INSN_UID (swap_def_insn);

		  /* Only permuting loads are allowed.  */
		  if (!insn_entry[uid2].is_swap || !insn_entry[uid2].is_load)
		    return false;

		  swap_link = swap_link->next;
		}
	    }
	}
      else if (purpose == FOR_STORES)
	{
	  df_ref def;
	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
	    {
	      struct df_link *swap_link = DF_REF_CHAIN (def);

	      while (swap_link)
		{
		  if (DF_REF_IS_ARTIFICIAL (swap_link->ref))
		    return false;

		  rtx swap_use_insn = DF_REF_INSN (swap_link->ref);
		  unsigned uid2 = INSN_UID (swap_use_insn);

		  /* Permuting stores or register swaps are allowed.  */
		  if (!insn_entry[uid2].is_swap || insn_entry[uid2].is_load)
		    return false;

		  swap_link = swap_link->next;
		}
	    }
	}
    }

  return true;
}

/* Mark the xxswapdi instructions associated with permuting loads and
   stores for removal.  Note that we only flag them for deletion here,
   as there is a possibility of a swap being reached from multiple
   loads, etc.  */
static void
mark_swaps_for_removal (swap_web_entry *insn_entry, unsigned int i)
{
  rtx insn = insn_entry[i].insn;
  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);

  if (insn_entry[i].is_load)
    {
      df_ref def;
      FOR_EACH_INSN_INFO_DEF (def, insn_info)
	{
	  struct df_link *link = DF_REF_CHAIN (def);

	  /* We know by now that these are swaps, so we can delete
	     them confidently.  */
	  while (link)
	    {
	      rtx use_insn = DF_REF_INSN (link->ref);
	      insn_entry[INSN_UID (use_insn)].will_delete = 1;
	      link = link->next;
	    }
	}
    }
  else if (insn_entry[i].is_store)
    {
      df_ref use;
      FOR_EACH_INSN_INFO_USE (use, insn_info)
	{
	  /* Ignore uses for addressability.  */
	  machine_mode mode = GET_MODE (DF_REF_REG (use));
	  if (!ALTIVEC_OR_VSX_VECTOR_MODE (mode))
	    continue;

	  struct df_link *link = DF_REF_CHAIN (use);

	  /* We know by now that these are swaps, so we can delete
	     them confidently.  */
	  while (link)
	    {
	      rtx def_insn = DF_REF_INSN (link->ref);
	      insn_entry[INSN_UID (def_insn)].will_delete = 1;
	      link = link->next;
	    }
	}
    }
}

/* *OP_PTR is either a CONST_VECTOR or an expression containing one.
   Swap the first half of the vector with the second in the first
   case.  Recurse to find it in the second.  */
static void
swap_const_vector_halves (rtx *op_ptr)
{
  int i;
  rtx op = *op_ptr;
  enum rtx_code code = GET_CODE (op);
  if (GET_CODE (op) == CONST_VECTOR)
    {
      int units = GET_MODE_NUNITS (GET_MODE (op));
      rtx_vector_builder builder (GET_MODE (op), units, 1);
      for (i = 0; i < units / 2; ++i)
	builder.quick_push (CONST_VECTOR_ELT (op, i + units / 2));
      for (i = 0; i < units / 2; ++i)
	builder.quick_push (CONST_VECTOR_ELT (op, i));
      *op_ptr = builder.build ();
    }
  else
    {
      int j;
      const char *fmt = GET_RTX_FORMAT (code);
      for (i = 0; i < GET_RTX_LENGTH (code); ++i)
	if (fmt[i] == 'e' || fmt[i] == 'u')
	  swap_const_vector_halves (&XEXP (op, i));
	else if (fmt[i] == 'E')
	  for (j = 0; j < XVECLEN (op, i); ++j)
	    swap_const_vector_halves (&XVECEXP (op, i, j));
    }
}
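
/* For example (illustrative only), the V4SI constant {0, 1, 2, 3} is
   rebuilt above as {2, 3, 0, 1}: elements 2 and 3 are pushed first,
   then elements 0 and 1, exchanging the two doubleword halves.  */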

/* Find all subregs of a vector expression that perform a narrowing,
   and adjust the subreg index to account for doubleword swapping.  */
static void
adjust_subreg_index (rtx op)
{
  enum rtx_code code = GET_CODE (op);
  if (code == SUBREG
      && (GET_MODE_SIZE (GET_MODE (op))
	  < GET_MODE_SIZE (GET_MODE (XEXP (op, 0)))))
    {
      unsigned int index = SUBREG_BYTE (op);
      if (index < 8)
	index += 8;
      else
	index -= 8;
      SUBREG_BYTE (op) = index;
    }

  const char *fmt = GET_RTX_FORMAT (code);
  int i, j;
  for (i = 0; i < GET_RTX_LENGTH (code); ++i)
    if (fmt[i] == 'e' || fmt[i] == 'u')
      adjust_subreg_index (XEXP (op, i));
    else if (fmt[i] == 'E')
      for (j = 0; j < XVECLEN (op, i); ++j)
	adjust_subreg_index (XVECEXP (op, i, j));
}
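
/* For example (illustrative only), (subreg:DI (reg:V2DI x) 0) names
   one doubleword of a vector register.  With the doublewords swapped
   in the register, that value now lives in the other half, so the
   adjustment above rewrites byte offset 0 to 8 (and 8 to 0).  */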

/* Convert the non-permuting load INSN to a permuting one.  */
static void
permute_load (rtx_insn *insn)
{
  rtx body = PATTERN (insn);
  rtx mem_op = SET_SRC (body);
  rtx tgt_reg = SET_DEST (body);
  machine_mode mode = GET_MODE (tgt_reg);
  int n_elts = GET_MODE_NUNITS (mode);
  int half_elts = n_elts / 2;
  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
  int i, j;
  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
    XVECEXP (par, 0, i) = GEN_INT (j);
  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
    XVECEXP (par, 0, i) = GEN_INT (j);
  rtx sel = gen_rtx_VEC_SELECT (mode, mem_op, par);
  SET_SRC (body) = sel;
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Replacing load %d with permuted load\n",
	     INSN_UID (insn));
}

/* Convert the non-permuting store INSN to a permuting one.  */
static void
permute_store (rtx_insn *insn)
{
  rtx body = PATTERN (insn);
  rtx src_reg = SET_SRC (body);
  machine_mode mode = GET_MODE (src_reg);
  int n_elts = GET_MODE_NUNITS (mode);
  int half_elts = n_elts / 2;
  rtx par = gen_rtx_PARALLEL (mode, rtvec_alloc (n_elts));
  int i, j;
  for (i = 0, j = half_elts; i < half_elts; ++i, ++j)
    XVECEXP (par, 0, i) = GEN_INT (j);
  for (i = half_elts, j = 0; j < half_elts; ++i, ++j)
    XVECEXP (par, 0, i) = GEN_INT (j);
  rtx sel = gen_rtx_VEC_SELECT (mode, src_reg, par);
  SET_SRC (body) = sel;
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Replacing store %d with permuted store\n",
	     INSN_UID (insn));
}

/* Given INSN containing a vector extract operation, adjust the index
   of the extracted lane to account for the doubleword swap.  */
static void
adjust_extract (rtx_insn *insn)
{
  rtx pattern = PATTERN (insn);
  if (GET_CODE (pattern) == PARALLEL)
    pattern = XVECEXP (pattern, 0, 0);
  rtx src = SET_SRC (pattern);
  /* The vec_select may be wrapped in a vec_duplicate for a splat, so
     account for that.  */
  rtx sel = GET_CODE (src) == VEC_DUPLICATE ? XEXP (src, 0) : src;
  rtx par = XEXP (sel, 1);
  int half_elts = GET_MODE_NUNITS (GET_MODE (XEXP (sel, 0))) >> 1;
  int lane = INTVAL (XVECEXP (par, 0, 0));
  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
  XVECEXP (par, 0, 0) = GEN_INT (lane);
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Changing lane for extract %d\n", INSN_UID (insn));
}
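
/* For example (illustrative only), extracting lane 1 of a V4SI value
   (half_elts = 2) becomes an extract of lane 3 after the adjustment
   above, since the doubleword holding elements 0-1 has traded places
   with the one holding elements 2-3.  */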

/* Given INSN containing a vector direct-splat operation, adjust the
   index of the source lane to account for the doubleword swap.  */
static void
adjust_splat (rtx_insn *insn)
{
  rtx body = PATTERN (insn);
  rtx unspec = XEXP (body, 1);
  int half_elts = GET_MODE_NUNITS (GET_MODE (unspec)) >> 1;
  int lane = INTVAL (XVECEXP (unspec, 0, 1));
  lane = lane >= half_elts ? lane - half_elts : lane + half_elts;
  XVECEXP (unspec, 0, 1) = GEN_INT (lane);
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Changing lane for splat %d\n", INSN_UID (insn));
}

/* Given INSN containing an XXPERMDI operation (that is not a doubleword
   swap), reverse the order of the source operands and adjust the indices
   of the source lanes to account for doubleword reversal.  */
static void
adjust_xxpermdi (rtx_insn *insn)
{
  rtx set = PATTERN (insn);
  rtx select = XEXP (set, 1);
  rtx concat = XEXP (select, 0);
  rtx src0 = XEXP (concat, 0);
  XEXP (concat, 0) = XEXP (concat, 1);
  XEXP (concat, 1) = src0;
  rtx parallel = XEXP (select, 1);
  int lane0 = INTVAL (XVECEXP (parallel, 0, 0));
  int lane1 = INTVAL (XVECEXP (parallel, 0, 1));
  int new_lane0 = 3 - lane1;
  int new_lane1 = 3 - lane0;
  XVECEXP (parallel, 0, 0) = GEN_INT (new_lane0);
  XVECEXP (parallel, 0, 1) = GEN_INT (new_lane1);
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Changing lanes for xxpermdi %d\n", INSN_UID (insn));
}
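
/* For example (illustrative only), an xxpermdi selecting lanes (0, 2)
   from the concatenation of s0 and s1 becomes, after the adjustment
   above, a selection of lanes (3 - 2, 3 - 0) = (1, 3) from the
   concatenation of s1 and s0, naming the same doublewords once both
   inputs have their halves swapped.  */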

/* Given INSN containing a VEC_CONCAT operation of two doublewords,
   reverse the order of those inputs.  */
static void
adjust_concat (rtx_insn *insn)
{
  rtx set = PATTERN (insn);
  rtx concat = XEXP (set, 1);
  rtx src0 = XEXP (concat, 0);
  XEXP (concat, 0) = XEXP (concat, 1);
  XEXP (concat, 1) = src0;
  INSN_CODE (insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (insn);

  if (dump_file)
    fprintf (dump_file, "Reversing inputs for concat %d\n", INSN_UID (insn));
}

/* Given an UNSPEC_VPERM insn, modify the mask loaded from the
   constant pool to reflect swapped doublewords.  */
static void
adjust_vperm (rtx_insn *insn)
{
  /* We previously determined that the UNSPEC_VPERM was fed by a
     swap of a swapping load of a TOC-relative constant pool symbol.
     Find the MEM in the swapping load and replace it with a MEM for
     the adjusted mask constant.  */
  rtx set = PATTERN (insn);
  rtx mask_reg = XVECEXP (SET_SRC (set), 0, 2);

  /* Find the swap.  */
  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
  df_ref use;
  rtx_insn *swap_insn = 0;
  FOR_EACH_INSN_INFO_USE (use, insn_info)
    if (rtx_equal_p (DF_REF_REG (use), mask_reg))
      {
	struct df_link *def_link = DF_REF_CHAIN (use);
	gcc_assert (def_link && !def_link->next);
	swap_insn = DF_REF_INSN (def_link->ref);
	break;
      }
  gcc_assert (swap_insn);

  /* Find the load.  */
  insn_info = DF_INSN_INFO_GET (swap_insn);
  rtx_insn *load_insn = 0;
  FOR_EACH_INSN_INFO_USE (use, insn_info)
    {
      struct df_link *def_link = DF_REF_CHAIN (use);
      gcc_assert (def_link && !def_link->next);
      load_insn = DF_REF_INSN (def_link->ref);
      break;
    }
  gcc_assert (load_insn);

  /* Find the TOC-relative symbol access.  */
  insn_info = DF_INSN_INFO_GET (load_insn);
  rtx_insn *tocrel_insn = 0;
  FOR_EACH_INSN_INFO_USE (use, insn_info)
    {
      struct df_link *def_link = DF_REF_CHAIN (use);
      gcc_assert (def_link && !def_link->next);
      tocrel_insn = DF_REF_INSN (def_link->ref);
      break;
    }
  gcc_assert (tocrel_insn);

  /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
     to set tocrel_base; otherwise it would be unnecessary as we've
     already established it will return true.  */
  rtx base, offset;
  const_rtx tocrel_base;
  rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
  /* There is an extra level of indirection for small/large code models.  */
  if (MEM_P (tocrel_expr))
    tocrel_expr = XEXP (tocrel_expr, 0);
  if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
    gcc_unreachable ();
  split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
  rtx const_vector = get_pool_constant (base);
  /* With the extra indirection, get_pool_constant will produce the
     real constant from the reg_equal expression, so get the real
     constant.  */
  if (SYMBOL_REF_P (const_vector))
    const_vector = get_pool_constant (const_vector);
  gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);

  /* Create an adjusted mask from the initial mask.  */
  unsigned int new_mask[16], i, val;
  for (i = 0; i < 16; ++i) {
    val = INTVAL (XVECEXP (const_vector, 0, i));
    if (val < 16)
      new_mask[i] = (val + 8) % 16;
    else
      new_mask[i] = ((val + 8) % 16) + 16;
  }

  /* Create a new CONST_VECTOR and a MEM that references it.  */
  rtx vals = gen_rtx_PARALLEL (V16QImode, rtvec_alloc (16));
  for (i = 0; i < 16; ++i)
    XVECEXP (vals, 0, i) = GEN_INT (new_mask[i]);
  rtx new_const_vector = gen_rtx_CONST_VECTOR (V16QImode, XVEC (vals, 0));
  rtx new_mem = force_const_mem (V16QImode, new_const_vector);
  /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
     can't recognize.  Force the SYMBOL_REF into a register.  */
  if (!REG_P (XEXP (new_mem, 0))) {
    rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
    XEXP (new_mem, 0) = base_reg;
    /* Move the newly created insn ahead of the load insn.  */
    rtx_insn *force_insn = get_last_insn ();
    remove_insn (force_insn);
    rtx_insn *before_load_insn = PREV_INSN (load_insn);
    add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn));
    df_insn_rescan (before_load_insn);
    df_insn_rescan (force_insn);
  }

  /* Replace the MEM in the load instruction and rescan it.  */
  XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
  INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
  df_insn_rescan (load_insn);

  if (dump_file)
    fprintf (dump_file, "Adjusting mask for vperm %d\n", INSN_UID (insn));
}
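
/* For example (illustrative only), a mask byte selecting element 3
   of the first source is rewritten above as (3 + 8) % 16 = 11, while
   one selecting element 20 (element 4 of the second source) becomes
   ((20 + 8) % 16) + 16 = 28, mirroring the M -> M' transformation
   described in the "Notes on Permutes" commentary at the top of this
   file.  */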

/* The insn described by INSN_ENTRY[I] can be swapped, but only
   with special handling.  Take care of that here.  */
static void
handle_special_swappables (swap_web_entry *insn_entry, unsigned i)
{
  rtx_insn *insn = insn_entry[i].insn;
  rtx body = PATTERN (insn);

  switch (insn_entry[i].special_handling)
    {
    default:
      gcc_unreachable ();
    case SH_CONST_VECTOR:
      {
	/* A CONST_VECTOR will only show up somewhere in the RHS of a SET.  */
	gcc_assert (GET_CODE (body) == SET);
	swap_const_vector_halves (&SET_SRC (body));
	if (dump_file)
	  fprintf (dump_file, "Swapping constant halves in insn %d\n", i);
	break;
      }
    case SH_SUBREG:
      /* A subreg of the same size is already safe.  For subregs that
	 select a smaller portion of a reg, adjust the index for
	 swapped doublewords.  */
      adjust_subreg_index (body);
      if (dump_file)
	fprintf (dump_file, "Adjusting subreg in insn %d\n", i);
      break;
    case SH_NOSWAP_LD:
      /* Convert a non-permuting load to a permuting one.  */
      permute_load (insn);
      break;
    case SH_NOSWAP_ST:
      /* Convert a non-permuting store to a permuting one.  */
      permute_store (insn);
      break;
    case SH_EXTRACT:
      /* Change the lane on an extract operation.  */
      adjust_extract (insn);
      break;
    case SH_SPLAT:
      /* Change the lane on a direct-splat operation.  */
      adjust_splat (insn);
      break;
    case SH_XXPERMDI:
      /* Change the lanes on an XXPERMDI operation.  */
      adjust_xxpermdi (insn);
      break;
    case SH_CONCAT:
      /* Reverse the order of a concatenation operation.  */
      adjust_concat (insn);
      break;
    case SH_VPERM:
      /* Change the mask loaded from the constant pool for a VPERM.  */
      adjust_vperm (insn);
      break;
    }
}

/* Find the insn from the Ith table entry, which is known to be a
   register swap Y = SWAP(X).  Replace it with a copy Y = X.  */
static void
replace_swap_with_copy (swap_web_entry *insn_entry, unsigned i)
{
  rtx_insn *insn = insn_entry[i].insn;
  rtx body = PATTERN (insn);
  rtx src_reg = XEXP (SET_SRC (body), 0);
  rtx copy = gen_rtx_SET (SET_DEST (body), src_reg);
  rtx_insn *new_insn = emit_insn_before (copy, insn);
  set_block_for_insn (new_insn, BLOCK_FOR_INSN (insn));
  df_insn_rescan (new_insn);

  if (dump_file)
    {
      unsigned int new_uid = INSN_UID (new_insn);
      fprintf (dump_file, "Replacing swap %d with copy %d\n", i, new_uid);
    }

  df_insn_delete (insn);
  remove_insn (insn);
  insn->set_deleted ();
}
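
/* For example (illustrative only), the swap

     (set (reg:V4SI 100)
	  (vec_select:V4SI (reg:V4SI 101)
			   (parallel [(const_int 2) (const_int 3)
				      (const_int 0) (const_int 1)])))

   is replaced above by the plain copy

     (set (reg:V4SI 100) (reg:V4SI 101))

   which forward propagation can usually eliminate entirely.  */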

/* INSN is known to contain a SUBREG, which we can normally handle,
   but if the SUBREG itself contains a MULT then we need to leave it alone
   to avoid turning a mult_hipart into a mult_lopart, for example.  */
static bool
has_part_mult (rtx_insn *insn)
{
  rtx body = PATTERN (insn);
  if (GET_CODE (body) != SET)
    return false;
  rtx src = SET_SRC (body);
  if (GET_CODE (src) != SUBREG)
    return false;
  rtx inner = XEXP (src, 0);
  return (GET_CODE (inner) == MULT);
}

/* Make NEW_MEM_EXP's attributes and flags resemble those of
   ORIGINAL_MEM_EXP.  */
static void
mimic_memory_attributes_and_flags (rtx new_mem_exp, const_rtx original_mem_exp)
{
  RTX_FLAG (new_mem_exp, jump) = RTX_FLAG (original_mem_exp, jump);
  RTX_FLAG (new_mem_exp, call) = RTX_FLAG (original_mem_exp, call);
  RTX_FLAG (new_mem_exp, unchanging) = RTX_FLAG (original_mem_exp, unchanging);
  RTX_FLAG (new_mem_exp, volatil) = RTX_FLAG (original_mem_exp, volatil);
  RTX_FLAG (new_mem_exp, frame_related) =
    RTX_FLAG (original_mem_exp, frame_related);

  /* The following fields may not be used with MEM subexpressions.  */
  RTX_FLAG (new_mem_exp, in_struct) = RTX_FLAG (original_mem_exp, in_struct);
  RTX_FLAG (new_mem_exp, return_val) = RTX_FLAG (original_mem_exp, return_val);

  struct mem_attrs original_attrs = *get_mem_attrs (original_mem_exp);

  alias_set_type set = original_attrs.alias;
  set_mem_alias_set (new_mem_exp, set);

  addr_space_t addrspace = original_attrs.addrspace;
  set_mem_addr_space (new_mem_exp, addrspace);

  unsigned int align = original_attrs.align;
  set_mem_align (new_mem_exp, align);

  tree expr = original_attrs.expr;
  set_mem_expr (new_mem_exp, expr);

  if (original_attrs.offset_known_p)
    {
      HOST_WIDE_INT offset = original_attrs.offset;
      set_mem_offset (new_mem_exp, offset);
    }
  else
    clear_mem_offset (new_mem_exp);

  if (original_attrs.size_known_p)
    {
      HOST_WIDE_INT size = original_attrs.size;
      set_mem_size (new_mem_exp, size);
    }
  else
    clear_mem_size (new_mem_exp);
}
1566 
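/* The copying above matters because the generator functions used below
   build a fresh MEM with default attributes.  If, hypothetically, the
   original MEM carried MEM_ALIGN of 128 and a precise alias set,
   dropping them on the new MEM would pessimize later alias analysis
   and defeat alignment checks such as the MEM_ALIGN >= 128 assertion
   in replace_swapped_aligned_store.  */
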
1567 /* Generate an rtx expression to represent use of the stvx insn to store
1568    the value represented by register SRC_EXP into the memory represented
1569    by DEST_EXP, with vector mode MODE.  */
1570 rtx
1571 rs6000_gen_stvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
1572 {
1573   rtx stvx;
1574 
1575   if (mode == V16QImode)
1576     stvx = gen_altivec_stvx_v16qi (src_exp, dest_exp);
1577   else if (mode == V8HImode)
1578     stvx = gen_altivec_stvx_v8hi (src_exp, dest_exp);
1579 #ifdef HAVE_V8HFmode
1580   else if (mode == V8HFmode)
1581     stvx = gen_altivec_stvx_v8hf (src_exp, dest_exp);
1582 #endif
1583   else if (mode == V4SImode)
1584     stvx = gen_altivec_stvx_v4si (src_exp, dest_exp);
1585   else if (mode == V4SFmode)
1586     stvx = gen_altivec_stvx_v4sf (src_exp, dest_exp);
1587   else if (mode == V2DImode)
1588     stvx = gen_altivec_stvx_v2di (src_exp, dest_exp);
1589   else if (mode == V2DFmode)
1590     stvx = gen_altivec_stvx_v2df (src_exp, dest_exp);
1591   else if (mode == V1TImode)
1592     stvx = gen_altivec_stvx_v1ti (src_exp, dest_exp);
1593   else
1594     /* KFmode, TFmode, other modes not expected in this context.  */
1595     gcc_unreachable ();
1596 
1597   rtx new_mem_exp = SET_DEST (PATTERN (stvx));
1598   mimic_memory_attributes_and_flags (new_mem_exp, dest_exp);
1599   return stvx;
1600 }
1601 
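/* At the assembly level (register numbers arbitrary), the intended
   effect of using stvx is to collapse an aligned store-with-swap
   sequence such as

     xxpermdi 0,34,34,2
     stxvd2x  0,0,9

   into the single element-order-preserving store

     stvx 2,0,9

   which is valid because stvx to a 16-byte-aligned address stores the
   doublewords in their natural order.  */
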
1602 /* Given that STORE_INSN represents an aligned store-with-swap of a
1603    swapped value, replace the store with an aligned store (without
1604    swap) and replace the swap with a copy insn.  */
1605 static void
1606 replace_swapped_aligned_store (swap_web_entry *insn_entry,
1607 			       rtx_insn *store_insn)
1608 {
1609   unsigned uid = INSN_UID (store_insn);
1610   gcc_assert (insn_entry[uid].is_swap && insn_entry[uid].is_store);
1611 
1612   rtx body = PATTERN (store_insn);
1613   rtx dest_address = XEXP (SET_DEST (body), 0);
1614   rtx swap_reg = XEXP (SET_SRC (body), 0);
1615   gcc_assert (REG_P (dest_address)
1616 	      || rs6000_sum_of_two_registers_p (dest_address));
1617 
1618   /* Find the swap instruction that provides the value to be stored by
1619      this store-with-swap instruction.  */
1620   struct df_insn_info *insn_info = DF_INSN_INFO_GET (store_insn);
1621   df_ref use;
1622   rtx_insn *swap_insn = NULL;
1623   unsigned uid2 = 0;
1624   FOR_EACH_INSN_INFO_USE (use, insn_info)
1625     {
1626       struct df_link *def_link = DF_REF_CHAIN (use);
1627 
1628       /* If this is not the definition of the candidate swap register,
1629 	 then skip it.  We are only interested in the swap insn.  */
1630       if (!rtx_equal_p (DF_REF_REG (use), swap_reg))
1631 	continue;
1632 
1633       /* If there is no def or the def is artificial or there are
1634 	 multiple defs, we should not be here.  */
1635       gcc_assert (def_link && def_link->ref && !def_link->next
1636 		  && !DF_REF_IS_ARTIFICIAL (def_link->ref));
1637 
1638       swap_insn = DF_REF_INSN (def_link->ref);
1639       uid2 = INSN_UID (swap_insn);
1640 
1641       /* If this source value is not a simple swap, we should not be here.  */
1642       gcc_assert (insn_entry[uid2].is_swap && !insn_entry[uid2].is_load
1643 		  && !insn_entry[uid2].is_store);
1644 
1645       /* We've processed the use we care about, so break out of
1646 	 this loop.  */
1647       break;
1648     }
1649 
1650   /* At this point, swap_insn and uid2 represent the swap instruction
1651      that feeds the store.  */
1652   gcc_assert (swap_insn);
1653   rtx set = single_set (store_insn);
1654   gcc_assert (set);
1655   rtx dest_exp = SET_DEST (set);
1656   rtx src_exp = XEXP (SET_SRC (body), 0);
1657   enum machine_mode mode = GET_MODE (dest_exp);
1658   gcc_assert (MEM_P (dest_exp));
1659   gcc_assert (MEM_ALIGN (dest_exp) >= 128);
1660 
1661   /* Replace the store with a new, non-permuting store insn.  */
1662   rtx stvx;
1663   stvx = rs6000_gen_stvx (mode, dest_exp, src_exp);
1664 
1665   rtx_insn *new_insn = emit_insn_before (stvx, store_insn);
1666   rtx new_body = PATTERN (new_insn);
1667 
1668   gcc_assert ((GET_CODE (new_body) == SET)
1669 	      && MEM_P (SET_DEST (new_body)));
1670 
1671   set_block_for_insn (new_insn, BLOCK_FOR_INSN (store_insn));
1672   df_insn_rescan (new_insn);
1673 
1674   df_insn_delete (store_insn);
1675   remove_insn (store_insn);
1676   store_insn->set_deleted ();
1677 
1678   /* Replace the swap with a copy.  */
1679   uid2 = INSN_UID (swap_insn);
1680   mark_swaps_for_removal (insn_entry, uid2);
1681   replace_swap_with_copy (insn_entry, uid2);
1682 }
1683 
1684 /* Generate an rtx expression to represent use of the lvx insn to load
1685    from memory SRC_EXP into register DEST_EXP with vector mode MODE.  */
1686 rtx
1687 rs6000_gen_lvx (enum machine_mode mode, rtx dest_exp, rtx src_exp)
1688 {
1689   rtx lvx;
1690 
1691   if (mode == V16QImode)
1692     lvx = gen_altivec_lvx_v16qi (dest_exp, src_exp);
1693   else if (mode == V8HImode)
1694     lvx = gen_altivec_lvx_v8hi (dest_exp, src_exp);
1695 #ifdef HAVE_V8HFmode
1696   else if (mode == V8HFmode)
1697     lvx = gen_altivec_lvx_v8hf (dest_exp, src_exp);
1698 #endif
1699   else if (mode == V4SImode)
1700     lvx = gen_altivec_lvx_v4si (dest_exp, src_exp);
1701   else if (mode == V4SFmode)
1702     lvx = gen_altivec_lvx_v4sf (dest_exp, src_exp);
1703   else if (mode == V2DImode)
1704     lvx = gen_altivec_lvx_v2di (dest_exp, src_exp);
1705   else if (mode == V2DFmode)
1706     lvx = gen_altivec_lvx_v2df (dest_exp, src_exp);
1707   else if (mode == V1TImode)
1708     lvx = gen_altivec_lvx_v1ti (dest_exp, src_exp);
1709   else
1710     /* KFmode, TFmode, other modes not expected in this context.  */
1711     gcc_unreachable ();
1712 
1713   rtx new_mem_exp = SET_SRC (PATTERN (lvx));
1714   mimic_memory_attributes_and_flags (new_mem_exp, src_exp);
1715 
1716   return lvx;
1717 }
1718 
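/* The load counterpart of the stvx sketch above: when the address is
   known to be 16-byte aligned, the pair

     lxvd2x   0,0,9
     xxpermdi 34,0,0,2

   can be replaced by the single instruction

     lvx 2,0,9  */
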
1719 /* Given that SWAP_INSN represents a swap of an aligned
1720    load-with-swap, replace the load with an aligned load (without
1721    swap) and replace the swap with a copy insn.  */
1722 static void
1723 replace_swapped_aligned_load (swap_web_entry *insn_entry, rtx swap_insn)
1724 {
1725   /* Find the load.  */
1726   unsigned uid = INSN_UID (swap_insn);
1727   /* Only call this if quad_aligned_load_p (swap_insn).  */
1728   gcc_assert (insn_entry[uid].is_swap && !insn_entry[uid].is_load);
1729   struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
1730 
1731   /* Since SWAP_INSN is known to represent a swap instruction, we
1732      know it "uses" only one input variable.  */
1733   df_ref use = DF_INSN_INFO_USES (insn_info);
1734 
1735   /* Figure out where this input variable is defined.  */
1736   struct df_link *def_link = DF_REF_CHAIN (use);
1737   /* There must be exactly one non-artificial definition.  */
1738   gcc_assert (def_link && def_link->ref && !def_link->next
1739 	      && !DF_REF_IS_ARTIFICIAL (def_link->ref));
1740 
1741   rtx_insn *def_insn = DF_REF_INSN (def_link->ref);
1742   unsigned uid2 = INSN_UID (def_insn);
1743 
1744   /* We're expecting a load-with-swap insn.  */
1745   gcc_assert (insn_entry[uid2].is_load && insn_entry[uid2].is_swap);
1746 
1747   /* We expect this to be a set from memory, with the source representing
1748      a swapped load (indicated by code VEC_SELECT).  */
1749   rtx body = PATTERN (def_insn);
1750   gcc_assert ((GET_CODE (body) == SET)
1751 	      && (GET_CODE (SET_SRC (body)) == VEC_SELECT)
1752 	      && MEM_P (XEXP (SET_SRC (body), 0)));
1753 
1754   rtx src_exp = XEXP (SET_SRC (body), 0);
1755   enum machine_mode mode = GET_MODE (src_exp);
1756   rtx lvx = rs6000_gen_lvx (mode, SET_DEST (body), src_exp);
1757 
1758   rtx_insn *new_insn = emit_insn_before (lvx, def_insn);
1759   rtx new_body = PATTERN (new_insn);
1760 
1761   gcc_assert ((GET_CODE (new_body) == SET)
1762 	      && MEM_P (SET_SRC (new_body)));
1763 
1764   set_block_for_insn (new_insn, BLOCK_FOR_INSN (def_insn));
1765   df_insn_rescan (new_insn);
1766 
1767   df_insn_delete (def_insn);
1768   remove_insn (def_insn);
1769   def_insn->set_deleted ();
1770 
1771   /* Replace the swap with a copy.  */
1772   mark_swaps_for_removal (insn_entry, uid);
1773   replace_swap_with_copy (insn_entry, uid);
1774 }
1775 
1776 /* Given that SWAP_INSN represents a swap of a load of a constant
1777    vector value, replace with a single instruction that loads a
1778    swapped variant of the original constant.
1779 
1780    The "natural" representation of a byte array in memory is the same
1781    for big endian and little endian.
1782 
1783    unsigned char byte_array[] =
1784      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f };
1785 
1786    However, when loaded into a vector register, the representation
1787    depends on endian conventions.
1788 
1789    In big-endian mode, the register holds:
1790 
1791      MSB                                            LSB
1792      [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
1793 
1794    In little-endian mode, the register holds:
1795 
1796      MSB                                            LSB
1797      [ f, e, d, c, b, a, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 ]
1798 
1799    Word arrays require different handling.  Consider the word array:
1800 
1801    unsigned int word_array[] =
1802      { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
1803 
1804    The in-memory representation depends on endian configuration.  The
1805    equivalent array, declared as a byte array, in memory would be:
1806 
1807    unsigned char big_endian_word_array_data[] =
1808      { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f }
1809 
1810    unsigned char little_endian_word_array_data[] =
1811      { 3, 2, 1, 0, 7, 6, 5, 4, b, a, 9, 8, f, e, d, c }
1812 
1813    In big-endian mode, the register holds:
1814 
1815      MSB                                            LSB
1816      [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f ]
1817 
1818    In little-endian mode, the register holds:
1819 
1820      MSB                                            LSB
1821      [ c, d, e, f, 8, 9, a, b, 4, 5, 6, 7, 0, 1, 2, 3 ]
1822 
1823 
1824   Similar transformations apply to the vector of half-word and vector
1825   of double-word representations.
1826 
1827   For now, don't handle vectors of quadword values (V1TImode).  Just return.
1828   A better solution is to fix the code generator to emit lvx/stvx for
1829   those.  */
1830 static void
1831 replace_swapped_load_constant (swap_web_entry *insn_entry, rtx swap_insn)
1832 {
1833   /* Find the load.  */
1834   struct df_insn_info *insn_info = DF_INSN_INFO_GET (swap_insn);
1835   rtx_insn *load_insn;
1836   df_ref use = DF_INSN_INFO_USES (insn_info);
1837   struct df_link *def_link = DF_REF_CHAIN (use);
1838   gcc_assert (def_link && !def_link->next);
1839 
1840   load_insn = DF_REF_INSN (def_link->ref);
1841   gcc_assert (load_insn);
1842 
1843   /* Find the TOC-relative symbol access.  */
1844   insn_info = DF_INSN_INFO_GET (load_insn);
1845   use = DF_INSN_INFO_USES (insn_info);
1846 
1847   def_link = DF_REF_CHAIN (use);
1848   gcc_assert (def_link && !def_link->next);
1849 
1850   rtx_insn *tocrel_insn = DF_REF_INSN (def_link->ref);
1851   gcc_assert (tocrel_insn);
1852 
1853   /* Find the embedded CONST_VECTOR.  We have to call toc_relative_expr_p
1854      to set tocrel_base; otherwise it would be unnecessary as we've
1855      already established it will return true.  */
1856   rtx base, offset;
1857   rtx tocrel_expr = SET_SRC (PATTERN (tocrel_insn));
1858   const_rtx tocrel_base;
1859 
1860   /* There is an extra level of indirection for small/large code models.  */
1861   if (MEM_P (tocrel_expr))
1862     tocrel_expr = XEXP (tocrel_expr, 0);
1863 
1864   if (!toc_relative_expr_p (tocrel_expr, false, &tocrel_base, NULL))
1865     gcc_unreachable ();
1866 
1867   split_const (XVECEXP (tocrel_base, 0, 0), &base, &offset);
1868   rtx const_vector = get_pool_constant (base);
1869 
1870   /* With the extra indirection, get_pool_constant can return the
1871      SYMBOL_REF from the reg_equal expression rather than the constant
1872      itself, so unwrap it to get the real constant.  */
1873   if (SYMBOL_REF_P (const_vector))
1874     const_vector = get_pool_constant (const_vector);
1875   gcc_assert (GET_CODE (const_vector) == CONST_VECTOR);
1876 
1877   rtx new_mem;
1878   enum machine_mode mode = GET_MODE (const_vector);
1879 
1880   /* Create an adjusted constant from the original constant.  */
1881   if (mode == V1TImode)
1882     /* Leave this code as is.  */
1883     return;
1884   else if (mode == V16QImode)
1885     {
1886       rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (16));
1887       int i;
1888 
1889       for (i = 0; i < 16; i++)
1890 	XVECEXP (vals, 0, ((i+8) % 16)) = XVECEXP (const_vector, 0, i);
1891       rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1892       new_mem = force_const_mem (mode, new_const_vector);
1893     }
1894   else if ((mode == V8HImode)
1895 #ifdef HAVE_V8HFmode
1896 	   || (mode == V8HFmode)
1897 #endif
1898 	   )
1899     {
1900       rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (8));
1901       int i;
1902 
1903       for (i = 0; i < 8; i++)
1904 	XVECEXP (vals, 0, ((i+4) % 8)) = XVECEXP (const_vector, 0, i);
1905       rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1906       new_mem = force_const_mem (mode, new_const_vector);
1907     }
1908   else if ((mode == V4SImode) || (mode == V4SFmode))
1909     {
1910       rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (4));
1911       int i;
1912 
1913       for (i = 0; i < 4; i++)
1914 	XVECEXP (vals, 0, ((i+2) % 4)) = XVECEXP (const_vector, 0, i);
1915       rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1916       new_mem = force_const_mem (mode, new_const_vector);
1917     }
1918   else if ((mode == V2DImode) || (mode == V2DFmode))
1919     {
1920       rtx vals = gen_rtx_PARALLEL (mode, rtvec_alloc (2));
1921       int i;
1922 
1923       for (i = 0; i < 2; i++)
1924 	XVECEXP (vals, 0, ((i+1) % 2)) = XVECEXP (const_vector, 0, i);
1925       rtx new_const_vector = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
1926       new_mem = force_const_mem (mode, new_const_vector);
1927     }
1928   else
1929     {
1930       /* We do not expect other modes to be constant-load-swapped.  */
1931       gcc_unreachable ();
1932     }
1933 
1934   /* This gives us a MEM whose base operand is a SYMBOL_REF, which we
1935      can't recognize.  Force the SYMBOL_REF into a register.  */
1936   if (!REG_P (XEXP (new_mem, 0)))
       {
1937       rtx base_reg = force_reg (Pmode, XEXP (new_mem, 0));
1938       XEXP (new_mem, 0) = base_reg;
1939 
1940       /* Move the newly created insn ahead of the load insn.  */
1941       /* The last insn is the insn that forced new_mem into a register.  */
1942       rtx_insn *force_insn = get_last_insn ();
1943       /* Remove this insn from the end of the instruction sequence.  */
1944       remove_insn (force_insn);
1945       rtx_insn *before_load_insn = PREV_INSN (load_insn);
1946 
1947       /* And insert this insn back into the sequence before the previous
1948 	 load insn so this new expression will be available when the
1949 	 existing load is modified to load the swapped constant.  */
1950       add_insn_after (force_insn, before_load_insn, BLOCK_FOR_INSN (load_insn));
1951       df_insn_rescan (before_load_insn);
1952       df_insn_rescan (force_insn);
1953     }
1954 
1955   /* Replace the MEM in the load instruction and rescan it.  */
1956   XEXP (SET_SRC (PATTERN (load_insn)), 0) = new_mem;
1957   INSN_CODE (load_insn) = -1; /* Force re-recognition.  */
1958   df_insn_rescan (load_insn);
1959 
1960   unsigned int uid = INSN_UID (swap_insn);
1961   mark_swaps_for_removal (insn_entry, uid);
1962   replace_swap_with_copy (insn_entry, uid);
1963 }
1964 
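/* A worked example of the V4SI rotation above: for the constant

     { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f }

   element I of the original is placed at position (I + 2) % 4, giving
   the new pool constant

     { 0x08090a0b, 0x0c0d0e0f, 0x00010203, 0x04050607 }.

   Because exchanging the two halves of a V4SI vector is exactly a
   doubleword swap, the permuting load of the rotated constant leaves
   the register holding the same value that the original load/swap pair
   produced, and the swap can become a simple copy.  */
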
1965 /* Dump the swap table to DUMP_FILE.  */
1966 static void
1967 dump_swap_insn_table (swap_web_entry *insn_entry)
1968 {
1969   int e = get_max_uid ();
1970   fprintf (dump_file, "\nRelevant insns with their flag settings\n\n");
1971 
1972   for (int i = 0; i < e; ++i)
1973     if (insn_entry[i].is_relevant)
1974       {
1975 	swap_web_entry *pred_entry = (swap_web_entry *)insn_entry[i].pred ();
1976 	fprintf (dump_file, "%6d %6d  ", i,
1977 		 pred_entry && pred_entry->insn
1978 		 ? INSN_UID (pred_entry->insn) : 0);
1979 	if (insn_entry[i].is_load)
1980 	  fputs ("load ", dump_file);
1981 	if (insn_entry[i].is_store)
1982 	  fputs ("store ", dump_file);
1983 	if (insn_entry[i].is_swap)
1984 	  fputs ("swap ", dump_file);
1985 	if (insn_entry[i].is_live_in)
1986 	  fputs ("live-in ", dump_file);
1987 	if (insn_entry[i].is_live_out)
1988 	  fputs ("live-out ", dump_file);
1989 	if (insn_entry[i].contains_subreg)
1990 	  fputs ("subreg ", dump_file);
1991 	if (insn_entry[i].is_128_int)
1992 	  fputs ("int128 ", dump_file);
1993 	if (insn_entry[i].is_call)
1994 	  fputs ("call ", dump_file);
1995 	if (insn_entry[i].is_swappable)
1996 	  {
1997 	    fputs ("swappable ", dump_file);
1998 	    if (insn_entry[i].special_handling == SH_CONST_VECTOR)
1999 	      fputs ("special:constvec ", dump_file);
2000 	    else if (insn_entry[i].special_handling == SH_SUBREG)
2001 	      fputs ("special:subreg ", dump_file);
2002 	    else if (insn_entry[i].special_handling == SH_NOSWAP_LD)
2003 	      fputs ("special:load ", dump_file);
2004 	    else if (insn_entry[i].special_handling == SH_NOSWAP_ST)
2005 	      fputs ("special:store ", dump_file);
2006 	    else if (insn_entry[i].special_handling == SH_EXTRACT)
2007 	      fputs ("special:extract ", dump_file);
2008 	    else if (insn_entry[i].special_handling == SH_SPLAT)
2009 	      fputs ("special:splat ", dump_file);
2010 	    else if (insn_entry[i].special_handling == SH_XXPERMDI)
2011 	      fputs ("special:xxpermdi ", dump_file);
2012 	    else if (insn_entry[i].special_handling == SH_CONCAT)
2013 	      fputs ("special:concat ", dump_file);
2014 	    else if (insn_entry[i].special_handling == SH_VPERM)
2015 	      fputs ("special:vperm ", dump_file);
2016 	  }
2017 	if (insn_entry[i].web_not_optimizable)
2018 	  fputs ("unoptimizable ", dump_file);
2019 	if (insn_entry[i].will_delete)
2020 	  fputs ("delete ", dump_file);
2021 	fputs ("\n", dump_file);
2022       }
2023   fputs ("\n", dump_file);
2024 }
2025 
2026 /* Return ALIGN with its address canonicalized to (reg) or
2027    (plus reg reg).  Here ALIGN is an (and addr (const_int -16)).
2028    Always return a new copy to avoid problems with combine.  */
2029 static rtx
2030 alignment_with_canonical_addr (rtx align)
2031 {
2032   rtx canon;
2033   rtx addr = XEXP (align, 0);
2034 
2035   if (REG_P (addr))
2036     canon = addr;
2037 
2038   else if (GET_CODE (addr) == PLUS)
2039     {
2040       rtx addrop0 = XEXP (addr, 0);
2041       rtx addrop1 = XEXP (addr, 1);
2042 
2043       if (!REG_P (addrop0))
2044 	addrop0 = force_reg (GET_MODE (addrop0), addrop0);
2045 
2046       if (!REG_P (addrop1))
2047 	addrop1 = force_reg (GET_MODE (addrop1), addrop1);
2048 
2049       canon = gen_rtx_PLUS (GET_MODE (addr), addrop0, addrop1);
2050     }
2051 
2052   else
2053     canon = force_reg (GET_MODE (addr), addr);
2054 
2055   return gen_rtx_AND (GET_MODE (align), canon, GEN_INT (-16));
2056 }
2057 
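/* For instance (operands made up), given

     (and (plus (reg 3) (const_int 48)) (const_int -16))

   the code above forces the const_int addend into a fresh register R
   and returns

     (and (plus (reg 3) (reg R)) (const_int -16))

   so that every alignment rtx produced here has a (reg) or
   (plus reg reg) address.  */
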
2058 /* Check whether an rtx is an alignment mask, and if so, return
2059    a fully-expanded rtx for the masking operation.  */
2060 static rtx
2061 alignment_mask (rtx_insn *insn)
2062 {
2063   rtx body = PATTERN (insn);
2064 
2065   if (GET_CODE (body) != SET
2066       || GET_CODE (SET_SRC (body)) != AND
2067       || !REG_P (XEXP (SET_SRC (body), 0)))
2068     return 0;
2069 
2070   rtx mask = XEXP (SET_SRC (body), 1);
2071 
2072   if (CONST_INT_P (mask))
2073     {
2074       if (INTVAL (mask) == -16)
2075 	return alignment_with_canonical_addr (SET_SRC (body));
2076       else
2077 	return 0;
2078     }
2079 
2080   if (!REG_P (mask))
2081     return 0;
2082 
2083   struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2084   df_ref use;
2085   rtx real_mask = 0;
2086 
2087   FOR_EACH_INSN_INFO_USE (use, insn_info)
2088     {
2089       if (!rtx_equal_p (DF_REF_REG (use), mask))
2090 	continue;
2091 
2092       struct df_link *def_link = DF_REF_CHAIN (use);
2093       if (!def_link || def_link->next)
2094 	return 0;
2095 
2096       rtx_insn *const_insn = DF_REF_INSN (def_link->ref);
2097       rtx const_body = PATTERN (const_insn);
2098       if (GET_CODE (const_body) != SET)
2099 	return 0;
2100 
2101       real_mask = SET_SRC (const_body);
2102 
2103       if (!CONST_INT_P (real_mask)
2104 	  || INTVAL (real_mask) != -16)
2105 	return 0;
2106     }
2107 
2108   if (real_mask == 0)
2109     return 0;
2110 
2111   return alignment_with_canonical_addr (SET_SRC (body));
2112 }
2113 
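/* The use-def walk above handles masks that are materialized in a
   separate insn rather than appearing as an immediate, e.g.

     (set (reg:DI 140) (const_int -16))
     (set (reg:DI 141) (and:DI (reg:DI 142) (reg:DI 140)))

   where the single reaching definition of the mask register must be
   the constant -16 for the AND to qualify.  */
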
2114 /* Given INSN that's a load or store based at BASE_REG, look for a
2115    feeding computation that aligns its address on a 16-byte boundary.
2116    Return the rtx and its containing AND_INSN.  */
2117 static rtx
2118 find_alignment_op (rtx_insn *insn, rtx base_reg, rtx_insn **and_insn)
2119 {
2120   df_ref base_use;
2121   struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2122   rtx and_operation = 0;
2123 
2124   FOR_EACH_INSN_INFO_USE (base_use, insn_info)
2125     {
2126       if (!rtx_equal_p (DF_REF_REG (base_use), base_reg))
2127 	continue;
2128 
2129       struct df_link *base_def_link = DF_REF_CHAIN (base_use);
2130       if (!base_def_link || base_def_link->next)
2131 	break;
2132 
2133       /* With stack-protector code enabled, and possibly in other
2134 	 circumstances, there may not be an associated insn for
2135 	 the def.  */
2136       if (DF_REF_IS_ARTIFICIAL (base_def_link->ref))
2137 	break;
2138 
2139       *and_insn = DF_REF_INSN (base_def_link->ref);
2140       and_operation = alignment_mask (*and_insn);
2141       if (and_operation != 0)
2142 	break;
2143     }
2144 
2145   return and_operation;
2146 }
2147 
2148 struct del_info { bool replace; rtx_insn *replace_insn; };
2149 
2150 /* If INSN is the load for an lvx pattern, put it in canonical form.  */
2151 static void
2152 recombine_lvx_pattern (rtx_insn *insn, del_info *to_delete)
2153 {
2154   rtx body = PATTERN (insn);
2155   gcc_assert (GET_CODE (body) == SET
2156 	      && GET_CODE (SET_SRC (body)) == VEC_SELECT
2157 	      && MEM_P (XEXP (SET_SRC (body), 0)));
2158 
2159   rtx mem = XEXP (SET_SRC (body), 0);
2160   rtx base_reg = XEXP (mem, 0);
2161 
2162   rtx_insn *and_insn;
2163   rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
2164 
2165   if (and_operation != 0)
2166     {
2167       df_ref def;
2168       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2169       FOR_EACH_INSN_INFO_DEF (def, insn_info)
2170 	{
2171 	  struct df_link *link = DF_REF_CHAIN (def);
2172 	  if (!link || link->next)
2173 	    break;
2174 
2175 	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
2176 	  if (!insn_is_swap_p (swap_insn)
2177 	      || insn_is_load_p (swap_insn)
2178 	      || insn_is_store_p (swap_insn))
2179 	    break;
2180 
2181 	  /* Expected lvx pattern found.  Change the swap to
2182 	     a copy, and propagate the AND operation into the
2183 	     load.  */
2184 	  to_delete[INSN_UID (swap_insn)].replace = true;
2185 	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
2186 
2187 	  /* First, however, we must make sure that the base
2188 	     register from the AND operation is still available,
2189 	     since it may have been overwritten.  Copy the base
2190 	     register to a new pseudo and use that as the base
2191 	     register of the AND operation in the new LVX
2192 	     instruction.  */
2193 	  rtx and_base = XEXP (and_operation, 0);
2194 	  rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
2195 	  rtx copy = gen_rtx_SET (new_reg, and_base);
2196 	  rtx_insn *new_insn = emit_insn_after (copy, and_insn);
2197 	  set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
2198 	  df_insn_rescan (new_insn);
2199 
2200 	  XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
2201 				       XEXP (and_operation, 1));
2202 	  SET_SRC (body) = mem;
2203 	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
2204 	  df_insn_rescan (insn);
2205 
2206 	  if (dump_file)
2207 	    fprintf (dump_file, "lvx opportunity found at %d\n",
2208 		     INSN_UID (insn));
2209 	}
2210     }
2211 }
2212 
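/* A sketch of the rewrite performed above, in pseudo-RTL with made-up
   registers.  Before:

     (set (reg 150) (and (reg 151) (const_int -16)))
     (set (reg 152) (vec_select (mem (reg 150)) ...))   ; permuting load
     (set (reg 153) (vec_select (reg 152) ...))         ; swap

   After, with 154 a fresh pseudo copied from 151:

     (set (reg 152) (mem (and (reg 154) (const_int -16))))
     (set (reg 153) (reg 152))                          ; swap -> copy

   The combined AND-plus-load matches the canonical altivec lvx
   pattern, so no xxswapdi is needed.  */
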
2213 /* If INSN is the store for an stvx pattern, put it in canonical form.  */
2214 static void
2215 recombine_stvx_pattern (rtx_insn *insn, del_info *to_delete)
2216 {
2217   rtx body = PATTERN (insn);
2218   gcc_assert (GET_CODE (body) == SET
2219 	      && MEM_P (SET_DEST (body))
2220 	      && GET_CODE (SET_SRC (body)) == VEC_SELECT);
2221   rtx mem = SET_DEST (body);
2222   rtx base_reg = XEXP (mem, 0);
2223 
2224   rtx_insn *and_insn;
2225   rtx and_operation = find_alignment_op (insn, base_reg, &and_insn);
2226 
2227   if (and_operation != 0)
2228     {
2229       rtx src_reg = XEXP (SET_SRC (body), 0);
2230       df_ref src_use;
2231       struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2232       FOR_EACH_INSN_INFO_USE (src_use, insn_info)
2233 	{
2234 	  if (!rtx_equal_p (DF_REF_REG (src_use), src_reg))
2235 	    continue;
2236 
2237 	  struct df_link *link = DF_REF_CHAIN (src_use);
2238 	  if (!link || link->next)
2239 	    break;
2240 
2241 	  rtx_insn *swap_insn = DF_REF_INSN (link->ref);
2242 	  if (!insn_is_swap_p (swap_insn)
2243 	      || insn_is_load_p (swap_insn)
2244 	      || insn_is_store_p (swap_insn))
2245 	    break;
2246 
2247 	  /* Expected stvx pattern found.  Change the swap to
2248 	     a copy, and propagate the AND operation into the
2249 	     store.  */
2250 	  to_delete[INSN_UID (swap_insn)].replace = true;
2251 	  to_delete[INSN_UID (swap_insn)].replace_insn = swap_insn;
2252 
2253 	  /* First, however, we must make sure that the base
2254 	     register from the AND operation is still available,
2255 	     since it may have been overwritten.  Copy the base
2256 	     register to a new pseudo and use that as the base
2257 	     register of the AND operation in the new STVX
2258 	     instruction.  */
2259 	  rtx and_base = XEXP (and_operation, 0);
2260 	  rtx new_reg = gen_reg_rtx (GET_MODE (and_base));
2261 	  rtx copy = gen_rtx_SET (new_reg, and_base);
2262 	  rtx_insn *new_insn = emit_insn_after (copy, and_insn);
2263 	  set_block_for_insn (new_insn, BLOCK_FOR_INSN (and_insn));
2264 	  df_insn_rescan (new_insn);
2265 
2266 	  XEXP (mem, 0) = gen_rtx_AND (GET_MODE (and_base), new_reg,
2267 				       XEXP (and_operation, 1));
2268 	  SET_SRC (body) = src_reg;
2269 	  INSN_CODE (insn) = -1; /* Force re-recognition.  */
2270 	  df_insn_rescan (insn);
2271 
2272 	  if (dump_file)
2273 	    fprintf (dump_file, "stvx opportunity found at %d\n",
2274 		     INSN_UID (insn));
2275 	}
2276     }
2277 }
2278 
2279 /* Look for patterns created from builtin lvx and stvx calls, and
2280    canonicalize them to be properly recognized as such.  */
2281 static void
2282 recombine_lvx_stvx_patterns (function *fun)
2283 {
2284   int i;
2285   basic_block bb;
2286   rtx_insn *insn;
2287 
2288   int num_insns = get_max_uid ();
2289   del_info *to_delete = XCNEWVEC (del_info, num_insns);
2290 
2291   FOR_ALL_BB_FN (bb, fun)
2292     FOR_BB_INSNS (bb, insn)
2293     {
2294       if (!NONDEBUG_INSN_P (insn))
2295 	continue;
2296 
2297       if (insn_is_load_p (insn) && insn_is_swap_p (insn))
2298 	recombine_lvx_pattern (insn, to_delete);
2299       else if (insn_is_store_p (insn) && insn_is_swap_p (insn))
2300 	recombine_stvx_pattern (insn, to_delete);
2301     }
2302 
2303   /* Turning swaps into copies is delayed until now, to avoid problems
2304      with deleting instructions during the insn walk.  */
2305   for (i = 0; i < num_insns; i++)
2306     if (to_delete[i].replace)
2307       {
2308 	rtx swap_body = PATTERN (to_delete[i].replace_insn);
2309 	rtx src_reg = XEXP (SET_SRC (swap_body), 0);
2310 	rtx copy = gen_rtx_SET (SET_DEST (swap_body), src_reg);
2311 	rtx_insn *new_insn = emit_insn_before (copy,
2312 					       to_delete[i].replace_insn);
2313 	set_block_for_insn (new_insn,
2314 			    BLOCK_FOR_INSN (to_delete[i].replace_insn));
2315 	df_insn_rescan (new_insn);
2316 	df_insn_delete (to_delete[i].replace_insn);
2317 	remove_insn (to_delete[i].replace_insn);
2318 	to_delete[i].replace_insn->set_deleted ();
2319       }
2320 
2321   free (to_delete);
2322 }
2323 
2324 /* Main entry point for this pass.  */
2325 unsigned int
2326 rs6000_analyze_swaps (function *fun)
2327 {
2328   swap_web_entry *insn_entry;
2329   basic_block bb;
2330   rtx_insn *insn, *curr_insn = 0;
2331 
2332   /* Dataflow analysis for use-def chains.  */
2333   df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2334   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2335   df_analyze ();
2336   df_set_flags (DF_DEFER_INSN_RESCAN);
2337 
2338   /* Pre-pass to recombine lvx and stvx patterns so we don't lose info.  */
2339   recombine_lvx_stvx_patterns (fun);
2340 
2341   /* Rebuild ud- and du-chains.  */
2342   df_remove_problem (df_chain);
2343   df_process_deferred_rescans ();
2344   df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2345   df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2346   df_analyze ();
2347   df_set_flags (DF_DEFER_INSN_RESCAN);
2348 
2349   /* Allocate structure to represent webs of insns.  */
2350   insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2351 
2352   /* Walk the insns to gather basic data.  */
2353   FOR_ALL_BB_FN (bb, fun)
2354     FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2355     {
2356       unsigned int uid = INSN_UID (insn);
2357       if (NONDEBUG_INSN_P (insn))
2358 	{
2359 	  insn_entry[uid].insn = insn;
2360 
2361 	  if (GET_CODE (insn) == CALL_INSN)
2362 	    insn_entry[uid].is_call = 1;
2363 
2364 	  /* Walk the uses and defs to see if we mention vector regs.
2365 	     Record any constraints on optimization of such mentions.  */
2366 	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2367 	  df_ref mention;
2368 	  FOR_EACH_INSN_INFO_USE (mention, insn_info)
2369 	    {
2370 	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
2371 	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
2372 
2373 	      /* If a use gets its value from a call insn, it will be
2374 		 a hard register and will look like (reg:V4SI 3 3).
2375 		 The df analysis creates two mentions for GPR3 and GPR4,
2376 		 both DImode.  We must recognize this and treat it as a
2377 		 vector mention to ensure the call is unioned with this
2378 		 use.  */
2379 	      if (mode == DImode && DF_REF_INSN_INFO (mention))
2380 		{
2381 		  rtx feeder = DF_REF_INSN (mention);
2382 		  /* FIXME:  It is pretty hard to get from the df mention
2383 		     to the mode of the use in the insn.  We arbitrarily
2384 		     pick a vector mode here, even though the use might
2385 		     be a real DImode.  We can be too conservative
2386 		     (create a web larger than necessary) because of
2387 		     this, so consider eventually fixing this.  */
2388 		  if (GET_CODE (feeder) == CALL_INSN)
2389 		    mode = V4SImode;
2390 		}
2391 
2392 	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
2393 		{
2394 		  insn_entry[uid].is_relevant = 1;
2395 		  if (mode == TImode || mode == V1TImode
2396 		      || FLOAT128_VECTOR_P (mode))
2397 		    insn_entry[uid].is_128_int = 1;
2398 		  if (DF_REF_INSN_INFO (mention))
2399 		    insn_entry[uid].contains_subreg
2400 		      = !rtx_equal_p (DF_REF_REG (mention),
2401 				      DF_REF_REAL_REG (mention));
2402 		  union_defs (insn_entry, insn, mention);
2403 		}
2404 	    }
2405 	  FOR_EACH_INSN_INFO_DEF (mention, insn_info)
2406 	    {
2407 	      /* We use DF_REF_REAL_REG here to get inside any subregs.  */
2408 	      machine_mode mode = GET_MODE (DF_REF_REAL_REG (mention));
2409 
2410 	      /* If we're loading up a hard vector register for a call,
2411 		 it looks like (set (reg:V4SI 9 9) (...)).  The df
2412 		 analysis creates two mentions for GPR9 and GPR10, both
2413 		 DImode.  So relying on the mode from the mentions
2414 		 isn't sufficient to ensure we union the call into the
2415 		 web with the parameter setup code.  */
2416 	      if (mode == DImode && GET_CODE (PATTERN (insn)) == SET
2417 		  && ALTIVEC_OR_VSX_VECTOR_MODE (GET_MODE (SET_DEST (PATTERN (insn)))))
2418 		mode = GET_MODE (SET_DEST (PATTERN (insn)));
2419 
2420 	      if (ALTIVEC_OR_VSX_VECTOR_MODE (mode) || mode == TImode)
2421 		{
2422 		  insn_entry[uid].is_relevant = 1;
2423 		  if (mode == TImode || mode == V1TImode
2424 		      || FLOAT128_VECTOR_P (mode))
2425 		    insn_entry[uid].is_128_int = 1;
2426 		  if (DF_REF_INSN_INFO (mention))
2427 		    insn_entry[uid].contains_subreg
2428 		      = !rtx_equal_p (DF_REF_REG (mention),
2429 				      DF_REF_REAL_REG (mention));
2430 		  /* REG_FUNCTION_VALUE_P is not valid for subregs. */
2431 		  else if (REG_FUNCTION_VALUE_P (DF_REF_REG (mention)))
2432 		    insn_entry[uid].is_live_out = 1;
2433 		  union_uses (insn_entry, insn, mention);
2434 		}
2435 	    }
2436 
2437 	  if (insn_entry[uid].is_relevant)
2438 	    {
2439 	      /* Determine if this is a load or store.  */
2440 	      insn_entry[uid].is_load = insn_is_load_p (insn);
2441 	      insn_entry[uid].is_store = insn_is_store_p (insn);
2442 
2443 	      /* Determine if this is a doubleword swap.  If not,
2444 		 determine whether it can legally be swapped.  */
2445 	      if (insn_is_swap_p (insn))
2446 		insn_entry[uid].is_swap = 1;
2447 	      else
2448 		{
2449 		  unsigned int special = SH_NONE;
2450 		  insn_entry[uid].is_swappable
2451 		    = insn_is_swappable_p (insn_entry, insn, &special);
2452 		  if (special != SH_NONE && insn_entry[uid].contains_subreg)
2453 		    insn_entry[uid].is_swappable = 0;
2454 		  else if (special != SH_NONE)
2455 		    insn_entry[uid].special_handling = special;
2456 		  else if (insn_entry[uid].contains_subreg
2457 			   && has_part_mult (insn))
2458 		    insn_entry[uid].is_swappable = 0;
2459 		  else if (insn_entry[uid].contains_subreg)
2460 		    insn_entry[uid].special_handling = SH_SUBREG;
2461 		}
2462 	    }
2463 	}
2464     }
2465 
2466   if (dump_file)
2467     {
2468       fprintf (dump_file, "\nSwap insn entry table when first built\n");
2469       dump_swap_insn_table (insn_entry);
2470     }
2471 
2472   /* Record unoptimizable webs.  */
2473   unsigned e = get_max_uid (), i;
2474   for (i = 0; i < e; ++i)
2475     {
2476       if (!insn_entry[i].is_relevant)
2477 	continue;
2478 
2479       swap_web_entry *root
2480 	= (swap_web_entry*)(&insn_entry[i])->unionfind_root ();
2481 
2482       if (insn_entry[i].is_live_in || insn_entry[i].is_live_out
2483 	  || (insn_entry[i].contains_subreg
2484 	      && insn_entry[i].special_handling != SH_SUBREG)
2485 	  || insn_entry[i].is_128_int || insn_entry[i].is_call
2486 	  || !(insn_entry[i].is_swappable || insn_entry[i].is_swap))
2487 	root->web_not_optimizable = 1;
2488 
2489       /* If we have loads or stores that aren't permuting then the
2490 	 optimization isn't appropriate.  */
2491       else if ((insn_entry[i].is_load || insn_entry[i].is_store)
2492 	  && !insn_entry[i].is_swap && !insn_entry[i].is_swappable)
2493 	root->web_not_optimizable = 1;
2494 
2495       /* If we have a swap that is both fed by a permuting load
2496 	 and a feeder of a permuting store, then the optimization
2497 	 isn't appropriate.  (Consider vec_xl followed by vec_xst_be.)  */
2498       else if (insn_entry[i].is_swap && !insn_entry[i].is_load
2499 	       && !insn_entry[i].is_store
2500 	       && swap_feeds_both_load_and_store (&insn_entry[i]))
2501 	root->web_not_optimizable = 1;
2502 
2503       /* If we have permuting loads or stores that are not accompanied
2504 	 by a register swap, the optimization isn't appropriate.  */
2505       else if (insn_entry[i].is_load && insn_entry[i].is_swap)
2506 	{
2507 	  rtx insn = insn_entry[i].insn;
2508 	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2509 	  df_ref def;
2510 
2511 	  FOR_EACH_INSN_INFO_DEF (def, insn_info)
2512 	    {
2513 	      struct df_link *link = DF_REF_CHAIN (def);
2514 
2515 	      if (!chain_contains_only_swaps (insn_entry, link, FOR_LOADS))
2516 		{
2517 		  root->web_not_optimizable = 1;
2518 		  break;
2519 		}
2520 	    }
2521 	}
2522       else if (insn_entry[i].is_store && insn_entry[i].is_swap)
2523 	{
2524 	  rtx insn = insn_entry[i].insn;
2525 	  struct df_insn_info *insn_info = DF_INSN_INFO_GET (insn);
2526 	  df_ref use;
2527 
2528 	  FOR_EACH_INSN_INFO_USE (use, insn_info)
2529 	    {
2530 	      struct df_link *link = DF_REF_CHAIN (use);
2531 
2532 	      if (!chain_contains_only_swaps (insn_entry, link, FOR_STORES))
2533 		{
2534 		  root->web_not_optimizable = 1;
2535 		  break;
2536 		}
2537 	    }
2538 	}
2539     }
2540 
2541   if (dump_file)
2542     {
2543       fprintf (dump_file, "\nSwap insn entry table after web analysis\n");
2544       dump_swap_insn_table (insn_entry);
2545     }
2546 
2547   /* For each load and store in an optimizable web (which implies
2548      the loads and stores are permuting), find the associated
2549      register swaps and mark them for removal.  Due to various
2550      optimizations we may mark the same swap more than once.  Also
2551      perform special handling for swappable insns that require it.  */
2552   for (i = 0; i < e; ++i)
2553     if ((insn_entry[i].is_load || insn_entry[i].is_store)
2554 	&& insn_entry[i].is_swap)
2555       {
2556 	swap_web_entry* root_entry
2557 	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
2558 	if (!root_entry->web_not_optimizable)
2559 	  mark_swaps_for_removal (insn_entry, i);
2560       }
2561     else if (insn_entry[i].is_swappable && insn_entry[i].special_handling)
2562       {
2563 	swap_web_entry* root_entry
2564 	  = (swap_web_entry*)((&insn_entry[i])->unionfind_root ());
2565 	if (!root_entry->web_not_optimizable)
2566 	  handle_special_swappables (insn_entry, i);
2567       }
2568 
2569   /* Now delete the swaps marked for removal.  */
2570   for (i = 0; i < e; ++i)
2571     if (insn_entry[i].will_delete)
2572       replace_swap_with_copy (insn_entry, i);
2573 
2574   /* Clean up.  */
2575   free (insn_entry);
2576 
2577   /* Use a second pass over rtl to detect that certain vector values
2578      fetched from or stored to memory on quad-word aligned addresses
2579      can use lvx/stvx without swaps.  */
2580 
2581   /* First, rebuild ud chains.  */
2582   df_remove_problem (df_chain);
2583   df_process_deferred_rescans ();
2584   df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2585   df_chain_add_problem (DF_UD_CHAIN);
2586   df_analyze ();
2587 
2588   swap_web_entry *pass2_insn_entry;
2589   pass2_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2590 
2591   /* Walk the insns to gather basic data.  */
2592   FOR_ALL_BB_FN (bb, fun)
2593     FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2594     {
2595       unsigned int uid = INSN_UID (insn);
2596       if (NONDEBUG_INSN_P (insn))
2597 	{
2598 	  pass2_insn_entry[uid].insn = insn;
2599 
2600 	  pass2_insn_entry[uid].is_relevant = 1;
2601 	  pass2_insn_entry[uid].is_load = insn_is_load_p (insn);
2602 	  pass2_insn_entry[uid].is_store = insn_is_store_p (insn);
2603 
2604 	  /* Determine if this is a doubleword swap.  If not,
2605 	     determine whether it can legally be swapped.  */
2606 	  if (insn_is_swap_p (insn))
2607 	    pass2_insn_entry[uid].is_swap = 1;
2608 	}
2609     }
2610 
2611   e = get_max_uid ();
2612   for (unsigned i = 0; i < e; ++i)
2613     if (pass2_insn_entry[i].is_swap && !pass2_insn_entry[i].is_load
2614 	&& !pass2_insn_entry[i].is_store)
2615       {
2616 	/* Replace swap of aligned load-swap with aligned unswapped
2617 	   load.  */
2618 	rtx_insn *swap_insn = pass2_insn_entry[i].insn;
2619 	if (quad_aligned_load_p (pass2_insn_entry, swap_insn))
2620 	  replace_swapped_aligned_load (pass2_insn_entry, swap_insn);
2621       }
2622     else if (pass2_insn_entry[i].is_swap && pass2_insn_entry[i].is_store)
2623       {
2624 	/* Replace aligned store-swap of swapped value with aligned
2625 	   unswapped store.  */
2626 	rtx_insn *store_insn = pass2_insn_entry[i].insn;
2627 	if (quad_aligned_store_p (pass2_insn_entry, store_insn))
2628 	  replace_swapped_aligned_store (pass2_insn_entry, store_insn);
2629       }
2630 
2631   /* Clean up.  */
2632   free (pass2_insn_entry);
2633 
2634   /* Use a third pass over rtl to replace swap(load(vector constant))
2635      with load(swapped vector constant).  */
2636 
2637   /* First, rebuild ud chains.  */
2638   df_remove_problem (df_chain);
2639   df_process_deferred_rescans ();
2640   df_set_flags (DF_RD_PRUNE_DEAD_DEFS);
2641   df_chain_add_problem (DF_UD_CHAIN);
2642   df_analyze ();
2643 
2644   swap_web_entry *pass3_insn_entry;
2645   pass3_insn_entry = XCNEWVEC (swap_web_entry, get_max_uid ());
2646 
2647   /* Walk the insns to gather basic data.  */
2648   FOR_ALL_BB_FN (bb, fun)
2649     FOR_BB_INSNS_SAFE (bb, insn, curr_insn)
2650     {
2651       unsigned int uid = INSN_UID (insn);
2652       if (NONDEBUG_INSN_P (insn))
2653 	{
2654 	  pass3_insn_entry[uid].insn = insn;
2655 
2656 	  pass3_insn_entry[uid].is_relevant = 1;
2657 	  pass3_insn_entry[uid].is_load = insn_is_load_p (insn);
2658 	  pass3_insn_entry[uid].is_store = insn_is_store_p (insn);
2659 
2660 	  /* Determine if this is a doubleword swap.  If not,
2661 	     determine whether it can legally be swapped.  */
2662 	  if (insn_is_swap_p (insn))
2663 	    pass3_insn_entry[uid].is_swap = 1;
2664 	}
2665     }
2666 
2667   e = get_max_uid ();
2668   for (unsigned i = 0; i < e; ++i)
2669     if (pass3_insn_entry[i].is_swap && !pass3_insn_entry[i].is_load
2670 	&& !pass3_insn_entry[i].is_store)
2671       {
2672 	insn = pass3_insn_entry[i].insn;
2673 	if (const_load_sequence_p (pass3_insn_entry, insn))
2674 	  replace_swapped_load_constant (pass3_insn_entry, insn);
2675       }
2676 
2677   /* Clean up.  */
2678   free (pass3_insn_entry);
2679   return 0;
2680 }
2681 
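/* An end-to-end illustration (hypothetical source program): for

     void f (vector int *dst, vector int *src) { *dst = *src; }

   compiled for little-endian P8 VSX, the copy web consists of a
   permuting load feeding a swap and a swap feeding a permuting store.
   The web is optimizable, so both xxswapdi insns are replaced with
   copies and the lxvd2x/stxvd2x pair is left to transfer the value in
   swapped form end to end.  */
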
2682 const pass_data pass_data_analyze_swaps =
2683 {
2684   RTL_PASS, /* type */
2685   "swaps", /* name */
2686   OPTGROUP_NONE, /* optinfo_flags */
2687   TV_NONE, /* tv_id */
2688   0, /* properties_required */
2689   0, /* properties_provided */
2690   0, /* properties_destroyed */
2691   0, /* todo_flags_start */
2692   TODO_df_finish, /* todo_flags_finish */
2693 };
2694 
2695 class pass_analyze_swaps : public rtl_opt_pass
2696 {
2697 public:
2698   pass_analyze_swaps(gcc::context *ctxt)
2699     : rtl_opt_pass(pass_data_analyze_swaps, ctxt)
2700   {}
2701 
2702   /* opt_pass methods: */
2703   virtual bool gate (function *)
2704     {
2705       return (optimize > 0 && !BYTES_BIG_ENDIAN && TARGET_VSX
2706 	      && !TARGET_P9_VECTOR && rs6000_optimize_swaps);
2707     }
2708 
2709   virtual unsigned int execute (function *fun)
2710     {
2711       return rs6000_analyze_swaps (fun);
2712     }
2713 
2714   opt_pass *clone ()
2715     {
2716       return new pass_analyze_swaps (m_ctxt);
2717     }
2718 
2719 }; // class pass_analyze_swaps
2720 
2721 rtl_opt_pass *
2722 make_pass_analyze_swaps (gcc::context *ctxt)
2723 {
2724   return new pass_analyze_swaps (ctxt);
2725 }
2726 
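/* A sketch of how a pass like this gets into the pipeline (the
   insertion point shown is an assumption; the actual registration
   lives in rs6000.c):

     opt_pass *p = make_pass_analyze_swaps (g);
     struct register_pass_info insert_info
       = { p, "cse1", 1, PASS_POS_INSERT_BEFORE };
     register_pass (&insert_info);  */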
2727