xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-ssa-math-opts.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /* Global, SSA-based optimizations using mathematical identities.
2    Copyright (C) 2005-2015 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 /* The first mini-pass in this file tries to CSE reciprocal
21    operations.  These are common in sequences such as this one:
22 
23 	modulus = sqrt(x*x + y*y + z*z);
24 	x = x / modulus;
25 	y = y / modulus;
26 	z = z / modulus;
27 
28    that can be optimized to
29 
30 	modulus = sqrt(x*x + y*y + z*z);
31 	rmodulus = 1.0 / modulus;
32 	x = x * rmodulus;
33 	y = y * rmodulus;
34 	z = z * rmodulus;
35 
36    We do this for loop-invariant divisors, and with this pass whenever
37    we notice that a division has the same divisor multiple times.
38 
39    Of course, like in PRE, we don't insert a division if a dominator
40    already has one.  However, this cannot be done as an extension of
41    PRE for several reasons.
42 
43    First of all, experiments showed that the transformation is not
44    always useful if there are only two divisions by the same divisor.
45    This is probably because modern processors can pipeline the
46    divisions; on older, in-order processors it should still be
47    effective to optimize two divisions by the same number.
48    We make this threshold a param, and it is called N in the remainder
49    of this comment.
50 
51    Second, if trapping math is active, we have less freedom in choosing
52    where to insert divisions: we can only do so in basic blocks that
53    already contain one.  (If divisions don't trap, we can instead insert
54    divisions elsewhere, namely in blocks that are common dominators of
55    those that contain a division.)
56 
57    We really don't want to compute the reciprocal unless a division will
58    be found.  To do this, we won't insert the division in a basic block
59    that has fewer than N divisions *post-dominating* it.
60 
61    The algorithm constructs a subset of the dominator tree, holding the
62    blocks containing the divisions and the common dominators to them,
63    and walks it twice.  The first walk is in post-order, and it annotates
64    each block with the number of divisions that post-dominate it: this
65    gives information on where divisions can be inserted profitably.
66    The second walk is in pre-order, and it inserts divisions as explained
67    above, and replaces divisions by multiplications.
68 
69    In the best case, the cost of the pass is O(n_statements).  In the
70    worst case, the cost is due to creating the dominator tree subset,
71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
72    for n_statements / n_basic_blocks statements.  So, the amortized cost
73    of creating the dominator tree subset is O(n_basic_blocks) and the
74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
75 
76    More practically, the cost will be small because there are few
77    divisions, and they tend to be in the same basic block, so insert_bb
78    is called very few times.
79 
80    If we did this using domwalk.c, an efficient implementation would have
81    to work on all the variables in a single pass, because we could not
82    work on just a subset of the dominator tree, as we do now, and the
83    cost would also be something like O(n_statements * n_basic_blocks).
84    The data structures would also be more complex, in order to handle
85    all the variables at once.  */
86 
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "hash-set.h"
93 #include "machmode.h"
94 #include "vec.h"
95 #include "double-int.h"
96 #include "input.h"
97 #include "alias.h"
98 #include "symtab.h"
99 #include "wide-int.h"
100 #include "inchash.h"
101 #include "tree.h"
102 #include "fold-const.h"
103 #include "predict.h"
104 #include "hard-reg-set.h"
105 #include "function.h"
106 #include "dominance.h"
107 #include "cfg.h"
108 #include "basic-block.h"
109 #include "tree-ssa-alias.h"
110 #include "internal-fn.h"
111 #include "gimple-fold.h"
112 #include "gimple-expr.h"
113 #include "is-a.h"
114 #include "gimple.h"
115 #include "gimple-iterator.h"
116 #include "gimplify.h"
117 #include "gimplify-me.h"
118 #include "stor-layout.h"
119 #include "gimple-ssa.h"
120 #include "tree-cfg.h"
121 #include "tree-phinodes.h"
122 #include "ssa-iterators.h"
123 #include "stringpool.h"
124 #include "tree-ssanames.h"
125 #include "hashtab.h"
126 #include "rtl.h"
127 #include "statistics.h"
128 #include "real.h"
129 #include "fixed-value.h"
130 #include "insn-config.h"
131 #include "expmed.h"
132 #include "dojump.h"
133 #include "explow.h"
134 #include "calls.h"
135 #include "emit-rtl.h"
136 #include "varasm.h"
137 #include "stmt.h"
138 #include "expr.h"
139 #include "tree-dfa.h"
140 #include "tree-ssa.h"
141 #include "tree-pass.h"
142 #include "alloc-pool.h"
143 #include "target.h"
144 #include "gimple-pretty-print.h"
145 #include "builtins.h"
146 
147 /* FIXME: RTL headers have to be included here for optabs.  */
148 #include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
149 #include "expr.h"		/* Because optabs.h wants sepops.  */
150 #include "insn-codes.h"
151 #include "optabs.h"
152 
153 /* This structure represents one basic block that either computes a
154    division, or is a common dominator for basic blocks that compute a
155    division.  */
156 struct occurrence {
157   /* The basic block represented by this structure.  */
158   basic_block bb;
159 
160   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
161      inserted in BB.  */
162   tree recip_def;
163 
164   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
165      was inserted in BB.  */
166   gimple recip_def_stmt;
167 
168   /* Pointer to a list of "struct occurrence"s for blocks dominated
169      by BB.  */
170   struct occurrence *children;
171 
172   /* Pointer to the next "struct occurrence" in the list of blocks
173      sharing a common dominator.  */
174   struct occurrence *next;
175 
176   /* The number of divisions that are in BB before compute_merit.  The
177      number of divisions that are in BB or post-dominate it after
178      compute_merit.  */
179   int num_divisions;
180 
181   /* True if the basic block has a division, false if it is a common
182      dominator for basic blocks that do.  If it is false and trapping
183      math is active, BB is not a candidate for inserting a reciprocal.  */
184   bool bb_has_division;
185 };
186 
187 static struct
188 {
189   /* Number of 1.0/X ops inserted.  */
190   int rdivs_inserted;
191 
192   /* Number of 1.0/FUNC ops inserted.  */
193   int rfuncs_inserted;
194 } reciprocal_stats;
195 
196 static struct
197 {
198   /* Number of cexpi calls inserted.  */
199   int inserted;
200 } sincos_stats;
201 
202 static struct
203 {
204   /* Number of hand-written 16-bit nop / bswaps found.  */
205   int found_16bit;
206 
207   /* Number of hand-written 32-bit nop / bswaps found.  */
208   int found_32bit;
209 
210   /* Number of hand-written 64-bit nop / bswaps found.  */
211   int found_64bit;
212 } nop_stats, bswap_stats;
213 
214 static struct
215 {
216   /* Number of widening multiplication ops inserted.  */
217   int widen_mults_inserted;
218 
219   /* Number of integer multiply-and-accumulate ops inserted.  */
220   int maccs_inserted;
221 
222   /* Number of fp fused multiply-add ops inserted.  */
223   int fmas_inserted;
224 } widen_mul_stats;
225 
226 /* The instance of "struct occurrence" representing the highest
227    interesting block in the dominator tree.  */
228 static struct occurrence *occ_head;
229 
230 /* Allocation pool for getting instances of "struct occurrence".  */
231 static alloc_pool occ_pool;
232 
233 
234 
235 /* Allocate and return a new struct occurrence for basic block BB,
236    whose children list is headed by CHILDREN.  */
237 static struct occurrence *
238 occ_new (basic_block bb, struct occurrence *children)
239 {
240   struct occurrence *occ;
241 
242   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
243   memset (occ, 0, sizeof (struct occurrence));
244 
245   occ->bb = bb;
246   occ->children = children;
247   return occ;
248 }
249 
250 
251 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
252    list of "struct occurrence"s, one per basic block, having IDOM as
253    their common dominator.
254 
255    We try to insert NEW_OCC as deep as possible in the tree, and we also
256    insert any other block that is a common dominator for BB and one
257    block already in the tree.  */
258 
259 static void
260 insert_bb (struct occurrence *new_occ, basic_block idom,
261 	   struct occurrence **p_head)
262 {
263   struct occurrence *occ, **p_occ;
264 
265   for (p_occ = p_head; (occ = *p_occ) != NULL; )
266     {
267       basic_block bb = new_occ->bb, occ_bb = occ->bb;
268       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
269       if (dom == bb)
270 	{
271 	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
272 	     from its list.  */
273 	  *p_occ = occ->next;
274 	  occ->next = new_occ->children;
275 	  new_occ->children = occ;
276 
277 	  /* Try the next block (it, too, may be dominated by BB).  */
278 	}
279 
280       else if (dom == occ_bb)
281 	{
282 	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
283 	  insert_bb (new_occ, dom, &occ->children);
284 	  return;
285 	}
286 
287       else if (dom != idom)
288 	{
289 	  gcc_assert (!dom->aux);
290 
291 	  /* There is a dominator between IDOM and BB, add it and make
292 	     two children out of NEW_OCC and OCC.  First, remove OCC from
293 	     its list.	*/
294 	  *p_occ = occ->next;
295 	  new_occ->next = occ;
296 	  occ->next = NULL;
297 
298 	  /* None of the previous blocks has DOM as a dominator: if we tail
299 	     recursed, we would reexamine them uselessly. Just switch BB with
300 	     DOM, and go on looking for blocks dominated by DOM.  */
301 	  new_occ = occ_new (dom, new_occ);
302 	}
303 
304       else
305 	{
306 	  /* Nothing special, go on with the next element.  */
307 	  p_occ = &occ->next;
308 	}
309     }
310 
311   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
312   new_occ->next = *p_head;
313   *p_head = new_occ;
314 }
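
/* As an illustration (block names here are invented): suppose divisions
   are registered first in BB2 and then in BB3, and that their nearest
   common dominator BB1 contains no division itself.  The second
   insert_bb call takes the DOM != IDOM branch above, creating an
   occurrence for BB1 on the fly, so that the subset becomes

	BB1
	  +- BB3
	  +- BB2

   with both original occurrences as children of BB1.  */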
315 
316 /* Register that we found a division in BB.  */
317 
318 static inline void
319 register_division_in (basic_block bb)
320 {
321   struct occurrence *occ;
322 
323   occ = (struct occurrence *) bb->aux;
324   if (!occ)
325     {
326       occ = occ_new (bb, NULL);
327       insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
328     }
329 
330   occ->bb_has_division = true;
331   occ->num_divisions++;
332 }
333 
334 
335 /* Compute the number of divisions that postdominate each block in OCC and
336    its children.  */
337 
338 static void
339 compute_merit (struct occurrence *occ)
340 {
341   struct occurrence *occ_child;
342   basic_block dom = occ->bb;
343 
344   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
345     {
346       basic_block bb;
347       if (occ_child->children)
348         compute_merit (occ_child);
349 
350       if (flag_exceptions)
351 	bb = single_noncomplex_succ (dom);
352       else
353 	bb = dom;
354 
355       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
356         occ->num_divisions += occ_child->num_divisions;
357     }
358 }
359 
360 
361 /* Return whether USE_STMT is a floating-point division by DEF.  */
362 static inline bool
363 is_division_by (gimple use_stmt, tree def)
364 {
365   return is_gimple_assign (use_stmt)
366 	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
367 	 && gimple_assign_rhs2 (use_stmt) == def
368 	 /* Do not recognize x / x as valid division, as we are getting
369 	    confused later by replacing all immediate uses x in such
370 	    a stmt.  */
371 	 && gimple_assign_rhs1 (use_stmt) != def;
372 }
373 
374 /* Walk the subset of the dominator tree rooted at OCC, setting the
375    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
376    the given basic block.  The field may be left NULL, of course,
377    if it is not possible or profitable to do the optimization.
378 
379    DEF_GSI is an iterator pointing at the statement defining DEF.
380    If RECIP_DEF is set, a dominator already has a computation that can
381    be used.  */
382 
383 static void
384 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
385 		    tree def, tree recip_def, int threshold)
386 {
387   tree type;
388   gassign *new_stmt;
389   gimple_stmt_iterator gsi;
390   struct occurrence *occ_child;
391 
392   if (!recip_def
393       && (occ->bb_has_division || !flag_trapping_math)
394       && occ->num_divisions >= threshold)
395     {
396       /* Make a variable with the replacement and substitute it.  */
397       type = TREE_TYPE (def);
398       recip_def = create_tmp_reg (type, "reciptmp");
399       new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
400 				      build_one_cst (type), def);
401 
402       if (occ->bb_has_division)
403         {
404           /* Case 1: insert before an existing division.  */
405           gsi = gsi_after_labels (occ->bb);
406           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
407 	    gsi_next (&gsi);
408 
409           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
410         }
411       else if (def_gsi && occ->bb == def_gsi->bb)
412         {
413           /* Case 2: insert right after the definition.  Note that this will
414 	     never happen if the definition statement can throw, because in
415 	     that case the sole successor of the statement's basic block will
416 	     dominate all the uses as well.  */
417           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
418         }
419       else
420         {
421           /* Case 3: insert in a basic block not containing defs/uses.  */
422           gsi = gsi_after_labels (occ->bb);
423           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
424         }
425 
426       reciprocal_stats.rdivs_inserted++;
427 
428       occ->recip_def_stmt = new_stmt;
429     }
430 
431   occ->recip_def = recip_def;
432   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
433     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
434 }
435 
436 
437 /* Replace the division at USE_P with a multiplication by the reciprocal, if
438    possible.  */
439 
440 static inline void
441 replace_reciprocal (use_operand_p use_p)
442 {
443   gimple use_stmt = USE_STMT (use_p);
444   basic_block bb = gimple_bb (use_stmt);
445   struct occurrence *occ = (struct occurrence *) bb->aux;
446 
447   if (optimize_bb_for_speed_p (bb)
448       && occ->recip_def && use_stmt != occ->recip_def_stmt)
449     {
450       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
451       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
452       SET_USE (use_p, occ->recip_def);
453       fold_stmt_inplace (&gsi);
454       update_stmt (use_stmt);
455     }
456 }
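
/* For instance (a sketch; SSA names are invented), once insert_reciprocals
   has placed "reciptmp = 1.0 / b" in a dominating block, a division
   "x = a / b" reached through USE_P is rewritten in place into
   "x = a * reciptmp".  */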
457 
458 
459 /* Free OCC and return the next "struct occurrence" to be freed.  */
460 
461 static struct occurrence *
462 free_bb (struct occurrence *occ)
463 {
464   struct occurrence *child, *next;
465 
466   /* First get the two pointers hanging off OCC.  */
467   next = occ->next;
468   child = occ->children;
469   occ->bb->aux = NULL;
470   pool_free (occ_pool, occ);
471 
472   /* Now ensure that we don't recurse unless it is necessary.  */
473   if (!child)
474     return next;
475   else
476     {
477       while (next)
478 	next = free_bb (next);
479 
480       return child;
481     }
482 }
483 
484 
485 /* Look for floating-point divisions among DEF's uses, and try to
486    replace them with multiplications by the reciprocal.  Add
487    as many statements computing the reciprocal as needed.
488 
489    DEF must be a GIMPLE register of a floating-point type.  */
490 
491 static void
492 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
493 {
494   use_operand_p use_p;
495   imm_use_iterator use_iter;
496   struct occurrence *occ;
497   int count = 0, threshold;
498 
499   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
500 
501   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
502     {
503       gimple use_stmt = USE_STMT (use_p);
504       if (is_division_by (use_stmt, def))
505 	{
506 	  register_division_in (gimple_bb (use_stmt));
507 	  count++;
508 	}
509     }
510 
511   /* Do the expensive part only if we can hope to optimize something.  */
512   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
513   if (count >= threshold)
514     {
515       gimple use_stmt;
516       for (occ = occ_head; occ; occ = occ->next)
517 	{
518 	  compute_merit (occ);
519 	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
520 	}
521 
522       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
523 	{
524 	  if (is_division_by (use_stmt, def))
525 	    {
526 	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
527 		replace_reciprocal (use_p);
528 	    }
529 	}
530     }
531 
532   for (occ = occ_head; occ; )
533     occ = free_bb (occ);
534 
535   occ_head = NULL;
536 }
537 
538 /* Go through all the floating-point SSA_NAMEs, and call
539    execute_cse_reciprocals_1 on each of them.  */
540 namespace {
541 
542 const pass_data pass_data_cse_reciprocals =
543 {
544   GIMPLE_PASS, /* type */
545   "recip", /* name */
546   OPTGROUP_NONE, /* optinfo_flags */
547   TV_NONE, /* tv_id */
548   PROP_ssa, /* properties_required */
549   0, /* properties_provided */
550   0, /* properties_destroyed */
551   0, /* todo_flags_start */
552   TODO_update_ssa, /* todo_flags_finish */
553 };
554 
555 class pass_cse_reciprocals : public gimple_opt_pass
556 {
557 public:
558   pass_cse_reciprocals (gcc::context *ctxt)
559     : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
560   {}
561 
562   /* opt_pass methods: */
563   virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
564   virtual unsigned int execute (function *);
565 
566 }; // class pass_cse_reciprocals
567 
568 unsigned int
569 pass_cse_reciprocals::execute (function *fun)
570 {
571   basic_block bb;
572   tree arg;
573 
574   occ_pool = create_alloc_pool ("dominators for recip",
575 				sizeof (struct occurrence),
576 				n_basic_blocks_for_fn (fun) / 3 + 1);
577 
578   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
579   calculate_dominance_info (CDI_DOMINATORS);
580   calculate_dominance_info (CDI_POST_DOMINATORS);
581 
582 #ifdef ENABLE_CHECKING
583   FOR_EACH_BB_FN (bb, fun)
584     gcc_assert (!bb->aux);
585 #endif
586 
587   for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
588     if (FLOAT_TYPE_P (TREE_TYPE (arg))
589 	&& is_gimple_reg (arg))
590       {
591 	tree name = ssa_default_def (fun, arg);
592 	if (name)
593 	  execute_cse_reciprocals_1 (NULL, name);
594       }
595 
596   FOR_EACH_BB_FN (bb, fun)
597     {
598       tree def;
599 
600       for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
601 	   gsi_next (&gsi))
602 	{
603 	  gphi *phi = gsi.phi ();
604 	  def = PHI_RESULT (phi);
605 	  if (! virtual_operand_p (def)
606 	      && FLOAT_TYPE_P (TREE_TYPE (def)))
607 	    execute_cse_reciprocals_1 (NULL, def);
608 	}
609 
610       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
611 	   gsi_next (&gsi))
612         {
613 	  gimple stmt = gsi_stmt (gsi);
614 
615 	  if (gimple_has_lhs (stmt)
616 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
617 	      && FLOAT_TYPE_P (TREE_TYPE (def))
618 	      && TREE_CODE (def) == SSA_NAME)
619 	    execute_cse_reciprocals_1 (&gsi, def);
620 	}
621 
622       if (optimize_bb_for_size_p (bb))
623         continue;
624 
625       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
626       for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
627 	   gsi_next (&gsi))
628         {
629 	  gimple stmt = gsi_stmt (gsi);
630 	  tree fndecl;
631 
632 	  if (is_gimple_assign (stmt)
633 	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
634 	    {
635 	      tree arg1 = gimple_assign_rhs2 (stmt);
636 	      gimple stmt1;
637 
638 	      if (TREE_CODE (arg1) != SSA_NAME)
639 		continue;
640 
641 	      stmt1 = SSA_NAME_DEF_STMT (arg1);
642 
643 	      if (is_gimple_call (stmt1)
644 		  && gimple_call_lhs (stmt1)
645 		  && (fndecl = gimple_call_fndecl (stmt1))
646 		  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
647 		      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
648 		{
649 		  enum built_in_function code;
650 		  bool md_code, fail;
651 		  imm_use_iterator ui;
652 		  use_operand_p use_p;
653 
654 		  code = DECL_FUNCTION_CODE (fndecl);
655 		  md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
656 
657 		  fndecl = targetm.builtin_reciprocal (code, md_code, false);
658 		  if (!fndecl)
659 		    continue;
660 
661 		  /* Check that all uses of the SSA name are divisions,
662 		     otherwise replacing the defining statement will do
663 		     the wrong thing.  */
664 		  fail = false;
665 		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
666 		    {
667 		      gimple stmt2 = USE_STMT (use_p);
668 		      if (is_gimple_debug (stmt2))
669 			continue;
670 		      if (!is_gimple_assign (stmt2)
671 			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
672 			  || gimple_assign_rhs1 (stmt2) == arg1
673 			  || gimple_assign_rhs2 (stmt2) != arg1)
674 			{
675 			  fail = true;
676 			  break;
677 			}
678 		    }
679 		  if (fail)
680 		    continue;
681 
682 		  gimple_replace_ssa_lhs (stmt1, arg1);
683 		  gimple_call_set_fndecl (stmt1, fndecl);
684 		  update_stmt (stmt1);
685 		  reciprocal_stats.rfuncs_inserted++;
686 
687 		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
688 		    {
689 		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
690 		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
691 		      fold_stmt_inplace (&gsi);
692 		      update_stmt (stmt);
693 		    }
694 		}
695 	    }
696 	}
697     }
698 
699   statistics_counter_event (fun, "reciprocal divs inserted",
700 			    reciprocal_stats.rdivs_inserted);
701   statistics_counter_event (fun, "reciprocal functions inserted",
702 			    reciprocal_stats.rfuncs_inserted);
703 
704   free_dominance_info (CDI_DOMINATORS);
705   free_dominance_info (CDI_POST_DOMINATORS);
706   free_alloc_pool (occ_pool);
707   return 0;
708 }
709 
710 } // anon namespace
711 
712 gimple_opt_pass *
713 make_pass_cse_reciprocals (gcc::context *ctxt)
714 {
715   return new pass_cse_reciprocals (ctxt);
716 }
717 
718 /* Records an occurrence at statement USE_STMT in the vector of statements
719    STMTS if it is dominated by *TOP_BB, dominates *TOP_BB, or *TOP_BB
720    is not yet initialized.  Returns true if the occurrence was pushed on
721    the vector.  Adjusts *TOP_BB to be the basic block dominating all
722    statements in the vector.  */
723 
724 static bool
725 maybe_record_sincos (vec<gimple> *stmts,
726 		     basic_block *top_bb, gimple use_stmt)
727 {
728   basic_block use_bb = gimple_bb (use_stmt);
729   if (*top_bb
730       && (*top_bb == use_bb
731 	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
732     stmts->safe_push (use_stmt);
733   else if (!*top_bb
734 	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
735     {
736       stmts->safe_push (use_stmt);
737       *top_bb = use_bb;
738     }
739   else
740     return false;
741 
742   return true;
743 }
744 
745 /* Look for sin, cos and cexpi calls with the same argument NAME and
746    create a single call to cexpi, CSEing the result.  We first walk
747    over all immediate uses of the argument, collecting in a vector the
748    statements that we can CSE; in a second pass we replace each
749    statement's rhs with a REALPART or IMAGPART expression on the
750    result of the cexpi call, which we insert before the use statement
751    that dominates all other candidates.  */
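
/* For example (a sketch; SSA names are invented), for

	c = cos (a);
	s = sin (a);

   we insert "sincostmp = cexpi (a)" before the dominating statement
   and rewrite the two calls as

	c = REALPART_EXPR <sincostmp>;
	s = IMAGPART_EXPR <sincostmp>;

   thus CSEing the sin/cos pair into a single cexpi call.  */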
752 
753 static bool
754 execute_cse_sincos_1 (tree name)
755 {
756   gimple_stmt_iterator gsi;
757   imm_use_iterator use_iter;
758   tree fndecl, res, type;
759   gimple def_stmt, use_stmt, stmt;
760   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
761   auto_vec<gimple> stmts;
762   basic_block top_bb = NULL;
763   int i;
764   bool cfg_changed = false;
765 
766   type = TREE_TYPE (name);
767   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
768     {
769       if (gimple_code (use_stmt) != GIMPLE_CALL
770 	  || !gimple_call_lhs (use_stmt)
771 	  || !(fndecl = gimple_call_fndecl (use_stmt))
772 	  || !gimple_call_builtin_p (use_stmt, BUILT_IN_NORMAL))
773 	continue;
774 
775       switch (DECL_FUNCTION_CODE (fndecl))
776 	{
777 	CASE_FLT_FN (BUILT_IN_COS):
778 	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
779 	  break;
780 
781 	CASE_FLT_FN (BUILT_IN_SIN):
782 	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
783 	  break;
784 
785 	CASE_FLT_FN (BUILT_IN_CEXPI):
786 	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
787 	  break;
788 
789 	default:;
790 	}
791     }
792 
793   if (seen_cos + seen_sin + seen_cexpi <= 1)
794     return false;
795 
796   /* Simply insert cexpi at the beginning of top_bb but not earlier than
797      the name def statement.  */
798   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
799   if (!fndecl)
800     return false;
801   stmt = gimple_build_call (fndecl, 1, name);
802   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
803   gimple_call_set_lhs (stmt, res);
804 
805   def_stmt = SSA_NAME_DEF_STMT (name);
806   if (!SSA_NAME_IS_DEFAULT_DEF (name)
807       && gimple_code (def_stmt) != GIMPLE_PHI
808       && gimple_bb (def_stmt) == top_bb)
809     {
810       gsi = gsi_for_stmt (def_stmt);
811       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
812     }
813   else
814     {
815       gsi = gsi_after_labels (top_bb);
816       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
817     }
818   sincos_stats.inserted++;
819 
820   /* And adjust the recorded old call sites.  */
821   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
822     {
823       tree rhs = NULL;
824       fndecl = gimple_call_fndecl (use_stmt);
825 
826       switch (DECL_FUNCTION_CODE (fndecl))
827 	{
828 	CASE_FLT_FN (BUILT_IN_COS):
829 	  rhs = fold_build1 (REALPART_EXPR, type, res);
830 	  break;
831 
832 	CASE_FLT_FN (BUILT_IN_SIN):
833 	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
834 	  break;
835 
836 	CASE_FLT_FN (BUILT_IN_CEXPI):
837 	  rhs = res;
838 	  break;
839 
840 	default:
841 	  gcc_unreachable ();
842 	}
843 
844       /* Replace call with a copy.  */
845       stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
846 
847       gsi = gsi_for_stmt (use_stmt);
848       gsi_replace (&gsi, stmt, true);
849       if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
850 	cfg_changed = true;
851     }
852 
853   return cfg_changed;
854 }
855 
856 /* To evaluate powi(x,n), the floating point value x raised to the
857    constant integer exponent n, we use a hybrid algorithm that
858    combines the "window method" with look-up tables.  For an
859    introduction to exponentiation algorithms and "addition chains",
860    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
861    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
862    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
863    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
864 
865 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
866    multiplications to inline before calling the system library's pow
867    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
868    so this default never requires calling pow, powf or powl.  */
869 
870 #ifndef POWI_MAX_MULTS
871 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
872 #endif
873 
874 /* The size of the "optimal power tree" lookup table.  All
875    exponents less than this value are simply looked up in the
876    powi_table below.  This threshold is also used to size the
877    cache of pseudo registers that hold intermediate results.  */
878 #define POWI_TABLE_SIZE 256
879 
880 /* The size, in bits, of the window used in the "window method"
881    exponentiation algorithm.  This is equivalent to a radix of
882    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
883 #define POWI_WINDOW_SIZE 3
884 
885 /* The following table is an efficient representation of an
886    "optimal power tree".  For each value, i, the corresponding
887    value, j, in the table states that an optimal evaluation
888    sequence for calculating pow(x,i) can be found by evaluating
889    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
890    100 integers is given in Knuth's "Seminumerical algorithms".  */
891 
892 static const unsigned char powi_table[POWI_TABLE_SIZE] =
893   {
894       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
895       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
896       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
897      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
898      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
899      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
900      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
901      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
902      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
903      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
904      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
905      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
906      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
907      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
908      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
909      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
910      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
911      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
912      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
913      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
914      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
915      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
916      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
917      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
918      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
919     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
920     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
921     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
922     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
923     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
924     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
925     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
926   };
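
/* For example, powi_table[15] is 9, meaning that an optimal sequence
   for x**15 evaluates x**9 * x**6.  Expanding recursively through the
   table gives the five-multiplication chain

	x2  = x * x
	x3  = x2 * x
	x6  = x3 * x3
	x9  = x6 * x3
	x15 = x9 * x6

   whereas plain binary exponentiation on 15 = 0b1111 would need six
   multiplications.  */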
927 
928 
929 /* Return the number of multiplications required to calculate
930    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
931    subroutine of powi_cost.  CACHE is an array indicating
932    which exponents have already been calculated.  */
933 
934 static int
935 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
936 {
937   /* If we've already calculated this exponent, then this evaluation
938      doesn't require any additional multiplications.  */
939   if (cache[n])
940     return 0;
941 
942   cache[n] = true;
943   return powi_lookup_cost (n - powi_table[n], cache)
944 	 + powi_lookup_cost (powi_table[n], cache) + 1;
945 }
946 
947 /* Return the number of multiplications required to calculate
948    powi(x,n) for an arbitrary x, given the exponent N.  This
949    function needs to be kept in sync with powi_as_mults below.  */
950 
951 static int
952 powi_cost (HOST_WIDE_INT n)
953 {
954   bool cache[POWI_TABLE_SIZE];
955   unsigned HOST_WIDE_INT digit;
956   unsigned HOST_WIDE_INT val;
957   int result;
958 
959   if (n == 0)
960     return 0;
961 
962   /* Ignore the reciprocal when calculating the cost.  */
963   val = (n < 0) ? -n : n;
964 
965   /* Initialize the exponent cache.  */
966   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
967   cache[1] = true;
968 
969   result = 0;
970 
971   while (val >= POWI_TABLE_SIZE)
972     {
973       if (val & 1)
974 	{
975 	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
976 	  result += powi_lookup_cost (digit, cache)
977 		    + POWI_WINDOW_SIZE + 1;
978 	  val >>= POWI_WINDOW_SIZE;
979 	}
980       else
981 	{
982 	  val >>= 1;
983 	  result++;
984 	}
985     }
986 
987   return result + powi_lookup_cost (val, cache);
988 }
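
/* As a worked example, powi_cost (259) first peels off the low
   POWI_WINDOW_SIZE bits of 259 as digit 3, costing
   powi_lookup_cost (3) = 2 plus POWI_WINDOW_SIZE + 1 = 4 for the
   squarings and the multiply by x**3; val becomes 32, and the final
   table lookup adds 4 more (new cache entries for 4, 8, 16 and 32),
   for a total of 10 multiplications.  */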
989 
990 /* Recursive subroutine of powi_as_mults.  This function takes the
991    array, CACHE, of already calculated exponents and an exponent N and
992    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
993 
994 static tree
995 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
996 		 HOST_WIDE_INT n, tree *cache)
997 {
998   tree op0, op1, ssa_target;
999   unsigned HOST_WIDE_INT digit;
1000   gassign *mult_stmt;
1001 
1002   if (n < POWI_TABLE_SIZE && cache[n])
1003     return cache[n];
1004 
1005   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
1006 
1007   if (n < POWI_TABLE_SIZE)
1008     {
1009       cache[n] = ssa_target;
1010       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
1011       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
1012     }
1013   else if (n & 1)
1014     {
1015       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
1016       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
1017       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
1018     }
1019   else
1020     {
1021       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
1022       op1 = op0;
1023     }
1024 
1025   mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
1026   gimple_set_location (mult_stmt, loc);
1027   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
1028 
1029   return ssa_target;
1030 }
1031 
1032 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
1033    This function needs to be kept in sync with powi_cost above.  */
1034 
1035 static tree
1036 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
1037 	       tree arg0, HOST_WIDE_INT n)
1038 {
1039   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
1040   gassign *div_stmt;
1041   tree target;
1042 
1043   if (n == 0)
1044     return build_real (type, dconst1);
1045 
1046   memset (cache, 0,  sizeof (cache));
1047   cache[1] = arg0;
1048 
1049   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
1050   if (n >= 0)
1051     return result;
1052 
1053   /* If the original exponent was negative, reciprocate the result.  */
1054   target = make_temp_ssa_name (type, NULL, "powmult");
1055   div_stmt = gimple_build_assign (target, RDIV_EXPR,
1056 				  build_real (type, dconst1), result);
1057   gimple_set_location (div_stmt, loc);
1058   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1059 
1060   return target;
1061 }
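
/* For instance (temporary names invented for illustration),
   powi_as_mults (gsi, loc, x, 5) emits

	t2 = x * x;
	t3 = x * t2;
	t5 = t2 * t3;

   and returns t5, which holds x**5 after three multiplications; for
   n = -5 a final division 1.0 / t5 would be appended.  */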
1062 
1063 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1064    location info LOC.  If the arguments are appropriate, create an
1065    equivalent sequence of statements prior to GSI using an optimal
1066    number of multiplications, and return an expression holding the
1067    result.  */
1068 
1069 static tree
1070 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1071 			    tree arg0, HOST_WIDE_INT n)
1072 {
1073   /* Avoid the largest negative number, the only value for which n == -n.  */
1074   if (n != -n
1075       && ((n >= -1 && n <= 2)
1076 	  || (optimize_function_for_speed_p (cfun)
1077 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1078     return powi_as_mults (gsi, loc, arg0, n);
1079 
1080   return NULL_TREE;
1081 }
1082 
1083 /* Build a gimple call statement that calls FN with argument ARG.
1084    Set the lhs of the call statement to a fresh SSA name.  Insert the
1085    statement prior to GSI's current position, and return the fresh
1086    SSA name.  */
1087 
1088 static tree
1089 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1090 		       tree fn, tree arg)
1091 {
1092   gcall *call_stmt;
1093   tree ssa_target;
1094 
1095   call_stmt = gimple_build_call (fn, 1, arg);
1096   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1097   gimple_set_lhs (call_stmt, ssa_target);
1098   gimple_set_location (call_stmt, loc);
1099   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1100 
1101   return ssa_target;
1102 }
1103 
1104 /* Build a gimple binary operation with the given CODE and arguments
1105    ARG0, ARG1, assigning the result to a new SSA name based on NAME.
1106    Insert the statement prior to GSI's current position, and
1107    return the fresh SSA name.  */
1108 
1109 static tree
1110 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1111 			const char *name, enum tree_code code,
1112 			tree arg0, tree arg1)
1113 {
1114   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1115   gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
1116   gimple_set_location (stmt, loc);
1117   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1118   return result;
1119 }
1120 
1121 /* Build a gimple reference operation with the given CODE and argument
1122    ARG, assigning the result to a new SSA name of TYPE with NAME.
1123    Insert the statement prior to GSI's current position, and return
1124    the fresh SSA name.  */
1125 
1126 static inline tree
1127 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1128 		      const char *name, enum tree_code code, tree arg0)
1129 {
1130   tree result = make_temp_ssa_name (type, NULL, name);
1131   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1132   gimple_set_location (stmt, loc);
1133   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1134   return result;
1135 }
1136 
1137 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1138    prior to GSI's current position, and return the fresh SSA name.  */
1139 
1140 static tree
1141 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1142 		       tree type, tree val)
1143 {
1144   tree result = make_ssa_name (type);
1145   gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
1146   gimple_set_location (stmt, loc);
1147   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1148   return result;
1149 }
1150 
1151 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1152    with location info LOC.  If possible, create an equivalent and
1153    less expensive sequence of statements prior to GSI, and return an
1154    expression holding the result.  */
1155 
1156 static tree
1157 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1158 			   tree arg0, tree arg1)
1159 {
1160   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1161   REAL_VALUE_TYPE c2, dconst3;
1162   HOST_WIDE_INT n;
1163   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1164   machine_mode mode;
1165   bool hw_sqrt_exists, c_is_int, c2_is_int;
1166 
1167   /* If the exponent isn't a constant, there's nothing of interest
1168      to be done.  */
1169   if (TREE_CODE (arg1) != REAL_CST)
1170     return NULL_TREE;
1171 
1172   /* If the exponent is equivalent to an integer, expand to an optimal
1173      multiplication sequence when profitable.  */
1174   c = TREE_REAL_CST (arg1);
1175   n = real_to_integer (&c);
1176   real_from_integer (&cint, VOIDmode, n, SIGNED);
1177   c_is_int = real_identical (&c, &cint);
1178 
1179   if (c_is_int
1180       && ((n >= -1 && n <= 2)
1181 	  || (flag_unsafe_math_optimizations
1182 	      && optimize_bb_for_speed_p (gsi_bb (*gsi))
1183 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1184     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1185 
1186   /* Attempt various optimizations using sqrt and cbrt.  */
1187   type = TREE_TYPE (arg0);
1188   mode = TYPE_MODE (type);
1189   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1190 
1191   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1192      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1193      sqrt(-0) = -0.  */
1194   if (sqrtfn
1195       && REAL_VALUES_EQUAL (c, dconsthalf)
1196       && !HONOR_SIGNED_ZEROS (mode))
1197     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1198 
1199   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1200      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1201      so do this optimization even if -Os.  Don't do this optimization
1202      if we don't have a hardware sqrt insn.  */
1203   dconst1_4 = dconst1;
1204   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1205   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1206 
1207   if (flag_unsafe_math_optimizations
1208       && sqrtfn
1209       && REAL_VALUES_EQUAL (c, dconst1_4)
1210       && hw_sqrt_exists)
1211     {
1212       /* sqrt(x)  */
1213       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1214 
1215       /* sqrt(sqrt(x))  */
1216       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1217     }
1218 
1219   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1220      optimizing for space.  Don't do this optimization if we don't have
1221      a hardware sqrt insn.  */
1222   real_from_integer (&dconst3_4, VOIDmode, 3, SIGNED);
1223   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1224 
1225   if (flag_unsafe_math_optimizations
1226       && sqrtfn
1227       && optimize_function_for_speed_p (cfun)
1228       && REAL_VALUES_EQUAL (c, dconst3_4)
1229       && hw_sqrt_exists)
1230     {
1231       /* sqrt(x)  */
1232       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1233 
1234       /* sqrt(sqrt(x))  */
1235       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1236 
1237       /* sqrt(x) * sqrt(sqrt(x))  */
1238       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1239 				     sqrt_arg0, sqrt_sqrt);
1240     }
1241 
1242   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1243      optimizations since 1./3. is not exactly representable.  If x
1244      is negative and finite, the correct value of pow(x,1./3.) is
1245      a NaN with the "invalid" exception raised, because the value
1246      of 1./3. actually has an even denominator.  The correct value
1247      of cbrt(x) is a negative real value.  */
1248   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1249   dconst1_3 = real_value_truncate (mode, dconst_third ());
1250 
1251   if (flag_unsafe_math_optimizations
1252       && cbrtfn
1253       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1254       && REAL_VALUES_EQUAL (c, dconst1_3))
1255     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1256 
1257   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1258      if we don't have a hardware sqrt insn.  */
1259   dconst1_6 = dconst1_3;
1260   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1261 
1262   if (flag_unsafe_math_optimizations
1263       && sqrtfn
1264       && cbrtfn
1265       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1266       && optimize_function_for_speed_p (cfun)
1267       && hw_sqrt_exists
1268       && REAL_VALUES_EQUAL (c, dconst1_6))
1269     {
1270       /* sqrt(x)  */
1271       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1272 
1273       /* cbrt(sqrt(x))  */
1274       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1275     }
1276 
1277   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1278      and c not an integer, into
1279 
1280        sqrt(x) * powi(x, n/2),                n > 0;
1281        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1282 
1283      Do not calculate the powi factor when n/2 = 0.  */
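  /* For example, pow (x, 2.5) has n = 5 and becomes
     sqrt (x) * powi (x, 2), while pow (x, -2.5) becomes
     1.0 / (sqrt (x) * powi (x, 2)).  */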
1284   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1285   n = real_to_integer (&c2);
1286   real_from_integer (&cint, VOIDmode, n, SIGNED);
1287   c2_is_int = real_identical (&c2, &cint);
1288 
1289   if (flag_unsafe_math_optimizations
1290       && sqrtfn
1291       && c2_is_int
1292       && !c_is_int
1293       && optimize_function_for_speed_p (cfun))
1294     {
1295       tree powi_x_ndiv2 = NULL_TREE;
1296 
1297       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1298          possible or profitable, give up.  Skip the degenerate case when
1299          n is 1 or -1, where the result is always 1.  */
1300       if (absu_hwi (n) != 1)
1301 	{
1302 	  powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1303 						     abs_hwi (n / 2));
1304 	  if (!powi_x_ndiv2)
1305 	    return NULL_TREE;
1306 	}
1307 
1308       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1309 	 result of the optimal multiply sequence just calculated.  */
1310       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1311 
1312       if (absu_hwi (n) == 1)
1313 	result = sqrt_arg0;
1314       else
1315 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1316 					 sqrt_arg0, powi_x_ndiv2);
1317 
1318       /* If n is negative, reciprocate the result.  */
1319       if (n < 0)
1320 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1321 					 build_real (type, dconst1), result);
1322       return result;
1323     }
1324 
1325   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1326 
1327      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1328      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1329 
1330      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1331      different from pow(x, 1./3.) due to rounding and behavior with
1332      negative x, we need to constrain this transformation to unsafe
1333      math and positive x or finite math.  */
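  /* For example, pow (x, 5./3.) has n = 5 and becomes
     powi (x, 1) * (cbrt (x) * cbrt (x)), i.e. x * cbrt (x) * cbrt (x).  */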
1334   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
1335   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1336   real_round (&c2, mode, &c2);
1337   n = real_to_integer (&c2);
1338   real_from_integer (&cint, VOIDmode, n, SIGNED);
1339   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1340   real_convert (&c2, mode, &c2);
1341 
1342   if (flag_unsafe_math_optimizations
1343       && cbrtfn
1344       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1345       && real_identical (&c2, &c)
1346       && !c2_is_int
1347       && optimize_function_for_speed_p (cfun)
1348       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1349     {
1350       tree powi_x_ndiv3 = NULL_TREE;
1351 
1352       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1353          possible or profitable, give up.  Skip the degenerate case when
1354          abs(n) < 3, where the result is always 1.  */
1355       if (absu_hwi (n) >= 3)
1356 	{
1357 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1358 						     abs_hwi (n / 3));
1359 	  if (!powi_x_ndiv3)
1360 	    return NULL_TREE;
1361 	}
1362 
1363       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1364          as that creates an unnecessary variable.  Instead, just produce
1365          either cbrt(x) or cbrt(x) * cbrt(x).  */
1366       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1367 
1368       if (absu_hwi (n) % 3 == 1)
1369 	powi_cbrt_x = cbrt_x;
1370       else
1371 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1372 					      cbrt_x, cbrt_x);
1373 
1374       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1375       if (absu_hwi (n) < 3)
1376 	result = powi_cbrt_x;
1377       else
1378 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1379 					 powi_x_ndiv3, powi_cbrt_x);
1380 
1381       /* If n is negative, reciprocate the result.  */
1382       if (n < 0)
1383 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1384 					 build_real (type, dconst1), result);
1385 
1386       return result;
1387     }
1388 
1389   /* No optimizations succeeded.  */
1390   return NULL_TREE;
1391 }
1392 
1393 /* ARG is the argument to a cabs builtin call in GSI with location info
1394    LOC.  Create a sequence of statements prior to GSI that calculates
1395    sqrt(R*R + I*I), where R and I are the real and imaginary components
1396    of ARG, respectively.  Return an expression holding the result.  */
1397 
1398 static tree
1399 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1400 {
1401   tree real_part, imag_part, addend1, addend2, sum, result;
1402   tree type = TREE_TYPE (TREE_TYPE (arg));
1403   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1404   machine_mode mode = TYPE_MODE (type);
1405 
1406   if (!flag_unsafe_math_optimizations
1407       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1408       || !sqrtfn
1409       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1410     return NULL_TREE;
1411 
1412   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1413 				    REALPART_EXPR, arg);
1414   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1415 				    real_part, real_part);
1416   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1417 				    IMAGPART_EXPR, arg);
1418   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1419 				    imag_part, imag_part);
1420   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1421   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1422 
1423   return result;
1424 }
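
/* For instance, for ARG = 3.0 + 4.0i the emitted sequence evaluates
   sqrt (3.0*3.0 + 4.0*4.0), i.e. 5.0.  */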
1425 
1426 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1427    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1428    an optimal number of multiplies, when n is a constant.  */
1429 
1430 namespace {
1431 
1432 const pass_data pass_data_cse_sincos =
1433 {
1434   GIMPLE_PASS, /* type */
1435   "sincos", /* name */
1436   OPTGROUP_NONE, /* optinfo_flags */
1437   TV_NONE, /* tv_id */
1438   PROP_ssa, /* properties_required */
1439   0, /* properties_provided */
1440   0, /* properties_destroyed */
1441   0, /* todo_flags_start */
1442   TODO_update_ssa, /* todo_flags_finish */
1443 };
1444 
1445 class pass_cse_sincos : public gimple_opt_pass
1446 {
1447 public:
1448   pass_cse_sincos (gcc::context *ctxt)
1449     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
1450   {}
1451 
1452   /* opt_pass methods: */
1453   virtual bool gate (function *)
1454     {
1455       /* We no longer require either sincos or cexp, since powi expansion
1456 	 piggybacks on this pass.  */
1457       return optimize;
1458     }
1459 
1460   virtual unsigned int execute (function *);
1461 
1462 }; // class pass_cse_sincos
1463 
1464 unsigned int
1465 pass_cse_sincos::execute (function *fun)
1466 {
1467   basic_block bb;
1468   bool cfg_changed = false;
1469 
1470   calculate_dominance_info (CDI_DOMINATORS);
1471   memset (&sincos_stats, 0, sizeof (sincos_stats));
1472 
1473   FOR_EACH_BB_FN (bb, fun)
1474     {
1475       gimple_stmt_iterator gsi;
1476       bool cleanup_eh = false;
1477 
1478       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1479         {
1480 	  gimple stmt = gsi_stmt (gsi);
1481 	  tree fndecl;
1482 
1483 	  /* Only the last stmt in a bb could throw, no need to call
1484 	     gimple_purge_dead_eh_edges if we change something in the middle
1485 	     of a basic block.  */
1486 	  cleanup_eh = false;
1487 
1488 	  if (is_gimple_call (stmt)
1489 	      && gimple_call_lhs (stmt)
1490 	      && (fndecl = gimple_call_fndecl (stmt))
1491 	      && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
1492 	    {
1493 	      tree arg, arg0, arg1, result;
1494 	      HOST_WIDE_INT n;
1495 	      location_t loc;
1496 
1497 	      switch (DECL_FUNCTION_CODE (fndecl))
1498 		{
1499 		CASE_FLT_FN (BUILT_IN_COS):
1500 		CASE_FLT_FN (BUILT_IN_SIN):
1501 		CASE_FLT_FN (BUILT_IN_CEXPI):
1502 		  /* Make sure we have either sincos or cexp.  */
1503 		  if (!targetm.libc_has_function (function_c99_math_complex)
1504 		      && !targetm.libc_has_function (function_sincos))
1505 		    break;
1506 
1507 		  arg = gimple_call_arg (stmt, 0);
1508 		  if (TREE_CODE (arg) == SSA_NAME)
1509 		    cfg_changed |= execute_cse_sincos_1 (arg);
1510 		  break;
1511 
1512 		CASE_FLT_FN (BUILT_IN_POW):
1513 		  arg0 = gimple_call_arg (stmt, 0);
1514 		  arg1 = gimple_call_arg (stmt, 1);
1515 
1516 		  loc = gimple_location (stmt);
1517 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1518 
1519 		  if (result)
1520 		    {
1521 		      tree lhs = gimple_get_lhs (stmt);
1522 		      gassign *new_stmt = gimple_build_assign (lhs, result);
1523 		      gimple_set_location (new_stmt, loc);
1524 		      unlink_stmt_vdef (stmt);
1525 		      gsi_replace (&gsi, new_stmt, true);
1526 		      cleanup_eh = true;
1527 		      if (gimple_vdef (stmt))
1528 			release_ssa_name (gimple_vdef (stmt));
1529 		    }
1530 		  break;
1531 
1532 		CASE_FLT_FN (BUILT_IN_POWI):
1533 		  arg0 = gimple_call_arg (stmt, 0);
1534 		  arg1 = gimple_call_arg (stmt, 1);
1535 		  loc = gimple_location (stmt);
1536 
1537 		  if (real_minus_onep (arg0))
1538 		    {
1539 		      tree t0, t1, cond, one, minus_one;
1540 		      gassign *stmt;
1541 
1542 		      t0 = TREE_TYPE (arg0);
1543 		      t1 = TREE_TYPE (arg1);
1544 		      one = build_real (t0, dconst1);
1545 		      minus_one = build_real (t0, dconstm1);
1546 
1547 		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
1548 		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
1549 						  arg1, build_int_cst (t1, 1));
1550 		      gimple_set_location (stmt, loc);
1551 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1552 
1553 		      result = make_temp_ssa_name (t0, NULL, "powi");
1554 		      stmt = gimple_build_assign (result, COND_EXPR, cond,
1555 						  minus_one, one);
1556 		      gimple_set_location (stmt, loc);
1557 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
1558 		    }
1559 		  else
1560 		    {
1561 		      if (!tree_fits_shwi_p (arg1))
1562 			break;
1563 
1564 		      n = tree_to_shwi (arg1);
1565 		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1566 		    }
1567 
1568 		  if (result)
1569 		    {
1570 		      tree lhs = gimple_get_lhs (stmt);
1571 		      gassign *new_stmt = gimple_build_assign (lhs, result);
1572 		      gimple_set_location (new_stmt, loc);
1573 		      unlink_stmt_vdef (stmt);
1574 		      gsi_replace (&gsi, new_stmt, true);
1575 		      cleanup_eh = true;
1576 		      if (gimple_vdef (stmt))
1577 			release_ssa_name (gimple_vdef (stmt));
1578 		    }
1579 		  break;
1580 
1581 		CASE_FLT_FN (BUILT_IN_CABS):
1582 		  arg0 = gimple_call_arg (stmt, 0);
1583 		  loc = gimple_location (stmt);
1584 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1585 
1586 		  if (result)
1587 		    {
1588 		      tree lhs = gimple_get_lhs (stmt);
1589 		      gassign *new_stmt = gimple_build_assign (lhs, result);
1590 		      gimple_set_location (new_stmt, loc);
1591 		      unlink_stmt_vdef (stmt);
1592 		      gsi_replace (&gsi, new_stmt, true);
1593 		      cleanup_eh = true;
1594 		      if (gimple_vdef (stmt))
1595 			release_ssa_name (gimple_vdef (stmt));
1596 		    }
1597 		  break;
1598 
1599 		default:;
1600 		}
1601 	    }
1602 	}
1603       if (cleanup_eh)
1604 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
1605     }
1606 
1607   statistics_counter_event (fun, "sincos statements inserted",
1608 			    sincos_stats.inserted);
1609 
1610   free_dominance_info (CDI_DOMINATORS);
1611   return cfg_changed ? TODO_cleanup_cfg : 0;
1612 }
1613 
1614 } // anon namespace
1615 
1616 gimple_opt_pass *
1617 make_pass_cse_sincos (gcc::context *ctxt)
1618 {
1619   return new pass_cse_sincos (ctxt);
1620 }
1621 
1622 /* A symbolic number structure is used to detect byte permutation and selection
1623    patterns of a source.  To achieve that, its field N contains an artificial
1624    number consisting of BITS_PER_MARKER sized markers tracking where each
1625    byte comes from in the source:
1626 
1627    0	   - target byte has the value 0
1628    FF	   - target byte has an unknown value (e.g. due to sign extension)
1629    1..size - marker value is one plus the byte index in the source (0 for lsb).
1630 
1631    To detect permutations on memory sources (arrays and structures), a symbolic
1632    number is also associated:
1633    - a base address BASE_ADDR and an OFFSET giving the address of the source;
1634    - a range which gives the difference between the highest and lowest accessed
1635      memory location to make such a symbolic number;
1636    - the address SRC of the source element of lowest address as a convenience
1637      to easily get BASE_ADDR + offset + lowest bytepos.
1638 
1639    Note 1: the range is different from size as size reflects the size of the
1640    type of the current expression.  For instance, for an array char a[],
1641    (short) a[0] | (short) a[3] would have a size of 2 but a range of 4 while
1642    (short) a[0] | ((short) a[0] << 1) would still have a size of 2 but this
1643    (short) a[0] | ((short) a[0] << 8) would still have a size of 2 but this
1644 
1645    Note 2: for non-memory sources, range holds the same value as size.
1646 
1647    Note 3: SRC points to the SSA_NAME in case of non-memory source.  */
1648 
1649 struct symbolic_number {
1650   uint64_t n;
1651   tree type;
1652   tree base_addr;
1653   tree offset;
1654   HOST_WIDE_INT bytepos;
1655   tree src;
1656   tree alias_set;
1657   tree vuse;
1658   unsigned HOST_WIDE_INT range;
1659 };
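
/* Worked illustration (added for exposition; it assumes BITS_PER_MARKER == 8
   and a little-endian target): for a plain 4-byte load such as

	uint32_t v = *(uint32_t *) (p + 2);

   the symbolic number for V would have N == 0x04030201 (byte I of the result
   comes from byte I of the source), BASE_ADDR == p, BYTEPOS == 2, RANGE == 4
   and TYPE == uint32_t.  */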
1660 
1661 #define BITS_PER_MARKER 8
1662 #define MARKER_MASK ((1 << BITS_PER_MARKER) - 1)
1663 #define MARKER_BYTE_UNKNOWN MARKER_MASK
1664 #define HEAD_MARKER(n, size) \
1665   ((n) & ((uint64_t) MARKER_MASK << (((size) - 1) * BITS_PER_MARKER)))
1666 
1667 /* The number which the find_bswap_or_nop_1 result should match in
1668    order to have a nop.  The number is masked according to the size of
1669    the symbolic number before using it.  */
1670 #define CMPNOP (sizeof (int64_t) < 8 ? 0 : \
1671   (uint64_t)0x08070605 << 32 | 0x04030201)
1672 
1673 /* The number which the find_bswap_or_nop_1 result should match in
1674    order to have a byte swap.  The number is masked according to the
1675    size of the symbolic number before using it.  */
1676 #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \
1677   (uint64_t)0x01020304 << 32 | 0x05060708)
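
/* For example, for a 2-byte symbolic number the comparison values are
   narrowed the same way find_bswap_or_nop below does it:
   CMPNOP & 0xffff == 0x0201 (a nop) and CMPXCHG >> 48 == 0x0102 (a byte
   swap).  */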
1678 
1679 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1680    number N.  Return false if the requested operation is not permitted
1681    on a symbolic number.  */
1682 
1683 static inline bool
1684 do_shift_rotate (enum tree_code code,
1685 		 struct symbolic_number *n,
1686 		 int count)
1687 {
1688   int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1689   unsigned head_marker;
1690 
1691   if (count % BITS_PER_UNIT != 0)
1692     return false;
1693   count = (count / BITS_PER_UNIT) * BITS_PER_MARKER;
1694 
1695   /* Zero out the extra bits of N in order to avoid them being shifted
1696      into the significant bits.  */
1697   if (size < 64 / BITS_PER_MARKER)
1698     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1699 
1700   switch (code)
1701     {
1702     case LSHIFT_EXPR:
1703       n->n <<= count;
1704       break;
1705     case RSHIFT_EXPR:
1706       head_marker = HEAD_MARKER (n->n, size);
1707       n->n >>= count;
1708       /* Arithmetic shift of signed type: result is dependent on the value.  */
1709       if (!TYPE_UNSIGNED (n->type) && head_marker)
1710 	for (i = 0; i < count / BITS_PER_MARKER; i++)
1711 	  n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
1712 		  << ((size - 1 - i) * BITS_PER_MARKER);
1713       break;
1714     case LROTATE_EXPR:
1715       n->n = (n->n << count) | (n->n >> ((size * BITS_PER_MARKER) - count));
1716       break;
1717     case RROTATE_EXPR:
1718       n->n = (n->n >> count) | (n->n << ((size * BITS_PER_MARKER) - count));
1719       break;
1720     default:
1721       return false;
1722     }
1723   /* Zero unused bits for size.  */
1724   if (size < 64 / BITS_PER_MARKER)
1725     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1726   return true;
1727 }
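
/* Illustration of the marker arithmetic above (assuming BITS_PER_MARKER == 8):
   for a 4-byte unsigned symbolic number N == 0x04030201, an LSHIFT_EXPR by
   8 bits becomes a shift by one marker and yields N == 0x03020100: the low
   byte of the result is known to be zero and every other byte moved up one
   position, exactly what "x << 8" does to the value.  For a signed type an
   RSHIFT_EXPR instead replicates MARKER_BYTE_UNKNOWN into the head markers,
   since the shifted-in bytes depend on the sign bit.  */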
1728 
1729 /* Perform sanity checking for the symbolic number N and the gimple
1730    statement STMT.  */
1731 
1732 static inline bool
1733 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1734 {
1735   tree lhs_type;
1736 
1737   lhs_type = gimple_expr_type (stmt);
1738 
1739   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1740     return false;
1741 
1742   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1743     return false;
1744 
1745   return true;
1746 }
1747 
1748 /* Initialize the symbolic number N for the bswap pass from the base element
1749    SRC manipulated by the bitwise OR expression.  */
1750 
1751 static bool
1752 init_symbolic_number (struct symbolic_number *n, tree src)
1753 {
1754   int size;
1755 
1756   n->base_addr = n->offset = n->alias_set = n->vuse = NULL_TREE;
1757   n->src = src;
1758 
1759   /* Set up the symbolic number N by setting each byte to a value between 1 and
1760      the byte size of rhs1.  The highest order byte is set to that size and the
1761      lowest order byte to 1.  */
1762   n->type = TREE_TYPE (src);
1763   size = TYPE_PRECISION (n->type);
1764   if (size % BITS_PER_UNIT != 0)
1765     return false;
1766   size /= BITS_PER_UNIT;
1767   if (size > 64 / BITS_PER_MARKER)
1768     return false;
1769   n->range = size;
1770   n->n = CMPNOP;
1771 
1772   if (size < 64 / BITS_PER_MARKER)
1773     n->n &= ((uint64_t) 1 << (size * BITS_PER_MARKER)) - 1;
1774 
1775   return true;
1776 }
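
/* For instance, an 8-bit SRC gives N == 0x01 with a range of 1 and a 64-bit
   SRC gives N == 0x0807060504030201 with a range of 8; a source wider than
   64 / BITS_PER_MARKER bytes cannot be tracked and makes the function return
   false.  */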
1777 
1778 /* Check if STMT might be a byte swap or a nop from a memory source and return
1779    the answer.  If so, REF is that memory source and the base of the memory area
1780    accessed and the offset of the access from that base are recorded in N.  */
1781 
1782 bool
1783 find_bswap_or_nop_load (gimple stmt, tree ref, struct symbolic_number *n)
1784 {
1785   /* Leaf node is an array or component ref.  Memorize its base and
1786      offset from base to compare to other such leaf nodes.  */
1787   HOST_WIDE_INT bitsize, bitpos;
1788   machine_mode mode;
1789   int unsignedp, volatilep;
1790   tree offset, base_addr;
1791 
1792   /* Not prepared to handle PDP endian.  */
1793   if (BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
1794     return false;
1795 
1796   if (!gimple_assign_load_p (stmt) || gimple_has_volatile_ops (stmt))
1797     return false;
1798 
1799   base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode,
1800 				   &unsignedp, &volatilep, false);
1801 
1802   if (TREE_CODE (base_addr) == MEM_REF)
1803     {
1804       offset_int bit_offset = 0;
1805       tree off = TREE_OPERAND (base_addr, 1);
1806 
1807       if (!integer_zerop (off))
1808 	{
1809 	  offset_int boff, coff = mem_ref_offset (base_addr);
1810 	  boff = wi::lshift (coff, LOG2_BITS_PER_UNIT);
1811 	  bit_offset += boff;
1812 	}
1813 
1814       base_addr = TREE_OPERAND (base_addr, 0);
1815 
1816       /* Avoid returning a negative bitpos as this may wreak havoc later.  */
1817       if (wi::neg_p (bit_offset))
1818 	{
1819 	  offset_int mask = wi::mask <offset_int> (LOG2_BITS_PER_UNIT, false);
1820 	  offset_int tem = bit_offset.and_not (mask);
1821 	  /* TEM is the bitpos rounded to BITS_PER_UNIT towards -Inf.
1822	     Subtract it from BIT_OFFSET and add it (scaled) to OFFSET.  */
1823 	  bit_offset -= tem;
1824 	  tem = wi::arshift (tem, LOG2_BITS_PER_UNIT);
1825 	  if (offset)
1826 	    offset = size_binop (PLUS_EXPR, offset,
1827 				    wide_int_to_tree (sizetype, tem));
1828 	  else
1829 	    offset = wide_int_to_tree (sizetype, tem);
1830 	}
1831 
1832       bitpos += bit_offset.to_shwi ();
1833     }
1834 
1835   if (bitpos % BITS_PER_UNIT)
1836     return false;
1837   if (bitsize % BITS_PER_UNIT)
1838     return false;
1839 
1840   if (!init_symbolic_number (n, ref))
1841     return false;
1842   n->base_addr = base_addr;
1843   n->offset = offset;
1844   n->bytepos = bitpos / BITS_PER_UNIT;
1845   n->alias_set = reference_alias_ptr_type (ref);
1846   n->vuse = gimple_vuse (stmt);
1847   return true;
1848 }
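
/* Illustration: for a statement loading the single byte A[5] out of
   "unsigned char a[8]", the recorded information is BASE_ADDR == a,
   BYTEPOS == 5, RANGE == 1 and N == 0x01.  Loads of a[4] and a[6] share the
   same base and differ only in BYTEPOS, which is what perform_symbolic_merge
   below relies on.  */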
1849 
1850 /* Compute the symbolic number N representing the result of a bitwise OR on two
1851    symbolic numbers N1 and N2 whose source statements are respectively
1852    SOURCE_STMT1 and SOURCE_STMT2.  */
1853 
1854 static gimple
1855 perform_symbolic_merge (gimple source_stmt1, struct symbolic_number *n1,
1856 			gimple source_stmt2, struct symbolic_number *n2,
1857 			struct symbolic_number *n)
1858 {
1859   int i, size;
1860   uint64_t mask;
1861   gimple source_stmt;
1862   struct symbolic_number *n_start;
1863 
1864   /* Sources are different; cancel bswap if they are not memory locations with
1865      the same base (array, structure, ...).  */
1866   if (gimple_assign_rhs1 (source_stmt1) != gimple_assign_rhs1 (source_stmt2))
1867     {
1868       uint64_t inc;
1869       HOST_WIDE_INT start_sub, end_sub, end1, end2, end;
1870       struct symbolic_number *toinc_n_ptr, *n_end;
1871       basic_block bb1, bb2;
1872 
1873       if (!n1->base_addr || !n2->base_addr
1874 	  || !operand_equal_p (n1->base_addr, n2->base_addr, 0))
1875 	return NULL;
1876 
1877       if (!n1->offset != !n2->offset
1878 	  || (n1->offset && !operand_equal_p (n1->offset, n2->offset, 0)))
1879 	return NULL;
1880 
1881       if (n1->bytepos < n2->bytepos)
1882 	{
1883 	  n_start = n1;
1884 	  start_sub = n2->bytepos - n1->bytepos;
1885 	}
1886       else
1887 	{
1888 	  n_start = n2;
1889 	  start_sub = n1->bytepos - n2->bytepos;
1890 	}
1891 
1892       bb1 = gimple_bb (source_stmt1);
1893       bb2 = gimple_bb (source_stmt2);
1894       if (dominated_by_p (CDI_DOMINATORS, bb1, bb2))
1895 	source_stmt = source_stmt1;
1896       else
1897 	source_stmt = source_stmt2;
1898 
1899       /* Find the highest address at which a load is performed and
1900 	 compute related info.  */
1901       end1 = n1->bytepos + (n1->range - 1);
1902       end2 = n2->bytepos + (n2->range - 1);
1903       if (end1 < end2)
1904 	{
1905 	  end = end2;
1906 	  end_sub = end2 - end1;
1907 	}
1908       else
1909 	{
1910 	  end = end1;
1911 	  end_sub = end1 - end2;
1912 	}
1913       n_end = (end2 > end1) ? n2 : n1;
1914 
1915       /* Find the symbolic number whose lsb is the most significant.  */
1916       if (BYTES_BIG_ENDIAN)
1917 	toinc_n_ptr = (n_end == n1) ? n2 : n1;
1918       else
1919 	toinc_n_ptr = (n_start == n1) ? n2 : n1;
1920 
1921       n->range = end - n_start->bytepos + 1;
1922 
1923       /* Check that the range of memory covered can be represented by
1924 	 a symbolic number.  */
1925       if (n->range > 64 / BITS_PER_MARKER)
1926 	return NULL;
1927 
1928       /* Reinterpret the byte markers in the symbolic number holding the
1929	 value of bigger weight according to the target endianness.  */
1930       inc = BYTES_BIG_ENDIAN ? end_sub : start_sub;
1931       size = TYPE_PRECISION (n1->type) / BITS_PER_UNIT;
1932       for (i = 0; i < size; i++, inc <<= BITS_PER_MARKER)
1933 	{
1934 	  unsigned marker
1935 	    = (toinc_n_ptr->n >> (i * BITS_PER_MARKER)) & MARKER_MASK;
1936 	  if (marker && marker != MARKER_BYTE_UNKNOWN)
1937 	    toinc_n_ptr->n += inc;
1938 	}
1939     }
1940   else
1941     {
1942       n->range = n1->range;
1943       n_start = n1;
1944       source_stmt = source_stmt1;
1945     }
1946 
1947   if (!n1->alias_set
1948       || alias_ptr_types_compatible_p (n1->alias_set, n2->alias_set))
1949     n->alias_set = n1->alias_set;
1950   else
1951     n->alias_set = ptr_type_node;
1952   n->vuse = n_start->vuse;
1953   n->base_addr = n_start->base_addr;
1954   n->offset = n_start->offset;
1955   n->src = n_start->src;
1956   n->bytepos = n_start->bytepos;
1957   n->type = n_start->type;
1958   size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1959 
1960   for (i = 0, mask = MARKER_MASK; i < size; i++, mask <<= BITS_PER_MARKER)
1961     {
1962       uint64_t masked1, masked2;
1963 
1964       masked1 = n1->n & mask;
1965       masked2 = n2->n & mask;
1966       if (masked1 && masked2 && masked1 != masked2)
1967 	return NULL;
1968     }
1969   n->n = n1->n | n2->n;
1970 
1971   return source_stmt;
1972 }
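
/* Worked example (little endian, BITS_PER_MARKER == 8): when merging the two
   sides of "(int) a[0] | ((int) a[1] << 8)", N1 is 0x01 with bytepos 0 and N2
   is 0x0100 with bytepos 1.  N_START is N1 and START_SUB is 1, so the nonzero
   marker of N2 is incremented from 1 to 2, turning N2 into 0x0200; the final
   OR gives N == 0x0201 with a range of 2, which find_bswap_or_nop below
   recognizes as a 16-bit load in target endianness.  */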
1973 
1974 /* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform
1975    the operation given by the rhs of STMT on the result.  If the operation
1976    could successfully be executed, the function returns a gimple stmt whose
1977    rhs's first tree is the expression of the source operand; otherwise it
1978    returns NULL.  */
1979 
1980 static gimple
1981 find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit)
1982 {
1983   enum tree_code code;
1984   tree rhs1, rhs2 = NULL;
1985   gimple rhs1_stmt, rhs2_stmt, source_stmt1;
1986   enum gimple_rhs_class rhs_class;
1987 
1988   if (!limit || !is_gimple_assign (stmt))
1989     return NULL;
1990 
1991   rhs1 = gimple_assign_rhs1 (stmt);
1992 
1993   if (find_bswap_or_nop_load (stmt, rhs1, n))
1994     return stmt;
1995 
1996   if (TREE_CODE (rhs1) != SSA_NAME)
1997     return NULL;
1998 
1999   code = gimple_assign_rhs_code (stmt);
2000   rhs_class = gimple_assign_rhs_class (stmt);
2001   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2002 
2003   if (rhs_class == GIMPLE_BINARY_RHS)
2004     rhs2 = gimple_assign_rhs2 (stmt);
2005 
2006   /* Handle unary rhs and binary rhs with integer constants as second
2007      operand.  */
2008 
2009   if (rhs_class == GIMPLE_UNARY_RHS
2010       || (rhs_class == GIMPLE_BINARY_RHS
2011 	  && TREE_CODE (rhs2) == INTEGER_CST))
2012     {
2013       if (code != BIT_AND_EXPR
2014 	  && code != LSHIFT_EXPR
2015 	  && code != RSHIFT_EXPR
2016 	  && code != LROTATE_EXPR
2017 	  && code != RROTATE_EXPR
2018 	  && !CONVERT_EXPR_CODE_P (code))
2019 	return NULL;
2020 
2021       source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1);
2022 
2023       /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and
2024 	 we have to initialize the symbolic number.  */
2025       if (!source_stmt1)
2026 	{
2027 	  if (gimple_assign_load_p (stmt)
2028 	      || !init_symbolic_number (n, rhs1))
2029 	    return NULL;
2030 	  source_stmt1 = stmt;
2031 	}
2032 
2033       switch (code)
2034 	{
2035 	case BIT_AND_EXPR:
2036 	  {
2037 	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2038 	    uint64_t val = int_cst_value (rhs2), mask = 0;
2039 	    uint64_t tmp = (1 << BITS_PER_UNIT) - 1;
2040 
2041 	    /* Only constants masking full bytes are allowed.  */
2042 	    for (i = 0; i < size; i++, tmp <<= BITS_PER_UNIT)
2043 	      if ((val & tmp) != 0 && (val & tmp) != tmp)
2044 		return NULL;
2045 	      else if (val & tmp)
2046 		mask |= (uint64_t) MARKER_MASK << (i * BITS_PER_MARKER);
2047 
2048 	    n->n &= mask;
2049 	  }
2050 	  break;
2051 	case LSHIFT_EXPR:
2052 	case RSHIFT_EXPR:
2053 	case LROTATE_EXPR:
2054 	case RROTATE_EXPR:
2055 	  if (!do_shift_rotate (code, n, (int) TREE_INT_CST_LOW (rhs2)))
2056 	    return NULL;
2057 	  break;
2058 	CASE_CONVERT:
2059 	  {
2060 	    int i, type_size, old_type_size;
2061 	    tree type;
2062 
2063 	    type = gimple_expr_type (stmt);
2064 	    type_size = TYPE_PRECISION (type);
2065 	    if (type_size % BITS_PER_UNIT != 0)
2066 	      return NULL;
2067 	    type_size /= BITS_PER_UNIT;
2068 	    if (type_size > 64 / BITS_PER_MARKER)
2069 	      return NULL;
2070 
2071 	    /* Sign extension: result is dependent on the value.  */
2072 	    old_type_size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
2073 	    if (!TYPE_UNSIGNED (n->type) && type_size > old_type_size
2074 		&& HEAD_MARKER (n->n, old_type_size))
2075 	      for (i = 0; i < type_size - old_type_size; i++)
2076 		n->n |= (uint64_t) MARKER_BYTE_UNKNOWN
2077 			<< ((type_size - 1 - i) * BITS_PER_MARKER);
2078 
2079 	    if (type_size < 64 / BITS_PER_MARKER)
2080 	      {
2081 		/* If STMT casts to a smaller type mask out the bits not
2082 		   belonging to the target type.  */
2083 		n->n &= ((uint64_t) 1 << (type_size * BITS_PER_MARKER)) - 1;
2084 	      }
2085 	    n->type = type;
2086 	    if (!n->base_addr)
2087 	      n->range = type_size;
2088 	  }
2089 	  break;
2090 	default:
2091 	  return NULL;
2092 	};
2093       return verify_symbolic_number_p (n, stmt) ? source_stmt1 : NULL;
2094     }
2095 
2096   /* Handle binary rhs.  */
2097 
2098   if (rhs_class == GIMPLE_BINARY_RHS)
2099     {
2100       struct symbolic_number n1, n2;
2101       gimple source_stmt, source_stmt2;
2102 
2103       if (code != BIT_IOR_EXPR)
2104 	return NULL;
2105 
2106       if (TREE_CODE (rhs2) != SSA_NAME)
2107 	return NULL;
2108 
2109       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2110 
2111       switch (code)
2112 	{
2113 	case BIT_IOR_EXPR:
2114 	  source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1);
2115 
2116 	  if (!source_stmt1)
2117 	    return NULL;
2118 
2119 	  source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1);
2120 
2121 	  if (!source_stmt2)
2122 	    return NULL;
2123 
2124 	  if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
2125 	    return NULL;
2126 
2127 	  if (!n1.vuse != !n2.vuse
2128 	      || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0)))
2129 	    return NULL;
2130 
2131 	  source_stmt
2132 	    = perform_symbolic_merge (source_stmt1, &n1, source_stmt2, &n2, n);
2133 
2134 	  if (!source_stmt)
2135 	    return NULL;
2136 
2137 	  if (!verify_symbolic_number_p (n, stmt))
2138 	    return NULL;
2139 
2140 	  break;
2141 	default:
2142 	  return NULL;
2143 	}
2144       return source_stmt;
2145     }
2146   return NULL;
2147 }
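
/* Illustration of the BIT_AND_EXPR handling above (BITS_PER_MARKER == 8):
   with N == 0x04030201, a statement computing "x & 0xff00ffff" keeps the
   markers of the fully preserved bytes and zeroes byte 2, giving
   N == 0x04000201, while a constant like 0xf0ffff masks a partial byte and
   makes the recognition bail out.  */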
2148 
2149 /* Check if STMT completes a bswap implementation or a read in a given
2150    endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP
2151    accordingly.  It also sets N to represent the kind of operations
2152    performed: size of the resulting expression and whether it works on
2153    a memory source, and if so its alias set and vuse.  Finally, the
2154    function returns a stmt whose rhs's first tree is the source
2155    expression.  */
2156 
2157 static gimple
2158 find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap)
2159 {
2160   /* The number which the find_bswap_or_nop_1 result should match in order
2161      to have a full byte swap.  The number is shifted to the right
2162      according to the size of the symbolic number before using it.  */
2163   uint64_t cmpxchg = CMPXCHG;
2164   uint64_t cmpnop = CMPNOP;
2165 
2166   gimple ins_stmt;
2167   int limit;
2168 
2169   /* The last parameter determines the search depth limit.  It usually
2170      correlates directly to the number n of bytes to be touched.  We
2171      increase that number by log2(n) + 1 here in order to also
2172      cover signed -> unsigned conversions of the src operand as can be seen
2173    in libgcc, and for the initial shift/and operation of the src operand.  */
2174   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
2175   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
2176   ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
2177 
2178   if (!ins_stmt)
2179     return NULL;
2180 
2181   /* Find real size of result (highest non-zero byte).  */
2182   if (n->base_addr)
2183     {
2184       unsigned HOST_WIDE_INT rsize;
2185       uint64_t tmpn;
2186 
2187       for (tmpn = n->n, rsize = 0; tmpn; tmpn >>= BITS_PER_MARKER, rsize++);
2188       if (BYTES_BIG_ENDIAN && n->range != rsize)
2189 	/* This implies an offset, which is currently not handled by
2190 	   bswap_replace.  */
2191 	return NULL;
2192       n->range = rsize;
2193     }
2194 
2195   /* Zero out the extra bits of N and CMP*.  */
2196   if (n->range < (int) sizeof (int64_t))
2197     {
2198       uint64_t mask;
2199 
2200       mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1;
2201       cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER;
2202       cmpnop &= mask;
2203     }
2204 
2205   /* A complete byte swap should make the symbolic number start with
2206      the largest digit in the highest order byte.  An unchanged symbolic
2207      number indicates a read with the same endianness as the target.  */
2208   if (n->n == cmpnop)
2209     *bswap = false;
2210   else if (n->n == cmpxchg)
2211     *bswap = true;
2212   else
2213     return NULL;
2214 
2215   /* Useless bit manipulation performed by the code: a nop without a load is not worth replacing.  */
2216   if (!n->base_addr && n->n == cmpnop)
2217     return NULL;
2218 
2219   n->range *= BITS_PER_UNIT;
2220   return ins_stmt;
2221 }
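
/* For example (BITS_PER_MARKER == 8), the classic open-coded 32-bit swap

	((x & 0x000000ff) << 24) | ((x & 0x0000ff00) << 8)
	| ((x & 0x00ff0000) >> 8) | ((x & 0xff000000) >> 24)

   accumulates the symbolic number 0x01020304 == CMPXCHG, so *BSWAP is set to
   true, whereas a sequence reassembling the bytes in source order accumulates
   CMPNOP and is reported as a nop.  */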
2222 
2223 namespace {
2224 
2225 const pass_data pass_data_optimize_bswap =
2226 {
2227   GIMPLE_PASS, /* type */
2228   "bswap", /* name */
2229   OPTGROUP_NONE, /* optinfo_flags */
2230   TV_NONE, /* tv_id */
2231   PROP_ssa, /* properties_required */
2232   0, /* properties_provided */
2233   0, /* properties_destroyed */
2234   0, /* todo_flags_start */
2235   0, /* todo_flags_finish */
2236 };
2237 
2238 class pass_optimize_bswap : public gimple_opt_pass
2239 {
2240 public:
2241   pass_optimize_bswap (gcc::context *ctxt)
2242     : gimple_opt_pass (pass_data_optimize_bswap, ctxt)
2243   {}
2244 
2245   /* opt_pass methods: */
2246   virtual bool gate (function *)
2247     {
2248       return flag_expensive_optimizations && optimize;
2249     }
2250 
2251   virtual unsigned int execute (function *);
2252 
2253 }; // class pass_optimize_bswap
2254 
2255 /* Perform the bswap optimization: replace the expression computed in the rhs
2256    of CUR_STMT by an equivalent bswap, load or load + bswap expression.
2257    Which of these alternatives replaces the rhs is given by N->base_addr (non
2258    null if a load is needed) and BSWAP.  The type, VUSE and alias set of the
2259    load to perform are also given in N while the builtin bswap to invoke is
2260    given in FNDECL.  Finally, if a load is involved, INS_STMT refers to one of
2261    the load statements used to construct the rhs in CUR_STMT and N->range gives
2262    the size of the rhs expression for maintaining some statistics.
2263 
2264    Note that if the replacement involves a load, CUR_STMT is moved just before
2265    INS_STMT to do the load with the same VUSE, which can lead to CUR_STMT
2266    changing basic block.  */
2267 
2268 static bool
2269 bswap_replace (gimple cur_stmt, gimple ins_stmt, tree fndecl, tree bswap_type,
2270 	       tree load_type, struct symbolic_number *n, bool bswap)
2271 {
2272   gimple_stmt_iterator gsi;
2273   tree src, tmp, tgt;
2274   gimple bswap_stmt;
2275 
2276   gsi = gsi_for_stmt (cur_stmt);
2277   src = n->src;
2278   tgt = gimple_assign_lhs (cur_stmt);
2279 
2280   /* Need to load the value from memory first.  */
2281   if (n->base_addr)
2282     {
2283       gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt);
2284       tree addr_expr, addr_tmp, val_expr, val_tmp;
2285       tree load_offset_ptr, aligned_load_type;
2286       gimple addr_stmt, load_stmt;
2287       unsigned align;
2288       HOST_WIDE_INT load_offset = 0;
2289       basic_block ins_bb, cur_bb;
2290 
2291       ins_bb = gimple_bb (ins_stmt);
2292       cur_bb = gimple_bb (cur_stmt);
2293       if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb))
2294 	return false;
2295 
2296       align = get_object_alignment (src);
2297       /* If the new access is smaller than the original one, we need
2298	 to perform a big-endian adjustment.  */
2299       if (BYTES_BIG_ENDIAN)
2300 	{
2301 	  HOST_WIDE_INT bitsize, bitpos;
2302 	  machine_mode mode;
2303 	  int unsignedp, volatilep;
2304 	  tree offset;
2305 
2306 	  get_inner_reference (src, &bitsize, &bitpos, &offset, &mode,
2307 			       &unsignedp, &volatilep, false);
2308 	  if (n->range < (unsigned HOST_WIDE_INT) bitsize)
2309 	    {
2310 	      load_offset = (bitsize - n->range) / BITS_PER_UNIT;
2311 	      unsigned HOST_WIDE_INT l
2312 		= (load_offset * BITS_PER_UNIT) & (align - 1);
2313 	      if (l)
2314 		align = l & -l;
2315 	    }
2316 	}
2317 
2318       if (bswap
2319 	  && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type))
2320 	  && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align))
2321 	return false;
2322 
2323       /* Move cur_stmt just before one of the loads of the original
2324	 expression to ensure it has the same VUSE.  See PR61517 for what
2325	 could go wrong.  */
2326       if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt))
2327 	reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt));
2328       gsi_move_before (&gsi, &gsi_ins);
2329       gsi = gsi_for_stmt (cur_stmt);
2330 
2331       /* Compute address to load from and cast according to the size
2332 	 of the load.  */
2333       addr_expr = build_fold_addr_expr (unshare_expr (src));
2334       if (is_gimple_mem_ref_addr (addr_expr))
2335 	addr_tmp = addr_expr;
2336       else
2337 	{
2338 	  addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL,
2339 					 "load_src");
2340 	  addr_stmt = gimple_build_assign (addr_tmp, addr_expr);
2341 	  gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT);
2342 	}
2343 
2344       /* Perform the load.  */
2345       aligned_load_type = load_type;
2346       if (align < TYPE_ALIGN (load_type))
2347 	aligned_load_type = build_aligned_type (load_type, align);
2348       load_offset_ptr = build_int_cst (n->alias_set, load_offset);
2349       val_expr = fold_build2 (MEM_REF, aligned_load_type, addr_tmp,
2350 			      load_offset_ptr);
2351 
2352       if (!bswap)
2353 	{
2354 	  if (n->range == 16)
2355 	    nop_stats.found_16bit++;
2356 	  else if (n->range == 32)
2357 	    nop_stats.found_32bit++;
2358 	  else
2359 	    {
2360 	      gcc_assert (n->range == 64);
2361 	      nop_stats.found_64bit++;
2362 	    }
2363 
2364 	  /* Convert the result of load if necessary.  */
2365 	  if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type))
2366 	    {
2367 	      val_tmp = make_temp_ssa_name (aligned_load_type, NULL,
2368 					    "load_dst");
2369 	      load_stmt = gimple_build_assign (val_tmp, val_expr);
2370 	      gimple_set_vuse (load_stmt, n->vuse);
2371 	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2372 	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
2373 	    }
2374 	  else
2375 	    {
2376 	      gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr);
2377 	      gimple_set_vuse (cur_stmt, n->vuse);
2378 	    }
2379 	  update_stmt (cur_stmt);
2380 
2381 	  if (dump_file)
2382 	    {
2383 	      fprintf (dump_file,
2384 		       "%d bit load in target endianness found at: ",
2385 		       (int) n->range);
2386 	      print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2387 	    }
2388 	  return true;
2389 	}
2390       else
2391 	{
2392 	  val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst");
2393 	  load_stmt = gimple_build_assign (val_tmp, val_expr);
2394 	  gimple_set_vuse (load_stmt, n->vuse);
2395 	  gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
2396 	}
2397       src = val_tmp;
2398     }
2399 
2400   if (n->range == 16)
2401     bswap_stats.found_16bit++;
2402   else if (n->range == 32)
2403     bswap_stats.found_32bit++;
2404   else
2405     {
2406       gcc_assert (n->range == 64);
2407       bswap_stats.found_64bit++;
2408     }
2409 
2410   tmp = src;
2411 
2412   /* Convert the src expression if necessary.  */
2413   if (!useless_type_conversion_p (TREE_TYPE (tmp), bswap_type))
2414     {
2415       gimple convert_stmt;
2416 
2417       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
2418       convert_stmt = gimple_build_assign (tmp, NOP_EXPR, src);
2419       gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
2420     }
2421 
2422   /* Canonical form for a 16-bit bswap is a rotate expression.  Only 16-bit
2423      values are treated this way, as rotating a 2N-bit value by N bits is
2424      generally not equivalent to a bswap.  Consider for instance 0x01020304
2425      r>> 16, which gives 0x03040102 while a bswap for that value is 0x04030201.  */
2426   if (bswap && n->range == 16)
2427     {
2428       tree count = build_int_cst (NULL, BITS_PER_UNIT);
2429       src = fold_build2 (LROTATE_EXPR, bswap_type, tmp, count);
2430       bswap_stmt = gimple_build_assign (NULL, src);
2431     }
2432   else
2433     bswap_stmt = gimple_build_call (fndecl, 1, tmp);
2434 
2435   tmp = tgt;
2436 
2437   /* Convert the result if necessary.  */
2438   if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type))
2439     {
2440       gimple convert_stmt;
2441 
2442       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
2443       convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
2444       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
2445     }
2446 
2447   gimple_set_lhs (bswap_stmt, tmp);
2448 
2449   if (dump_file)
2450     {
2451       fprintf (dump_file, "%d bit bswap implementation found at: ",
2452 	       (int) n->range);
2453       print_gimple_stmt (dump_file, cur_stmt, 0, 0);
2454     }
2455 
2456   gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT);
2457   gsi_remove (&gsi, true);
2458   return true;
2459 }
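
/* Sketch of the code emitted above for a 32-bit byte-swapped load (the
   temporary names are those passed to make_temp_ssa_name; the SSA versions
   are invented for the example):

	load_dst_1 = MEM[(uint32_t *) &src];
	bswapdst_2 = __builtin_bswap32 (load_dst_1);
	tgt = (int) bswapdst_2;

   For the 16-bit case no builtin is called; the canonical form is a rotate,
   e.g. "tgt = bswapsrc_1 r<< 8".  */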
2460 
2461 /* Find manual byte swap implementations as well as loads in a given
2462    endianness.  Byte swaps are turned into a bswap builtin invocation
2463    while endian loads are converted to a bswap builtin invocation or a
2464    simple load according to the target endianness.  */
2465 
2466 unsigned int
2467 pass_optimize_bswap::execute (function *fun)
2468 {
2469   basic_block bb;
2470   bool bswap32_p, bswap64_p;
2471   bool changed = false;
2472   tree bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
2473 
2474   if (BITS_PER_UNIT != 8)
2475     return 0;
2476 
2477   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
2478 	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
2479   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
2480 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
2481 		   || (bswap32_p && word_mode == SImode)));
2482 
2483   /* Determine the argument type of the builtins.  The code later on
2484      assumes that the return and argument type are the same.  */
2485   if (bswap32_p)
2486     {
2487       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2488       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2489     }
2490 
2491   if (bswap64_p)
2492     {
2493       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2494       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
2495     }
2496 
2497   memset (&nop_stats, 0, sizeof (nop_stats));
2498   memset (&bswap_stats, 0, sizeof (bswap_stats));
2499   calculate_dominance_info (CDI_DOMINATORS);
2500 
2501   FOR_EACH_BB_FN (bb, fun)
2502     {
2503       gimple_stmt_iterator gsi;
2504 
2505       /* We do a reverse scan for bswap patterns to make sure we get the
2506 	 widest match. As bswap pattern matching doesn't handle previously
2507 	 inserted smaller bswap replacements as sub-patterns, the wider
2508 	 variant wouldn't be detected.  */
2509       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi);)
2510         {
2511 	  gimple ins_stmt, cur_stmt = gsi_stmt (gsi);
2512 	  tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type;
2513 	  enum tree_code code;
2514 	  struct symbolic_number n;
2515 	  bool bswap;
2516 
2517 	  /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt
2518 	     might be moved to a different basic block by bswap_replace and gsi
2519	     must not point to it in that case.  Doing the gsi_prev here
2520	     makes sure that gsi points to the statement previous to
2521	     cur_stmt while still ensuring that all statements in this
2522	     basic block are considered.  */
2523 	  gsi_prev (&gsi);
2524 
2525 	  if (!is_gimple_assign (cur_stmt))
2526 	    continue;
2527 
2528 	  code = gimple_assign_rhs_code (cur_stmt);
2529 	  switch (code)
2530 	    {
2531 	    case LROTATE_EXPR:
2532 	    case RROTATE_EXPR:
2533 	      if (!tree_fits_uhwi_p (gimple_assign_rhs2 (cur_stmt))
2534 		  || tree_to_uhwi (gimple_assign_rhs2 (cur_stmt))
2535 		     % BITS_PER_UNIT)
2536 		continue;
2537 	      /* Fall through.  */
2538 	    case BIT_IOR_EXPR:
2539 	      break;
2540 	    default:
2541 	      continue;
2542 	    }
2543 
2544 	  ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap);
2545 
2546 	  if (!ins_stmt)
2547 	    continue;
2548 
2549 	  switch (n.range)
2550 	    {
2551 	    case 16:
2552 	      /* Already in canonical form, nothing to do.  */
2553 	      if (code == LROTATE_EXPR || code == RROTATE_EXPR)
2554 		continue;
2555 	      load_type = bswap_type = uint16_type_node;
2556 	      break;
2557 	    case 32:
2558 	      load_type = uint32_type_node;
2559 	      if (bswap32_p)
2560 		{
2561 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
2562 		  bswap_type = bswap32_type;
2563 		}
2564 	      break;
2565 	    case 64:
2566 	      load_type = uint64_type_node;
2567 	      if (bswap64_p)
2568 		{
2569 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
2570 		  bswap_type = bswap64_type;
2571 		}
2572 	      break;
2573 	    default:
2574 	      continue;
2575 	    }
2576 
2577 	  if (bswap && !fndecl && n.range != 16)
2578 	    continue;
2579 
2580 	  if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type,
2581 			     &n, bswap))
2582 	    changed = true;
2583 	}
2584     }
2585 
2586   statistics_counter_event (fun, "16-bit nop implementations found",
2587 			    nop_stats.found_16bit);
2588   statistics_counter_event (fun, "32-bit nop implementations found",
2589 			    nop_stats.found_32bit);
2590   statistics_counter_event (fun, "64-bit nop implementations found",
2591 			    nop_stats.found_64bit);
2592   statistics_counter_event (fun, "16-bit bswap implementations found",
2593 			    bswap_stats.found_16bit);
2594   statistics_counter_event (fun, "32-bit bswap implementations found",
2595 			    bswap_stats.found_32bit);
2596   statistics_counter_event (fun, "64-bit bswap implementations found",
2597 			    bswap_stats.found_64bit);
2598 
2599   return (changed ? TODO_update_ssa : 0);
2600 }
2601 
2602 } // anon namespace
2603 
2604 gimple_opt_pass *
2605 make_pass_optimize_bswap (gcc::context *ctxt)
2606 {
2607   return new pass_optimize_bswap (ctxt);
2608 }
2609 
2610 /* Return true if STMT is a type conversion operation that can be stripped
2611    when used in a widening multiply operation.  */
2612 static bool
2613 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2614 {
2615   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2616 
2617   if (TREE_CODE (result_type) == INTEGER_TYPE)
2618     {
2619       tree op_type;
2620       tree inner_op_type;
2621 
2622       if (!CONVERT_EXPR_CODE_P (rhs_code))
2623 	return false;
2624 
2625       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2626 
2627       /* If the type of OP has the same precision as the result, then
2628 	 we can strip this conversion.  The multiply operation will be
2629 	 selected to create the correct extension as a by-product.  */
2630       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2631 	return true;
2632 
2633       /* We can also strip a conversion if it preserves the signedness of
2634 	 the operation and doesn't narrow the range.  */
2635       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2636 
2637       /* If the inner-most type is unsigned, then we can strip any
2638 	 intermediate widening operation.  If it's signed, then the
2639 	 intermediate widening operation must also be signed.  */
2640       if ((TYPE_UNSIGNED (inner_op_type)
2641 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2642 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2643 	return true;
2644 
2645       return false;
2646     }
2647 
2648   return rhs_code == FIXED_CONVERT_EXPR;
2649 }
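
/* For instance, with RESULT_TYPE a 64-bit "long" and STMT being
   "i_1 = (int) s_2" where s_2 has 16-bit type "short", the conversion widens
   from 16 to 32 bits without changing signedness, so it is strippable and the
   eventual widening multiply can extend directly from the short operand.  */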
2650 
2651 /* Return true if RHS is a suitable operand for a widening multiplication,
2652    assuming a target type of TYPE.
2653    There are two cases:
2654 
2655      - RHS makes some value at least twice as wide.  Store that value
2656        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2657 
2658      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2659        but leave *TYPE_OUT untouched.  */
2660 
2661 static bool
2662 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2663 			tree *new_rhs_out)
2664 {
2665   gimple stmt;
2666   tree type1, rhs1;
2667 
2668   if (TREE_CODE (rhs) == SSA_NAME)
2669     {
2670       stmt = SSA_NAME_DEF_STMT (rhs);
2671       if (is_gimple_assign (stmt))
2672 	{
2673 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2674 	    rhs1 = rhs;
2675 	  else
2676 	    {
2677 	      rhs1 = gimple_assign_rhs1 (stmt);
2678 
2679 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2680 		{
2681 		  *new_rhs_out = rhs1;
2682 		  *type_out = NULL;
2683 		  return true;
2684 		}
2685 	    }
2686 	}
2687       else
2688 	rhs1 = rhs;
2689 
2690       type1 = TREE_TYPE (rhs1);
2691 
2692       if (TREE_CODE (type1) != TREE_CODE (type)
2693 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2694 	return false;
2695 
2696       *new_rhs_out = rhs1;
2697       *type_out = type1;
2698       return true;
2699     }
2700 
2701   if (TREE_CODE (rhs) == INTEGER_CST)
2702     {
2703       *new_rhs_out = rhs;
2704       *type_out = NULL;
2705       return true;
2706     }
2707 
2708   return false;
2709 }
2710 
2711 /* Return true if STMT performs a widening multiplication, assuming the
2712    output type is TYPE.  If so, store the unwidened types of the operands
2713    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2714    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2715    and *TYPE2_OUT would give the operands of the multiplication.  */
2716 
2717 static bool
2718 is_widening_mult_p (gimple stmt,
2719 		    tree *type1_out, tree *rhs1_out,
2720 		    tree *type2_out, tree *rhs2_out)
2721 {
2722   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2723 
2724   if (TREE_CODE (type) != INTEGER_TYPE
2725       && TREE_CODE (type) != FIXED_POINT_TYPE)
2726     return false;
2727 
2728   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2729 			       rhs1_out))
2730     return false;
2731 
2732   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2733 			       rhs2_out))
2734     return false;
2735 
2736   if (*type1_out == NULL)
2737     {
2738       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2739 	return false;
2740       *type1_out = *type2_out;
2741     }
2742 
2743   if (*type2_out == NULL)
2744     {
2745       if (!int_fits_type_p (*rhs2_out, *type1_out))
2746 	return false;
2747       *type2_out = *type1_out;
2748     }
2749 
2750   /* Ensure that the larger of the two operands comes first. */
2751   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2752     {
2753       tree tmp;
2754       tmp = *type1_out;
2755       *type1_out = *type2_out;
2756       *type2_out = tmp;
2757       tmp = *rhs1_out;
2758       *rhs1_out = *rhs2_out;
2759       *rhs2_out = tmp;
2760     }
2761 
2762   return true;
2763 }
2764 
2765 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2766    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2767    value is true iff we converted the statement.  */
2768 
2769 static bool
2770 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2771 {
2772   tree lhs, rhs1, rhs2, type, type1, type2;
2773   enum insn_code handler;
2774   machine_mode to_mode, from_mode, actual_mode;
2775   optab op;
2776   int actual_precision;
2777   location_t loc = gimple_location (stmt);
2778   bool from_unsigned1, from_unsigned2;
2779 
2780   lhs = gimple_assign_lhs (stmt);
2781   type = TREE_TYPE (lhs);
2782   if (TREE_CODE (type) != INTEGER_TYPE)
2783     return false;
2784 
2785   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2786     return false;
2787 
2788   to_mode = TYPE_MODE (type);
2789   from_mode = TYPE_MODE (type1);
2790   from_unsigned1 = TYPE_UNSIGNED (type1);
2791   from_unsigned2 = TYPE_UNSIGNED (type2);
2792 
2793   if (from_unsigned1 && from_unsigned2)
2794     op = umul_widen_optab;
2795   else if (!from_unsigned1 && !from_unsigned2)
2796     op = smul_widen_optab;
2797   else
2798     op = usmul_widen_optab;
2799 
2800   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2801 						  0, &actual_mode);
2802 
2803   if (handler == CODE_FOR_nothing)
2804     {
2805       if (op != smul_widen_optab)
2806 	{
2807 	  /* We can use a signed multiply with unsigned types as long as
2808 	     there is a wider mode to use, or it is the smaller of the two
2809 	     types that is unsigned.  Note that type1 >= type2, always.  */
2810 	  if ((TYPE_UNSIGNED (type1)
2811 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2812 	      || (TYPE_UNSIGNED (type2)
2813 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2814 	    {
2815 	      from_mode = GET_MODE_WIDER_MODE (from_mode);
2816 	      if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2817 		return false;
2818 	    }
2819 
2820 	  op = smul_widen_optab;
2821 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2822 							  from_mode, 0,
2823 							  &actual_mode);
2824 
2825 	  if (handler == CODE_FOR_nothing)
2826 	    return false;
2827 
2828 	  from_unsigned1 = from_unsigned2 = false;
2829 	}
2830       else
2831 	return false;
2832     }
2833 
2834   /* Ensure that the inputs to the handler are in the correct precision
2835      for the opcode.  This will be the full mode size.  */
2836   actual_precision = GET_MODE_PRECISION (actual_mode);
2837   if (2 * actual_precision > TYPE_PRECISION (type))
2838     return false;
2839   if (actual_precision != TYPE_PRECISION (type1)
2840       || from_unsigned1 != TYPE_UNSIGNED (type1))
2841     rhs1 = build_and_insert_cast (gsi, loc,
2842 				  build_nonstandard_integer_type
2843 				    (actual_precision, from_unsigned1), rhs1);
2844   if (actual_precision != TYPE_PRECISION (type2)
2845       || from_unsigned2 != TYPE_UNSIGNED (type2))
2846     rhs2 = build_and_insert_cast (gsi, loc,
2847 				  build_nonstandard_integer_type
2848 				    (actual_precision, from_unsigned2), rhs2);
2849 
2850   /* Handle constants.  */
2851   if (TREE_CODE (rhs1) == INTEGER_CST)
2852     rhs1 = fold_convert (type1, rhs1);
2853   if (TREE_CODE (rhs2) == INTEGER_CST)
2854     rhs2 = fold_convert (type2, rhs2);
2855 
2856   gimple_assign_set_rhs1 (stmt, rhs1);
2857   gimple_assign_set_rhs2 (stmt, rhs2);
2858   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2859   update_stmt (stmt);
2860   widen_mul_stats.widen_mults_inserted++;
2861   return true;
2862 }
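
/* Source-level sketch of the transformation (assuming the target provides a
   signed 32x32->64 widening multiply, e.g. a mulsidi3 pattern):

	long f (int a, int b) { return (long) a * (long) b; }

   has the MULT_EXPR of the two sign-extended operands rewritten into
   "a w* b", i.e. a WIDEN_MULT_EXPR taking the narrow operands directly.  */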
2863 
2864 /* Process a single gimple statement STMT, which is found at the
2865    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2866    rhs (given by CODE), and try to convert it into a
2867    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2868    is true iff we converted the statement.  */
2869 
2870 static bool
2871 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2872 			    enum tree_code code)
2873 {
2874   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2875   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2876   tree type, type1, type2, optype;
2877   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2878   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2879   optab this_optab;
2880   enum tree_code wmult_code;
2881   enum insn_code handler;
2882   machine_mode to_mode, from_mode, actual_mode;
2883   location_t loc = gimple_location (stmt);
2884   int actual_precision;
2885   bool from_unsigned1, from_unsigned2;
2886 
2887   lhs = gimple_assign_lhs (stmt);
2888   type = TREE_TYPE (lhs);
2889   if (TREE_CODE (type) != INTEGER_TYPE
2890       && TREE_CODE (type) != FIXED_POINT_TYPE)
2891     return false;
2892 
2893   if (code == MINUS_EXPR)
2894     wmult_code = WIDEN_MULT_MINUS_EXPR;
2895   else
2896     wmult_code = WIDEN_MULT_PLUS_EXPR;
2897 
2898   rhs1 = gimple_assign_rhs1 (stmt);
2899   rhs2 = gimple_assign_rhs2 (stmt);
2900 
2901   if (TREE_CODE (rhs1) == SSA_NAME)
2902     {
2903       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2904       if (is_gimple_assign (rhs1_stmt))
2905 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2906     }
2907 
2908   if (TREE_CODE (rhs2) == SSA_NAME)
2909     {
2910       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2911       if (is_gimple_assign (rhs2_stmt))
2912 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2913     }
2914 
2915   /* Allow for one conversion statement between the multiply
2916      and addition/subtraction statement.  If there is more than
2917      one conversion then we assume it would invalidate this
2918      transformation.  If that's not the case then it should have
2919      been folded before now.  */
2920   if (CONVERT_EXPR_CODE_P (rhs1_code))
2921     {
2922       conv1_stmt = rhs1_stmt;
2923       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2924       if (TREE_CODE (rhs1) == SSA_NAME)
2925 	{
2926 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2927 	  if (is_gimple_assign (rhs1_stmt))
2928 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2929 	}
2930       else
2931 	return false;
2932     }
2933   if (CONVERT_EXPR_CODE_P (rhs2_code))
2934     {
2935       conv2_stmt = rhs2_stmt;
2936       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2937       if (TREE_CODE (rhs2) == SSA_NAME)
2938 	{
2939 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2940 	  if (is_gimple_assign (rhs2_stmt))
2941 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2942 	}
2943       else
2944 	return false;
2945     }
2946 
2947   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2948      is_widening_mult_p, but we still need the rhs values it returns.
2949 
2950      It might also appear that it would be sufficient to use the existing
2951      operands of the widening multiply, but that would limit the choice of
2952      multiply-and-accumulate instructions.
2953 
2954      If the widened-multiplication result has more than one use, it is
2955      probably wiser not to do the conversion.  */
2956   if (code == PLUS_EXPR
2957       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2958     {
2959       if (!has_single_use (rhs1)
2960 	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2961 				  &type2, &mult_rhs2))
2962 	return false;
2963       add_rhs = rhs2;
2964       conv_stmt = conv1_stmt;
2965     }
2966   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2967     {
2968       if (!has_single_use (rhs2)
2969 	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2970 				  &type2, &mult_rhs2))
2971 	return false;
2972       add_rhs = rhs1;
2973       conv_stmt = conv2_stmt;
2974     }
2975   else
2976     return false;
2977 
2978   to_mode = TYPE_MODE (type);
2979   from_mode = TYPE_MODE (type1);
2980   from_unsigned1 = TYPE_UNSIGNED (type1);
2981   from_unsigned2 = TYPE_UNSIGNED (type2);
2982   optype = type1;
2983 
2984   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2985   if (from_unsigned1 != from_unsigned2)
2986     {
2987       if (!INTEGRAL_TYPE_P (type))
2988 	return false;
2989       /* We can use a signed multiply with unsigned types as long as
2990 	 there is a wider mode to use, or it is the smaller of the two
2991 	 types that is unsigned.  Note that type1 >= type2, always.  */
2992       if ((from_unsigned1
2993 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2994 	  || (from_unsigned2
2995 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2996 	{
2997 	  from_mode = GET_MODE_WIDER_MODE (from_mode);
2998 	  if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2999 	    return false;
3000 	}
3001 
3002       from_unsigned1 = from_unsigned2 = false;
3003       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
3004 					       false);
3005     }
3006 
3007   /* If there was a conversion between the multiply and addition
3008      then we need to make sure it fits a multiply-and-accumulate.
3009      There should be a single mode change which does not change the
3010      value.  */
3011   if (conv_stmt)
3012     {
3013       /* We use the original, unmodified data types for this.  */
3014       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
3015       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
3016       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
3017       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
3018 
3019       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
3020 	{
3021 	  /* Conversion is a truncate.  */
3022 	  if (TYPE_PRECISION (to_type) < data_size)
3023 	    return false;
3024 	}
3025       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
3026 	{
3027 	  /* Conversion is an extend.  Check it's the right sort.  */
3028 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
3029 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
3030 	    return false;
3031 	}
3032       /* else convert is a no-op for our purposes.  */
3033     }
3034 
3035   /* Verify that the machine can perform a widening multiply
3036      accumulate in this mode/signedness combination, otherwise
3037      this transformation is likely to pessimize code.  */
3038   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
3039   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
3040 						  from_mode, 0, &actual_mode);
3041 
3042   if (handler == CODE_FOR_nothing)
3043     return false;
3044 
3045   /* Ensure that the inputs to the handler are in the correct precision
3046      for the opcode.  This will be the full mode size.  */
3047   actual_precision = GET_MODE_PRECISION (actual_mode);
3048   if (actual_precision != TYPE_PRECISION (type1)
3049       || from_unsigned1 != TYPE_UNSIGNED (type1))
3050     mult_rhs1 = build_and_insert_cast (gsi, loc,
3051 				       build_nonstandard_integer_type
3052 				         (actual_precision, from_unsigned1),
3053 				       mult_rhs1);
3054   if (actual_precision != TYPE_PRECISION (type2)
3055       || from_unsigned2 != TYPE_UNSIGNED (type2))
3056     mult_rhs2 = build_and_insert_cast (gsi, loc,
3057 				       build_nonstandard_integer_type
3058 					 (actual_precision, from_unsigned2),
3059 				       mult_rhs2);
3060 
3061   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
3062     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
3063 
3064   /* Handle constants.  */
3065   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
3066     mult_rhs1 = fold_convert (type1, mult_rhs1);
3067   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
3068     mult_rhs2 = fold_convert (type2, mult_rhs2);
3069 
3070   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
3071 				  add_rhs);
3072   update_stmt (gsi_stmt (*gsi));
3073   widen_mul_stats.maccs_inserted++;
3074   return true;
3075 }
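
/* Source-level sketch (assuming the target has a multiply-and-accumulate
   pattern for this mode/signedness combination): with 32-bit A and B,

	acc = acc + (long) a * (long) b;

   is rewritten into "acc_1 = WIDEN_MULT_PLUS_EXPR <a, b, acc_2>"; the
   MINUS_EXPR form becomes a WIDEN_MULT_MINUS_EXPR correspondingly.  */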
3076 
3077 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3078    with uses in additions and subtractions to form fused multiply-add
3079    operations.  Returns true if successful and MUL_STMT should be removed.  */
3080 
3081 static bool
3082 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
3083 {
3084   tree mul_result = gimple_get_lhs (mul_stmt);
3085   tree type = TREE_TYPE (mul_result);
3086   gimple use_stmt, neguse_stmt;
3087   gassign *fma_stmt;
3088   use_operand_p use_p;
3089   imm_use_iterator imm_iter;
3090 
3091   if (FLOAT_TYPE_P (type)
3092       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3093     return false;
3094 
3095   /* We don't want to do bitfield reduction ops.  */
3096   if (INTEGRAL_TYPE_P (type)
3097       && (TYPE_PRECISION (type)
3098 	  != GET_MODE_PRECISION (TYPE_MODE (type))))
3099     return false;
3100 
3101   /* If the target doesn't support it, don't generate it.  We assume that
3102      if fma isn't available then fms, fnma or fnms are not either.  */
3103   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
3104     return false;
3105 
3106   /* If the multiplication has zero uses, it is kept around probably because
3107      of -fnon-call-exceptions.  Don't optimize it away in that case,
3108      it is DCE's job.  */
3109   if (has_zero_uses (mul_result))
3110     return false;
3111 
3112   /* Make sure that the multiplication statement becomes dead after
3113      the transformation, i.e. that all uses are transformed to FMAs.
3114      This means we assume that an FMA operation has the same cost
3115      as an addition.  */
3116   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3117     {
3118       enum tree_code use_code;
3119       tree result = mul_result;
3120       bool negate_p = false;
3121 
3122       use_stmt = USE_STMT (use_p);
3123 
3124       if (is_gimple_debug (use_stmt))
3125 	continue;
3126 
3127       /* For now restrict this operation to single basic blocks.  In theory
3128 	 we would want to support sinking the multiplication in
3129 	 m = a*b;
3130 	 if ()
3131 	   ma = m + c;
3132 	 else
3133 	   d = m;
3134 	 to form a fma in the then block and sink the multiplication to the
3135 	 else block.  */
3136       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3137 	return false;
3138 
3139       if (!is_gimple_assign (use_stmt))
3140 	return false;
3141 
3142       use_code = gimple_assign_rhs_code (use_stmt);
3143 
3144       /* A negate on the multiplication leads to FNMA.  */
3145       if (use_code == NEGATE_EXPR)
3146 	{
3147 	  ssa_op_iter iter;
3148 	  use_operand_p usep;
3149 
3150 	  result = gimple_assign_lhs (use_stmt);
3151 
3152 	  /* Make sure the negate statement becomes dead with this
3153 	     single transformation.  */
3154 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
3155 			       &use_p, &neguse_stmt))
3156 	    return false;
3157 
3158 	  /* Make sure the multiplication isn't also used on that stmt.  */
3159 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3160 	    if (USE_FROM_PTR (usep) == mul_result)
3161 	      return false;
3162 
3163 	  /* Re-validate.  */
3164 	  use_stmt = neguse_stmt;
3165 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3166 	    return false;
3167 	  if (!is_gimple_assign (use_stmt))
3168 	    return false;
3169 
3170 	  use_code = gimple_assign_rhs_code (use_stmt);
3171 	  negate_p = true;
3172 	}
3173 
3174       switch (use_code)
3175 	{
3176 	case MINUS_EXPR:
3177 	  if (gimple_assign_rhs2 (use_stmt) == result)
3178 	    negate_p = !negate_p;
3179 	  break;
3180 	case PLUS_EXPR:
3181 	  break;
3182 	default:
3183 	  /* FMA can only be formed from PLUS and MINUS.  */
3184 	  return false;
3185 	}
3186 
3187       /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
3188 	 by a MULT_EXPR that we'll visit later, we might be able to
3189 	 get a more profitable match with fnma.
3190 	 OTOH, if we don't, a negate / fma pair has likely lower latency
3191	 than a mult / subtract pair.  */
3192       if (use_code == MINUS_EXPR && !negate_p
3193 	  && gimple_assign_rhs1 (use_stmt) == result
3194 	  && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
3195 	  && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing)
3196 	{
3197 	  tree rhs2 = gimple_assign_rhs2 (use_stmt);
3198 
3199 	  if (TREE_CODE (rhs2) == SSA_NAME)
3200 	    {
3201 	      gimple stmt2 = SSA_NAME_DEF_STMT (rhs2);
3202 	      if (has_single_use (rhs2)
3203 		  && is_gimple_assign (stmt2)
3204 		  && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3205 		return false;
3206 	    }
3207 	}
3208 
3209       /* We can't handle a * b + a * b.  */
3210       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
3211 	return false;
3212 
3213       /* While it is possible to validate whether or not the exact form
3214 	 that we've recognized is available in the backend, the assumption
3215 	 is that the transformation is never a loss.  For instance, suppose
3216 	 the target only has the plain FMA pattern available.  Consider
3217 	 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
3218 	 is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
3219 	 still have 3 operations, but in the FMA form the two NEGs are
3220 	 independent and could be run in parallel.  */
3221     }
3222 
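  /* All uses passed the checks above, so the transformation is
     worthwhile; now perform the actual replacement on each use.  */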
  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      enum tree_code use_code;
      tree addop, mulop1 = op1, result = mul_result;
      bool negate_p = false;

      if (is_gimple_debug (use_stmt))
	continue;

      use_code = gimple_assign_rhs_code (use_stmt);
      if (use_code == NEGATE_EXPR)
	{
	  result = gimple_assign_lhs (use_stmt);
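	  /* The analysis loop above verified that the negate has exactly
	     one use, so single_imm_use cannot fail here and the negate
	     statement can simply be removed.  */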
	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
	  gsi_remove (&gsi, true);
	  release_defs (use_stmt);

	  use_stmt = neguse_stmt;
	  gsi = gsi_for_stmt (use_stmt);
	  use_code = gimple_assign_rhs_code (use_stmt);
	  negate_p = true;
	}

      if (gimple_assign_rhs1 (use_stmt) == result)
	{
	  addop = gimple_assign_rhs2 (use_stmt);
	  /* a * b - c -> a * b + (-c)  */
	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
	    addop = force_gimple_operand_gsi (&gsi,
					      build1 (NEGATE_EXPR,
						      type, addop),
					      true, NULL_TREE, true,
					      GSI_SAME_STMT);
	}
      else
	{
	  addop = gimple_assign_rhs1 (use_stmt);
	  /* a - b * c -> (-b) * c + a */
	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
	    negate_p = !negate_p;
	}

      if (negate_p)
	mulop1 = force_gimple_operand_gsi (&gsi,
					   build1 (NEGATE_EXPR,
						   type, mulop1),
					   true, NULL_TREE, true,
					   GSI_SAME_STMT);

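      /* Build lhs = FMA <mulop1, op2, addop>, which computes
	 mulop1 * op2 + addop.  */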
      fma_stmt = gimple_build_assign (gimple_assign_lhs (use_stmt),
				      FMA_EXPR, mulop1, op2, addop);
      gsi_replace (&gsi, fma_stmt, true);
      widen_mul_stats.fmas_inserted++;
    }

  return true;
}

/* Find integer multiplications where the operands are extended from
   smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
   where appropriate.  */
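/* For example, given

     short a, b;
     long c;
     c = (long) a * (long) b;

   the multiplication can be carried out by a single widening multiply
   instruction when the target provides one for these modes.  */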

namespace {

const pass_data pass_data_optimize_widening_mul =
{
  GIMPLE_PASS, /* type */
  "widening_mul", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_NONE, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_optimize_widening_mul : public gimple_opt_pass
{
public:
  pass_optimize_widening_mul (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_expensive_optimizations && optimize;
    }

  virtual unsigned int execute (function *);

}; // class pass_optimize_widening_mul

unsigned int
pass_optimize_widening_mul::execute (function *fun)
{
  basic_block bb;
  bool cfg_changed = false;

  memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));

  FOR_EACH_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;

      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
        {
	  gimple stmt = gsi_stmt (gsi);
	  enum tree_code code;

	  if (is_gimple_assign (stmt))
	    {
	      code = gimple_assign_rhs_code (stmt);
	      switch (code)
		{
		case MULT_EXPR:
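		  /* Try a widening multiply first; failing that, if every
		     use of the product can absorb it into an FMA, the
		     multiplication itself becomes dead and is removed.  */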
		  if (!convert_mult_to_widen (stmt, &gsi)
		      && convert_mult_to_fma (stmt,
					      gimple_assign_rhs1 (stmt),
					      gimple_assign_rhs2 (stmt)))
		    {
		      gsi_remove (&gsi, true);
		      release_defs (stmt);
		      continue;
		    }
		  break;

		case PLUS_EXPR:
		case MINUS_EXPR:
		  convert_plusminus_to_widen (&gsi, stmt, code);
		  break;

		default:;
		}
	    }
	  else if (is_gimple_call (stmt)
		   && gimple_call_lhs (stmt))
	    {
	      tree fndecl = gimple_call_fndecl (stmt);
	      if (fndecl
		  && gimple_call_builtin_p (stmt, BUILT_IN_NORMAL))
		{
		  switch (DECL_FUNCTION_CODE (fndecl))
		    {
		      case BUILT_IN_POWF:
		      case BUILT_IN_POW:
		      case BUILT_IN_POWL:
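			/* pow (x, 2.0) is just x * x, so it can seed an
			   FMA exactly like an explicit multiplication.  */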
			if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
			    && REAL_VALUES_EQUAL
			         (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
				  dconst2)
			    && convert_mult_to_fma (stmt,
						    gimple_call_arg (stmt, 0),
						    gimple_call_arg (stmt, 0)))
			  {
			    unlink_stmt_vdef (stmt);
			    if (gsi_remove (&gsi, true)
				&& gimple_purge_dead_eh_edges (bb))
			      cfg_changed = true;
			    release_defs (stmt);
			    continue;
			  }
			break;

		      default:;
		    }
		}
	    }
	  gsi_next (&gsi);
	}
    }

  statistics_counter_event (fun, "widening multiplications inserted",
			    widen_mul_stats.widen_mults_inserted);
  statistics_counter_event (fun, "widening maccs inserted",
			    widen_mul_stats.maccs_inserted);
  statistics_counter_event (fun, "fused multiply-adds inserted",
			    widen_mul_stats.fmas_inserted);

  return cfg_changed ? TODO_cleanup_cfg : 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_optimize_widening_mul (gcc::context *ctxt)
{
  return new pass_optimize_widening_mul (ctxt);
}