xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-ssa-math-opts.c (revision a24efa7dea9f1f56c3bdb15a927d3516792ace1c)
1 /* Global, SSA-based optimizations using mathematical identities.
2    Copyright (C) 2005-2013 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 /* The first mini-pass in this file tries to CSE reciprocal
21    operations.  These are common in sequences such as this one:
22 
23 	modulus = sqrt(x*x + y*y + z*z);
24 	x = x / modulus;
25 	y = y / modulus;
26 	z = z / modulus;
27 
28    that can be optimized to
29 
30 	modulus = sqrt(x*x + y*y + z*z);
31         rmodulus = 1.0 / modulus;
32 	x = x * rmodulus;
33 	y = y * rmodulus;
34 	z = z * rmodulus;
35 
36    We do this for loop invariant divisors, and with this pass whenever
37    we notice that a division has the same divisor multiple times.
38 
39    Of course, like in PRE, we don't insert a division if a dominator
40    already has one.  However, this cannot be done as an extension of
41    PRE for several reasons.
42 
43    First of all, experiments have shown that the transformation is not
44    always useful if there are only two divisions by the same divisor.
45    This is probably because modern processors can pipeline the
46    divisions; on older, in-order processors it should still be
47    effective to optimize two divisions by the same number.
48    We make this a param, and it shall be called N in the remainder of
49    this comment.
50 
51    Second, if trapping math is active, we have less freedom in where
52    to insert divisions: we can only do so in basic blocks that already
53    contain one.  (If divisions don't trap, we can instead insert
54    divisions elsewhere, namely in blocks that are common dominators
55    of those that contain the division.)
56 
57    We really don't want to compute the reciprocal unless a division will
58    be found.  To do this, we won't insert the reciprocal in a basic block
59    that has fewer than N divisions *post-dominating* it.
60 
61    The algorithm constructs a subset of the dominator tree, holding the
62    blocks containing the divisions and their common dominators,
63    and walks it twice.  The first walk is in post-order, and it annotates
64    each block with the number of divisions that post-dominate it: this
65    gives information on where divisions can be inserted profitably.
66    The second walk is in pre-order, and it inserts divisions as explained
67    above, and replaces divisions by multiplications.
68 
69    In the best case, the cost of the pass is O(n_statements).  In the
70    worst-case, the cost is due to creating the dominator tree subset,
71    with a cost of O(n_basic_blocks ^ 2); however this can only happen
72    for n_statements / n_basic_blocks statements.  So, the amortized cost
73    of creating the dominator tree subset is O(n_basic_blocks) and the
74    worst-case cost of the pass is O(n_statements * n_basic_blocks).
75 
76    More practically, the cost will be small because there are few
77    divisions, and they tend to be in the same basic block, so insert_bb
78    is called very few times.
79 
80    If we did this using domwalk.c, an efficient implementation would have
81    to work on all the variables in a single pass, because we could not
82    work on just a subset of the dominator tree, as we do now, and the
83    cost would also be something like O(n_statements * n_basic_blocks).
84    The data structures would be more complex in order to work on all the
85    variables in a single pass.  */
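
/* As a hedged illustration of the placement rule above (not part of the
   pass itself), assume a target whose min_divisions_for_recip_mul
   threshold N is 2 and that -freciprocal-math is in effect:

	a = x / d;		(block B1)
	if (cond)
	  ...
	b = y / d;		(join block B2, which post-dominates B1)

   B1 is credited with both divisions because B2 post-dominates it, so a
   single "recip = 1.0 / d" is inserted in B1 right before the first
   division and both divisions become multiplications by recip.  Had the
   second division been in only one arm of the "if", it would not
   post-dominate B1 and no reciprocal would be hoisted into B1.  */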
86 
87 #include "config.h"
88 #include "system.h"
89 #include "coretypes.h"
90 #include "tm.h"
91 #include "flags.h"
92 #include "tree.h"
93 #include "tree-flow.h"
94 #include "tree-pass.h"
95 #include "alloc-pool.h"
96 #include "basic-block.h"
97 #include "target.h"
98 #include "gimple-pretty-print.h"
99 
100 /* FIXME: RTL headers have to be included here for optabs.  */
101 #include "rtl.h"		/* Because optabs.h wants enum rtx_code.  */
102 #include "expr.h"		/* Because optabs.h wants sepops.  */
103 #include "optabs.h"
104 
105 /* This structure represents one basic block that either computes a
106    division, or is a common dominator for basic blocks that compute a
107    division.  */
108 struct occurrence {
109   /* The basic block represented by this structure.  */
110   basic_block bb;
111 
112   /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
113      inserted in BB.  */
114   tree recip_def;
115 
116   /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
117      was inserted in BB.  */
118   gimple recip_def_stmt;
119 
120   /* Pointer to a list of "struct occurrence"s for blocks dominated
121      by BB.  */
122   struct occurrence *children;
123 
124   /* Pointer to the next "struct occurrence" in the list of blocks
125      sharing a common dominator.  */
126   struct occurrence *next;
127 
128   /* The number of divisions that are in BB before compute_merit.  The
129      number of divisions that are in BB or post-dominate it after
130      compute_merit.  */
131   int num_divisions;
132 
133   /* True if the basic block has a division, false if it is a common
134      dominator for basic blocks that do.  If it is false and trapping
135      math is active, BB is not a candidate for inserting a reciprocal.  */
136   bool bb_has_division;
137 };
138 
139 static struct
140 {
141   /* Number of 1.0/X ops inserted.  */
142   int rdivs_inserted;
143 
144   /* Number of 1.0/FUNC ops inserted.  */
145   int rfuncs_inserted;
146 } reciprocal_stats;
147 
148 static struct
149 {
150   /* Number of cexpi calls inserted.  */
151   int inserted;
152 } sincos_stats;
153 
154 static struct
155 {
156   /* Number of hand-written 16-bit bswaps found.  */
157   int found_16bit;
158 
159   /* Number of hand-written 32-bit bswaps found.  */
160   int found_32bit;
161 
162   /* Number of hand-written 64-bit bswaps found.  */
163   int found_64bit;
164 } bswap_stats;
165 
166 static struct
167 {
168   /* Number of widening multiplication ops inserted.  */
169   int widen_mults_inserted;
170 
171   /* Number of integer multiply-and-accumulate ops inserted.  */
172   int maccs_inserted;
173 
174   /* Number of fp fused multiply-add ops inserted.  */
175   int fmas_inserted;
176 } widen_mul_stats;
177 
178 /* The instance of "struct occurrence" representing the highest
179    interesting block in the dominator tree.  */
180 static struct occurrence *occ_head;
181 
182 /* Allocation pool for getting instances of "struct occurrence".  */
183 static alloc_pool occ_pool;
184 
185 
186 
187 /* Allocate and return a new struct occurrence for basic block BB,
188    whose children list is headed by CHILDREN.  */
189 static struct occurrence *
190 occ_new (basic_block bb, struct occurrence *children)
191 {
192   struct occurrence *occ;
193 
194   bb->aux = occ = (struct occurrence *) pool_alloc (occ_pool);
195   memset (occ, 0, sizeof (struct occurrence));
196 
197   occ->bb = bb;
198   occ->children = children;
199   return occ;
200 }
201 
202 
203 /* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
204    list of "struct occurrence"s, one per basic block, having IDOM as
205    their common dominator.
206 
207    We try to insert NEW_OCC as deep as possible in the tree, and we also
208    insert any other block that is a common dominator for BB and one
209    block already in the tree.  */
210 
211 static void
212 insert_bb (struct occurrence *new_occ, basic_block idom,
213 	   struct occurrence **p_head)
214 {
215   struct occurrence *occ, **p_occ;
216 
217   for (p_occ = p_head; (occ = *p_occ) != NULL; )
218     {
219       basic_block bb = new_occ->bb, occ_bb = occ->bb;
220       basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
221       if (dom == bb)
222 	{
223 	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
224 	     from its list.  */
225 	  *p_occ = occ->next;
226 	  occ->next = new_occ->children;
227 	  new_occ->children = occ;
228 
229 	  /* Try the next block (it may as well be dominated by BB).  */
230 	}
231 
232       else if (dom == occ_bb)
233 	{
234 	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
235 	  insert_bb (new_occ, dom, &occ->children);
236 	  return;
237 	}
238 
239       else if (dom != idom)
240 	{
241 	  gcc_assert (!dom->aux);
242 
243 	  /* There is a dominator between IDOM and BB, add it and make
244 	     two children out of NEW_OCC and OCC.  First, remove OCC from
245 	     its list.	*/
246 	  *p_occ = occ->next;
247 	  new_occ->next = occ;
248 	  occ->next = NULL;
249 
250 	  /* None of the previous blocks has DOM as a dominator: if we tail
251 	     recursed, we would reexamine them uselessly. Just switch BB with
252 	     DOM, and go on looking for blocks dominated by DOM.  */
253           new_occ = occ_new (dom, new_occ);
254 	}
255 
256       else
257 	{
258 	  /* Nothing special, go on with the next element.  */
259 	  p_occ = &occ->next;
260 	}
261     }
262 
263   /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
264   new_occ->next = *p_head;
265   *p_head = new_occ;
266 }
267 
268 /* Register that we found a division in BB.  */
269 
270 static inline void
271 register_division_in (basic_block bb)
272 {
273   struct occurrence *occ;
274 
275   occ = (struct occurrence *) bb->aux;
276   if (!occ)
277     {
278       occ = occ_new (bb, NULL);
279       insert_bb (occ, ENTRY_BLOCK_PTR, &occ_head);
280     }
281 
282   occ->bb_has_division = true;
283   occ->num_divisions++;
284 }
285 
286 
287 /* Compute the number of divisions that postdominate each block in OCC and
288    its children.  */
289 
290 static void
291 compute_merit (struct occurrence *occ)
292 {
293   struct occurrence *occ_child;
294   basic_block dom = occ->bb;
295 
296   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
297     {
298       basic_block bb;
299       if (occ_child->children)
300         compute_merit (occ_child);
301 
302       if (flag_exceptions)
303 	bb = single_noncomplex_succ (dom);
304       else
305 	bb = dom;
306 
307       if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
308         occ->num_divisions += occ_child->num_divisions;
309     }
310 }
311 
312 
313 /* Return whether USE_STMT is a floating-point division by DEF.  */
314 static inline bool
315 is_division_by (gimple use_stmt, tree def)
316 {
317   return is_gimple_assign (use_stmt)
318 	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
319 	 && gimple_assign_rhs2 (use_stmt) == def
320 	 /* Do not recognize x / x as a valid division, as we would get
321 	    confused later by replacing all immediate uses of x in such
322 	    a stmt.  */
323 	 && gimple_assign_rhs1 (use_stmt) != def;
324 }
325 
326 /* Walk the subset of the dominator tree rooted at OCC, setting the
327    RECIP_DEF field to a definition of 1.0 / DEF that can be used in
328    the given basic block.  The field may be left NULL, of course,
329    if it is not possible or profitable to do the optimization.
330 
331    DEF_GSI is an iterator pointing at the statement defining DEF.
332    If RECIP_DEF is set, a dominator already has a computation that can
333    be used.  */
334 
335 static void
336 insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
337 		    tree def, tree recip_def, int threshold)
338 {
339   tree type;
340   gimple new_stmt;
341   gimple_stmt_iterator gsi;
342   struct occurrence *occ_child;
343 
344   if (!recip_def
345       && (occ->bb_has_division || !flag_trapping_math)
346       && occ->num_divisions >= threshold)
347     {
348       /* Make a variable with the replacement and substitute it.  */
349       type = TREE_TYPE (def);
350       recip_def = create_tmp_reg (type, "reciptmp");
351       new_stmt = gimple_build_assign_with_ops (RDIV_EXPR, recip_def,
352 					       build_one_cst (type), def);
353 
354       if (occ->bb_has_division)
355         {
356           /* Case 1: insert before an existing division.  */
357           gsi = gsi_after_labels (occ->bb);
358           while (!gsi_end_p (gsi) && !is_division_by (gsi_stmt (gsi), def))
359 	    gsi_next (&gsi);
360 
361           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
362         }
363       else if (def_gsi && occ->bb == def_gsi->bb)
364         {
365           /* Case 2: insert right after the definition.  Note that this will
366 	     never happen if the definition statement can throw, because in
367 	     that case the sole successor of the statement's basic block will
368 	     dominate all the uses as well.  */
369           gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
370         }
371       else
372         {
373           /* Case 3: insert in a basic block not containing defs/uses.  */
374           gsi = gsi_after_labels (occ->bb);
375           gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
376         }
377 
378       reciprocal_stats.rdivs_inserted++;
379 
380       occ->recip_def_stmt = new_stmt;
381     }
382 
383   occ->recip_def = recip_def;
384   for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
385     insert_reciprocals (def_gsi, occ_child, def, recip_def, threshold);
386 }
387 
388 
389 /* Replace the division at USE_P with a multiplication by the reciprocal, if
390    possible.  */
391 
392 static inline void
393 replace_reciprocal (use_operand_p use_p)
394 {
395   gimple use_stmt = USE_STMT (use_p);
396   basic_block bb = gimple_bb (use_stmt);
397   struct occurrence *occ = (struct occurrence *) bb->aux;
398 
399   if (optimize_bb_for_speed_p (bb)
400       && occ->recip_def && use_stmt != occ->recip_def_stmt)
401     {
402       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
403       gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
404       SET_USE (use_p, occ->recip_def);
405       fold_stmt_inplace (&gsi);
406       update_stmt (use_stmt);
407     }
408 }
409 
410 
411 /* Free OCC and return one more "struct occurrence" to be freed.  */
412 
413 static struct occurrence *
414 free_bb (struct occurrence *occ)
415 {
416   struct occurrence *child, *next;
417 
418   /* First get the two pointers hanging off OCC.  */
419   next = occ->next;
420   child = occ->children;
421   occ->bb->aux = NULL;
422   pool_free (occ_pool, occ);
423 
424   /* Now ensure that we don't recurse unless it is necessary.  */
425   if (!child)
426     return next;
427   else
428     {
429       while (next)
430 	next = free_bb (next);
431 
432       return child;
433     }
434 }
435 
436 
437 /* Look for floating-point divisions among DEF's uses, and try to
438    replace them by multiplications with the reciprocal.  Add
439    as many statements computing the reciprocal as needed.
440 
441    DEF must be a GIMPLE register of a floating-point type.  */
442 
443 static void
444 execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
445 {
446   use_operand_p use_p;
447   imm_use_iterator use_iter;
448   struct occurrence *occ;
449   int count = 0, threshold;
450 
451   gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && is_gimple_reg (def));
452 
453   FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
454     {
455       gimple use_stmt = USE_STMT (use_p);
456       if (is_division_by (use_stmt, def))
457 	{
458 	  register_division_in (gimple_bb (use_stmt));
459 	  count++;
460 	}
461     }
462 
463   /* Do the expensive part only if we can hope to optimize something.  */
464   threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));
465   if (count >= threshold)
466     {
467       gimple use_stmt;
468       for (occ = occ_head; occ; occ = occ->next)
469 	{
470 	  compute_merit (occ);
471 	  insert_reciprocals (def_gsi, occ, def, NULL, threshold);
472 	}
473 
474       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
475 	{
476 	  if (is_division_by (use_stmt, def))
477 	    {
478 	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
479 		replace_reciprocal (use_p);
480 	    }
481 	}
482     }
483 
484   for (occ = occ_head; occ; )
485     occ = free_bb (occ);
486 
487   occ_head = NULL;
488 }
489 
490 static bool
491 gate_cse_reciprocals (void)
492 {
493   return optimize && flag_reciprocal_math;
494 }
495 
496 /* Go through all the floating-point SSA_NAMEs, and call
497    execute_cse_reciprocals_1 on each of them.  */
498 static unsigned int
499 execute_cse_reciprocals (void)
500 {
501   basic_block bb;
502   tree arg;
503 
504   occ_pool = create_alloc_pool ("dominators for recip",
505 				sizeof (struct occurrence),
506 				n_basic_blocks / 3 + 1);
507 
508   memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
509   calculate_dominance_info (CDI_DOMINATORS);
510   calculate_dominance_info (CDI_POST_DOMINATORS);
511 
512 #ifdef ENABLE_CHECKING
513   FOR_EACH_BB (bb)
514     gcc_assert (!bb->aux);
515 #endif
516 
517   for (arg = DECL_ARGUMENTS (cfun->decl); arg; arg = DECL_CHAIN (arg))
518     if (FLOAT_TYPE_P (TREE_TYPE (arg))
519 	&& is_gimple_reg (arg))
520       {
521 	tree name = ssa_default_def (cfun, arg);
522 	if (name)
523 	  execute_cse_reciprocals_1 (NULL, name);
524       }
525 
526   FOR_EACH_BB (bb)
527     {
528       gimple_stmt_iterator gsi;
529       gimple phi;
530       tree def;
531 
532       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
533 	{
534 	  phi = gsi_stmt (gsi);
535 	  def = PHI_RESULT (phi);
536 	  if (! virtual_operand_p (def)
537 	      && FLOAT_TYPE_P (TREE_TYPE (def)))
538 	    execute_cse_reciprocals_1 (NULL, def);
539 	}
540 
541       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
542         {
543 	  gimple stmt = gsi_stmt (gsi);
544 
545 	  if (gimple_has_lhs (stmt)
546 	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
547 	      && FLOAT_TYPE_P (TREE_TYPE (def))
548 	      && TREE_CODE (def) == SSA_NAME)
549 	    execute_cse_reciprocals_1 (&gsi, def);
550 	}
551 
552       if (optimize_bb_for_size_p (bb))
553         continue;
554 
555       /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
556       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
557         {
558 	  gimple stmt = gsi_stmt (gsi);
559 	  tree fndecl;
560 
561 	  if (is_gimple_assign (stmt)
562 	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
563 	    {
564 	      tree arg1 = gimple_assign_rhs2 (stmt);
565 	      gimple stmt1;
566 
567 	      if (TREE_CODE (arg1) != SSA_NAME)
568 		continue;
569 
570 	      stmt1 = SSA_NAME_DEF_STMT (arg1);
571 
572 	      if (is_gimple_call (stmt1)
573 		  && gimple_call_lhs (stmt1)
574 		  && (fndecl = gimple_call_fndecl (stmt1))
575 		  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
576 		      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
577 		{
578 		  enum built_in_function code;
579 		  bool md_code, fail;
580 		  imm_use_iterator ui;
581 		  use_operand_p use_p;
582 
583 		  code = DECL_FUNCTION_CODE (fndecl);
584 		  md_code = DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD;
585 
586 		  fndecl = targetm.builtin_reciprocal (code, md_code, false);
587 		  if (!fndecl)
588 		    continue;
589 
590 		  /* Check that all uses of the SSA name are divisions,
591 		     otherwise replacing the defining statement will do
592 		     the wrong thing.  */
593 		  fail = false;
594 		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
595 		    {
596 		      gimple stmt2 = USE_STMT (use_p);
597 		      if (is_gimple_debug (stmt2))
598 			continue;
599 		      if (!is_gimple_assign (stmt2)
600 			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
601 			  || gimple_assign_rhs1 (stmt2) == arg1
602 			  || gimple_assign_rhs2 (stmt2) != arg1)
603 			{
604 			  fail = true;
605 			  break;
606 			}
607 		    }
608 		  if (fail)
609 		    continue;
610 
611 		  gimple_replace_lhs (stmt1, arg1);
612 		  gimple_call_set_fndecl (stmt1, fndecl);
613 		  update_stmt (stmt1);
614 		  reciprocal_stats.rfuncs_inserted++;
615 
616 		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
617 		    {
618 		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
619 		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
620 		      fold_stmt_inplace (&gsi);
621 		      update_stmt (stmt);
622 		    }
623 		}
624 	    }
625 	}
626     }
627 
628   statistics_counter_event (cfun, "reciprocal divs inserted",
629 			    reciprocal_stats.rdivs_inserted);
630   statistics_counter_event (cfun, "reciprocal functions inserted",
631 			    reciprocal_stats.rfuncs_inserted);
632 
633   free_dominance_info (CDI_DOMINATORS);
634   free_dominance_info (CDI_POST_DOMINATORS);
635   free_alloc_pool (occ_pool);
636   return 0;
637 }
638 
639 struct gimple_opt_pass pass_cse_reciprocals =
640 {
641  {
642   GIMPLE_PASS,
643   "recip",				/* name */
644   OPTGROUP_NONE,                        /* optinfo_flags */
645   gate_cse_reciprocals,			/* gate */
646   execute_cse_reciprocals,		/* execute */
647   NULL,					/* sub */
648   NULL,					/* next */
649   0,					/* static_pass_number */
650   TV_NONE,				/* tv_id */
651   PROP_ssa,				/* properties_required */
652   0,					/* properties_provided */
653   0,					/* properties_destroyed */
654   0,					/* todo_flags_start */
655   TODO_update_ssa | TODO_verify_ssa
656     | TODO_verify_stmts                /* todo_flags_finish */
657  }
658 };
659 
660 /* Records statement USE_STMT in the vector STMTS if its basic block is
661    dominated by *TOP_BB, dominates it, or *TOP_BB is not yet initialized.
662    Returns true if the occurrence was pushed on
663    the vector.  Adjusts *TOP_BB to be the basic block dominating all
664    statements in the vector.  */
665 
666 static bool
667 maybe_record_sincos (vec<gimple> *stmts,
668 		     basic_block *top_bb, gimple use_stmt)
669 {
670   basic_block use_bb = gimple_bb (use_stmt);
671   if (*top_bb
672       && (*top_bb == use_bb
673 	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
674     stmts->safe_push (use_stmt);
675   else if (!*top_bb
676 	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
677     {
678       stmts->safe_push (use_stmt);
679       *top_bb = use_bb;
680     }
681   else
682     return false;
683 
684   return true;
685 }
686 
687 /* Look for sin, cos and cexpi calls with the same argument NAME and
688    create a single call to cexpi CSEing the result in this case.
689    We first walk over all immediate uses of the argument collecting
690    statements that we can CSE in a vector and in a second pass replace
691    the statement rhs with a REALPART or IMAGPART expression on the
692    result of the cexpi call we insert before the use statement that
693    dominates all other candidates.  */
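
/* A hedged source-level sketch of the effect (assuming the target has
   cexpi or sincos available, and ignoring the exact temporaries the
   pass creates):

	s = sin (a);			t = cexpi (a);
	c = cos (a);	   becomes	s = __imag__ t;
					c = __real__ t;

   The cexpi call is inserted at the dominating use (or right after the
   definition of "a" when that definition is in the same block), and each
   original call is replaced by an IMAGPART_EXPR/REALPART_EXPR copy.  */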
694 
695 static bool
696 execute_cse_sincos_1 (tree name)
697 {
698   gimple_stmt_iterator gsi;
699   imm_use_iterator use_iter;
700   tree fndecl, res, type;
701   gimple def_stmt, use_stmt, stmt;
702   int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
703   vec<gimple> stmts = vNULL;
704   basic_block top_bb = NULL;
705   int i;
706   bool cfg_changed = false;
707 
708   type = TREE_TYPE (name);
709   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
710     {
711       if (gimple_code (use_stmt) != GIMPLE_CALL
712 	  || !gimple_call_lhs (use_stmt)
713 	  || !(fndecl = gimple_call_fndecl (use_stmt))
714 	  || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL)
715 	continue;
716 
717       switch (DECL_FUNCTION_CODE (fndecl))
718 	{
719 	CASE_FLT_FN (BUILT_IN_COS):
720 	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
721 	  break;
722 
723 	CASE_FLT_FN (BUILT_IN_SIN):
724 	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
725 	  break;
726 
727 	CASE_FLT_FN (BUILT_IN_CEXPI):
728 	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
729 	  break;
730 
731 	default:;
732 	}
733     }
734 
735   if (seen_cos + seen_sin + seen_cexpi <= 1)
736     {
737       stmts.release ();
738       return false;
739     }
740 
741   /* Simply insert cexpi at the beginning of top_bb but not earlier than
742      the name def statement.  */
743   fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
744   if (!fndecl)
745     return false;
746   stmt = gimple_build_call (fndecl, 1, name);
747   res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
748   gimple_call_set_lhs (stmt, res);
749 
750   def_stmt = SSA_NAME_DEF_STMT (name);
751   if (!SSA_NAME_IS_DEFAULT_DEF (name)
752       && gimple_code (def_stmt) != GIMPLE_PHI
753       && gimple_bb (def_stmt) == top_bb)
754     {
755       gsi = gsi_for_stmt (def_stmt);
756       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
757     }
758   else
759     {
760       gsi = gsi_after_labels (top_bb);
761       gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
762     }
763   sincos_stats.inserted++;
764 
765   /* And adjust the recorded old call sites.  */
766   for (i = 0; stmts.iterate (i, &use_stmt); ++i)
767     {
768       tree rhs = NULL;
769       fndecl = gimple_call_fndecl (use_stmt);
770 
771       switch (DECL_FUNCTION_CODE (fndecl))
772 	{
773 	CASE_FLT_FN (BUILT_IN_COS):
774 	  rhs = fold_build1 (REALPART_EXPR, type, res);
775 	  break;
776 
777 	CASE_FLT_FN (BUILT_IN_SIN):
778 	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
779 	  break;
780 
781 	CASE_FLT_FN (BUILT_IN_CEXPI):
782 	  rhs = res;
783 	  break;
784 
785 	default:;
786 	  gcc_unreachable ();
787 	}
788 
789 	/* Replace call with a copy.  */
790 	stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);
791 
792 	gsi = gsi_for_stmt (use_stmt);
793 	gsi_replace (&gsi, stmt, true);
794 	if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
795 	  cfg_changed = true;
796     }
797 
798   stmts.release ();
799 
800   return cfg_changed;
801 }
802 
803 /* To evaluate powi(x,n), the floating point value x raised to the
804    constant integer exponent n, we use a hybrid algorithm that
805    combines the "window method" with look-up tables.  For an
806    introduction to exponentiation algorithms and "addition chains",
807    see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
808    "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
809    3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
810    Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */
811 
812 /* Provide a default value for POWI_MAX_MULTS, the maximum number of
813    multiplications to inline before calling the system library's pow
814    function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications,
815    so this default never requires calling pow, powf or powl.  */
816 
817 #ifndef POWI_MAX_MULTS
818 #define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
819 #endif
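
/* For reference, the 2*bits(n)-2 bound is that of plain binary
   exponentiation: with b significant bits in n we need at most b-1
   squarings plus at most b-1 further multiplications for the remaining
   set bits; e.g. n = 255 (8 bits) costs at most 7 + 7 = 14 = 2*8-2
   multiplications.  */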
820 
821 /* The size of the "optimal power tree" lookup table.  All
822    exponents less than this value are simply looked up in the
823    powi_table below.  This threshold is also used to size the
824    cache of pseudo registers that hold intermediate results.  */
825 #define POWI_TABLE_SIZE 256
826 
827 /* The size, in bits of the window, used in the "window method"
828    exponentiation algorithm.  This is equivalent to a radix of
829    (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
830 #define POWI_WINDOW_SIZE 3
831 
832 /* The following table is an efficient representation of an
833    "optimal power tree".  For each value, i, the corresponding
834    value, j, in the table states that an optimal evaluation
835    sequence for calculating pow(x,i) can be found by evaluating
836    pow(x,j)*pow(x,i-j).  An optimal power tree for the first
837    100 integers is given in Knuth's "Seminumerical algorithms".  */
838 
839 static const unsigned char powi_table[POWI_TABLE_SIZE] =
840   {
841       0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
842       4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
843       8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
844      12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
845      16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
846      20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
847      24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
848      28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
849      32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
850      36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
851      40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
852      44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
853      48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
854      52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
855      56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
856      60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
857      64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
858      68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
859      72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
860      76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
861      80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
862      84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
863      88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
864      92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
865      96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
866     100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
867     104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
868     108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
869     112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
870     116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
871     120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
872     124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
873   };
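
/* A worked example of how the table drives powi_cost and powi_as_mults
   below (illustrative only): for n = 15, powi_table[15] = 9, so x**15 is
   evaluated as x**9 * x**6; recursing, x**9 = x**3 * x**6,
   x**6 = x**3 * x**3, x**3 = x * x**2 and x**2 = x * x, for a total of
   five multiplications, each intermediate power being computed once and
   cached.  */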
874 
875 
876 /* Return the number of multiplications required to calculate
877    powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
878    subroutine of powi_cost.  CACHE is an array indicating
879    which exponents have already been calculated.  */
880 
881 static int
882 powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
883 {
884   /* If we've already calculated this exponent, then this evaluation
885      doesn't require any additional multiplications.  */
886   if (cache[n])
887     return 0;
888 
889   cache[n] = true;
890   return powi_lookup_cost (n - powi_table[n], cache)
891 	 + powi_lookup_cost (powi_table[n], cache) + 1;
892 }
893 
894 /* Return the number of multiplications required to calculate
895    powi(x,n) for an arbitrary x, given the exponent N.  This
896    function needs to be kept in sync with powi_as_mults below.  */
897 
898 static int
899 powi_cost (HOST_WIDE_INT n)
900 {
901   bool cache[POWI_TABLE_SIZE];
902   unsigned HOST_WIDE_INT digit;
903   unsigned HOST_WIDE_INT val;
904   int result;
905 
906   if (n == 0)
907     return 0;
908 
909   /* Ignore the reciprocal when calculating the cost.  */
910   val = (n < 0) ? -n : n;
911 
912   /* Initialize the exponent cache.  */
913   memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
914   cache[1] = true;
915 
916   result = 0;
917 
918   while (val >= POWI_TABLE_SIZE)
919     {
920       if (val & 1)
921 	{
922 	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
923 	  result += powi_lookup_cost (digit, cache)
924 		    + POWI_WINDOW_SIZE + 1;
925 	  val >>= POWI_WINDOW_SIZE;
926 	}
927       else
928 	{
929 	  val >>= 1;
930 	  result++;
931 	}
932     }
933 
934   return result + powi_lookup_cost (val, cache);
935 }
936 
937 /* Recursive subroutine of powi_as_mults.  This function takes the
938    array, CACHE, of already calculated exponents and an exponent N and
939    returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */
940 
941 static tree
942 powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
943 		 HOST_WIDE_INT n, tree *cache)
944 {
945   tree op0, op1, ssa_target;
946   unsigned HOST_WIDE_INT digit;
947   gimple mult_stmt;
948 
949   if (n < POWI_TABLE_SIZE && cache[n])
950     return cache[n];
951 
952   ssa_target = make_temp_ssa_name (type, NULL, "powmult");
953 
954   if (n < POWI_TABLE_SIZE)
955     {
956       cache[n] = ssa_target;
957       op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
958       op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
959     }
960   else if (n & 1)
961     {
962       digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
963       op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
964       op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
965     }
966   else
967     {
968       op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
969       op1 = op0;
970     }
971 
972   mult_stmt = gimple_build_assign_with_ops (MULT_EXPR, ssa_target, op0, op1);
973   gimple_set_location (mult_stmt, loc);
974   gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);
975 
976   return ssa_target;
977 }
978 
979 /* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
980    This function needs to be kept in sync with powi_cost above.  */
981 
982 static tree
983 powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
984 	       tree arg0, HOST_WIDE_INT n)
985 {
986   tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
987   gimple div_stmt;
988   tree target;
989 
990   if (n == 0)
991     return build_real (type, dconst1);
992 
993   memset (cache, 0,  sizeof (cache));
994   cache[1] = arg0;
995 
996   result = powi_as_mults_1 (gsi, loc, type, (n < 0) ? -n : n, cache);
997   if (n >= 0)
998     return result;
999 
1000   /* If the original exponent was negative, reciprocate the result.  */
1001   target = make_temp_ssa_name (type, NULL, "powmult");
1002   div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target,
1003 					   build_real (type, dconst1),
1004 					   result);
1005   gimple_set_location (div_stmt, loc);
1006   gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);
1007 
1008   return target;
1009 }
1010 
1011 /* ARG0 and N are the two arguments to a powi builtin in GSI with
1012    location info LOC.  If the arguments are appropriate, create an
1013    equivalent sequence of statements prior to GSI using an optimal
1014    number of multiplications, and return an expression holding the
1015    result.  */
1016 
1017 static tree
1018 gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
1019 			    tree arg0, HOST_WIDE_INT n)
1020 {
1021   /* Avoid the most negative number, whose negation overflows.  */
1022   if (n != -n
1023       && ((n >= -1 && n <= 2)
1024 	  || (optimize_function_for_speed_p (cfun)
1025 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1026     return powi_as_mults (gsi, loc, arg0, n);
1027 
1028   return NULL_TREE;
1029 }
1030 
1031 /* Build a gimple call statement that calls FN with argument ARG.
1032    Set the lhs of the call statement to a fresh SSA name.  Insert the
1033    statement prior to GSI's current position, and return the fresh
1034    SSA name.  */
1035 
1036 static tree
1037 build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
1038 		       tree fn, tree arg)
1039 {
1040   gimple call_stmt;
1041   tree ssa_target;
1042 
1043   call_stmt = gimple_build_call (fn, 1, arg);
1044   ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
1045   gimple_set_lhs (call_stmt, ssa_target);
1046   gimple_set_location (call_stmt, loc);
1047   gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);
1048 
1049   return ssa_target;
1050 }
1051 
1052 /* Build a gimple binary operation with the given CODE and arguments
1053    ARG0, ARG1, assigning the result to a new SSA name for variable
1054    TARGET.  Insert the statement prior to GSI's current position, and
1055    return the fresh SSA name.  */
1056 
1057 static tree
1058 build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
1059 			const char *name, enum tree_code code,
1060 			tree arg0, tree arg1)
1061 {
1062   tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
1063   gimple stmt = gimple_build_assign_with_ops (code, result, arg0, arg1);
1064   gimple_set_location (stmt, loc);
1065   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1066   return result;
1067 }
1068 
1069 /* Build a gimple reference operation with the given CODE and argument
1070    ARG, assigning the result to a new SSA name of TYPE with NAME.
1071    Insert the statement prior to GSI's current position, and return
1072    the fresh SSA name.  */
1073 
1074 static inline tree
1075 build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
1076 		      const char *name, enum tree_code code, tree arg0)
1077 {
1078   tree result = make_temp_ssa_name (type, NULL, name);
1079   gimple stmt = gimple_build_assign (result, build1 (code, type, arg0));
1080   gimple_set_location (stmt, loc);
1081   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1082   return result;
1083 }
1084 
1085 /* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
1086    prior to GSI's current position, and return the fresh SSA name.  */
1087 
1088 static tree
1089 build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
1090 		       tree type, tree val)
1091 {
1092   tree result = make_ssa_name (type, NULL);
1093   gimple stmt = gimple_build_assign_with_ops (NOP_EXPR, result, val, NULL_TREE);
1094   gimple_set_location (stmt, loc);
1095   gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
1096   return result;
1097 }
1098 
1099 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1100    with location info LOC.  If possible, create an equivalent and
1101    less expensive sequence of statements prior to GSI, and return an
1102    expression holding the result.  */
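
/* A sketch of typical replacements performed below (the pow(x,0.5) case
   needs only that signed zeros be ignored; the others also assume
   -funsafe-math-optimizations and, where the code checks for it, a
   hardware sqrt instruction):

	pow (x, 0.5)    ->  sqrt (x)
	pow (x, 0.25)   ->  sqrt (sqrt (x))
	pow (x, 0.75)   ->  sqrt (x) * sqrt (sqrt (x))
	pow (x, 1./3.)  ->  cbrt (x)
	pow (x, 3.5)    ->  sqrt (x) * powi (x, 3)

   with the powi factor expanded into multiplications by
   gimple_expand_builtin_powi.  */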
1103 
1104 static tree
1105 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1106 			   tree arg0, tree arg1)
1107 {
1108   REAL_VALUE_TYPE c, cint, dconst1_4, dconst3_4, dconst1_3, dconst1_6;
1109   REAL_VALUE_TYPE c2, dconst3;
1110   HOST_WIDE_INT n;
1111   tree type, sqrtfn, cbrtfn, sqrt_arg0, sqrt_sqrt, result, cbrt_x, powi_cbrt_x;
1112   enum machine_mode mode;
1113   bool hw_sqrt_exists, c_is_int, c2_is_int;
1114 
1115   /* If the exponent isn't a constant, there's nothing of interest
1116      to be done.  */
1117   if (TREE_CODE (arg1) != REAL_CST)
1118     return NULL_TREE;
1119 
1120   /* If the exponent is equivalent to an integer, expand to an optimal
1121      multiplication sequence when profitable.  */
1122   c = TREE_REAL_CST (arg1);
1123   n = real_to_integer (&c);
1124   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1125   c_is_int = real_identical (&c, &cint);
1126 
1127   if (c_is_int
1128       && ((n >= -1 && n <= 2)
1129 	  || (flag_unsafe_math_optimizations
1130 	      && optimize_insn_for_speed_p ()
1131 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1132     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1133 
1134   /* Attempt various optimizations using sqrt and cbrt.  */
1135   type = TREE_TYPE (arg0);
1136   mode = TYPE_MODE (type);
1137   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1138 
1139   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1140      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1141      sqrt(-0) = -0.  */
1142   if (sqrtfn
1143       && REAL_VALUES_EQUAL (c, dconsthalf)
1144       && !HONOR_SIGNED_ZEROS (mode))
1145     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1146 
1147   /* Optimize pow(x,0.25) = sqrt(sqrt(x)).  Assume on most machines that
1148      a builtin sqrt instruction is smaller than a call to pow with 0.25,
1149      so do this optimization even if -Os.  Don't do this optimization
1150      if we don't have a hardware sqrt insn.  */
1151   dconst1_4 = dconst1;
1152   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1153   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1154 
1155   if (flag_unsafe_math_optimizations
1156       && sqrtfn
1157       && REAL_VALUES_EQUAL (c, dconst1_4)
1158       && hw_sqrt_exists)
1159     {
1160       /* sqrt(x)  */
1161       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1162 
1163       /* sqrt(sqrt(x))  */
1164       return build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1165     }
1166 
1167   /* Optimize pow(x,0.75) = sqrt(x) * sqrt(sqrt(x)) unless we are
1168      optimizing for space.  Don't do this optimization if we don't have
1169      a hardware sqrt insn.  */
1170   real_from_integer (&dconst3_4, VOIDmode, 3, 0, 0);
1171   SET_REAL_EXP (&dconst3_4, REAL_EXP (&dconst3_4) - 2);
1172 
1173   if (flag_unsafe_math_optimizations
1174       && sqrtfn
1175       && optimize_function_for_speed_p (cfun)
1176       && REAL_VALUES_EQUAL (c, dconst3_4)
1177       && hw_sqrt_exists)
1178     {
1179       /* sqrt(x)  */
1180       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1181 
1182       /* sqrt(sqrt(x))  */
1183       sqrt_sqrt = build_and_insert_call (gsi, loc, sqrtfn, sqrt_arg0);
1184 
1185       /* sqrt(x) * sqrt(sqrt(x))  */
1186       return build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1187 				     sqrt_arg0, sqrt_sqrt);
1188     }
1189 
1190   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1191      optimizations since 1./3. is not exactly representable.  If x
1192      is negative and finite, the correct value of pow(x,1./3.) is
1193      a NaN with the "invalid" exception raised, because the value
1194      of 1./3. actually has an even denominator.  The correct value
1195      of cbrt(x) is a negative real value.  */
1196   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1197   dconst1_3 = real_value_truncate (mode, dconst_third ());
1198 
1199   if (flag_unsafe_math_optimizations
1200       && cbrtfn
1201       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1202       && REAL_VALUES_EQUAL (c, dconst1_3))
1203     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1204 
1205   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1206      if we don't have a hardware sqrt insn.  */
1207   dconst1_6 = dconst1_3;
1208   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1209 
1210   if (flag_unsafe_math_optimizations
1211       && sqrtfn
1212       && cbrtfn
1213       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1214       && optimize_function_for_speed_p (cfun)
1215       && hw_sqrt_exists
1216       && REAL_VALUES_EQUAL (c, dconst1_6))
1217     {
1218       /* sqrt(x)  */
1219       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1220 
1221       /* cbrt(sqrt(x))  */
1222       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1223     }
1224 
1225   /* Optimize pow(x,c), where n = 2c for some nonzero integer n
1226      and c not an integer, into
1227 
1228        sqrt(x) * powi(x, n/2),                n > 0;
1229        1.0 / (sqrt(x) * powi(x, abs(n/2))),   n < 0.
1230 
1231      Do not calculate the powi factor when n/2 = 0.  */
1232   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1233   n = real_to_integer (&c2);
1234   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1235   c2_is_int = real_identical (&c2, &cint);
1236 
1237   if (flag_unsafe_math_optimizations
1238       && sqrtfn
1239       && c2_is_int
1240       && !c_is_int
1241       && optimize_function_for_speed_p (cfun))
1242     {
1243       tree powi_x_ndiv2 = NULL_TREE;
1244 
1245       /* Attempt to fold powi(arg0, abs(n/2)) into multiplies.  If not
1246          possible or profitable, give up.  Skip the degenerate case when
1247          n is 1 or -1, where the result is always 1.  */
1248       if (absu_hwi (n) != 1)
1249 	{
1250 	  powi_x_ndiv2 = gimple_expand_builtin_powi (gsi, loc, arg0,
1251 						     abs_hwi (n / 2));
1252 	  if (!powi_x_ndiv2)
1253 	    return NULL_TREE;
1254 	}
1255 
1256       /* Calculate sqrt(x).  When n is not 1 or -1, multiply it by the
1257 	 result of the optimal multiply sequence just calculated.  */
1258       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1259 
1260       if (absu_hwi (n) == 1)
1261 	result = sqrt_arg0;
1262       else
1263 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1264 					 sqrt_arg0, powi_x_ndiv2);
1265 
1266       /* If n is negative, reciprocate the result.  */
1267       if (n < 0)
1268 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1269 					 build_real (type, dconst1), result);
1270       return result;
1271     }
1272 
1273   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1274 
1275      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1276      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1277 
1278      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1279      different from pow(x, 1./3.) due to rounding and behavior with
1280      negative x, we need to constrain this transformation to unsafe
1281      math and positive x or finite math.  */
1282   real_from_integer (&dconst3, VOIDmode, 3, 0, 0);
1283   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
1284   real_round (&c2, mode, &c2);
1285   n = real_to_integer (&c2);
1286   real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
1287   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
1288   real_convert (&c2, mode, &c2);
1289 
1290   if (flag_unsafe_math_optimizations
1291       && cbrtfn
1292       && (gimple_val_nonnegative_real_p (arg0) || !HONOR_NANS (mode))
1293       && real_identical (&c2, &c)
1294       && !c2_is_int
1295       && optimize_function_for_speed_p (cfun)
1296       && powi_cost (n / 3) <= POWI_MAX_MULTS)
1297     {
1298       tree powi_x_ndiv3 = NULL_TREE;
1299 
1300       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
1301          possible or profitable, give up.  Skip the degenerate case when
1302          abs(n) < 3, where the result is always 1.  */
1303       if (absu_hwi (n) >= 3)
1304 	{
1305 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
1306 						     abs_hwi (n / 3));
1307 	  if (!powi_x_ndiv3)
1308 	    return NULL_TREE;
1309 	}
1310 
1311       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
1312          as that creates an unnecessary variable.  Instead, just produce
1313          either cbrt(x) or cbrt(x) * cbrt(x).  */
1314       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
1315 
1316       if (absu_hwi (n) % 3 == 1)
1317 	powi_cbrt_x = cbrt_x;
1318       else
1319 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1320 					      cbrt_x, cbrt_x);
1321 
1322       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
1323       if (absu_hwi (n) < 3)
1324 	result = powi_cbrt_x;
1325       else
1326 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1327 					 powi_x_ndiv3, powi_cbrt_x);
1328 
1329       /* If n is negative, reciprocate the result.  */
1330       if (n < 0)
1331 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1332 					 build_real (type, dconst1), result);
1333 
1334       return result;
1335     }
1336 
1337   /* No optimizations succeeded.  */
1338   return NULL_TREE;
1339 }
1340 
1341 /* ARG is the argument to a cabs builtin call in GSI with location info
1342    LOC.  Create a sequence of statements prior to GSI that calculates
1343    sqrt(R*R + I*I), where R and I are the real and imaginary components
1344    of ARG, respectively.  Return an expression holding the result.  */
1345 
1346 static tree
1347 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
1348 {
1349   tree real_part, imag_part, addend1, addend2, sum, result;
1350   tree type = TREE_TYPE (TREE_TYPE (arg));
1351   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1352   enum machine_mode mode = TYPE_MODE (type);
1353 
1354   if (!flag_unsafe_math_optimizations
1355       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
1356       || !sqrtfn
1357       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
1358     return NULL_TREE;
1359 
1360   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
1361 				    REALPART_EXPR, arg);
1362   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1363 				    real_part, real_part);
1364   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
1365 				    IMAGPART_EXPR, arg);
1366   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
1367 				    imag_part, imag_part);
1368   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
1369   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
1370 
1371   return result;
1372 }
1373 
1374 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
1375    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
1376    an optimal number of multiplies, when n is a constant.  */
1377 
1378 static unsigned int
1379 execute_cse_sincos (void)
1380 {
1381   basic_block bb;
1382   bool cfg_changed = false;
1383 
1384   calculate_dominance_info (CDI_DOMINATORS);
1385   memset (&sincos_stats, 0, sizeof (sincos_stats));
1386 
1387   FOR_EACH_BB (bb)
1388     {
1389       gimple_stmt_iterator gsi;
1390       bool cleanup_eh = false;
1391 
1392       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1393         {
1394 	  gimple stmt = gsi_stmt (gsi);
1395 	  tree fndecl;
1396 
1397 	  /* Only the last stmt in a bb could throw, no need to call
1398 	     gimple_purge_dead_eh_edges if we change something in the middle
1399 	     of a basic block.  */
1400 	  cleanup_eh = false;
1401 
1402 	  if (is_gimple_call (stmt)
1403 	      && gimple_call_lhs (stmt)
1404 	      && (fndecl = gimple_call_fndecl (stmt))
1405 	      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
1406 	    {
1407 	      tree arg, arg0, arg1, result;
1408 	      HOST_WIDE_INT n;
1409 	      location_t loc;
1410 
1411 	      switch (DECL_FUNCTION_CODE (fndecl))
1412 		{
1413 		CASE_FLT_FN (BUILT_IN_COS):
1414 		CASE_FLT_FN (BUILT_IN_SIN):
1415 		CASE_FLT_FN (BUILT_IN_CEXPI):
1416 		  /* Make sure we have either sincos or cexp.  */
1417 		  if (!TARGET_HAS_SINCOS && !TARGET_C99_FUNCTIONS)
1418 		    break;
1419 
1420 		  arg = gimple_call_arg (stmt, 0);
1421 		  if (TREE_CODE (arg) == SSA_NAME)
1422 		    cfg_changed |= execute_cse_sincos_1 (arg);
1423 		  break;
1424 
1425 		CASE_FLT_FN (BUILT_IN_POW):
1426 		  arg0 = gimple_call_arg (stmt, 0);
1427 		  arg1 = gimple_call_arg (stmt, 1);
1428 
1429 		  loc = gimple_location (stmt);
1430 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
1431 
1432 		  if (result)
1433 		    {
1434 		      tree lhs = gimple_get_lhs (stmt);
1435 		      gimple new_stmt = gimple_build_assign (lhs, result);
1436 		      gimple_set_location (new_stmt, loc);
1437 		      unlink_stmt_vdef (stmt);
1438 		      gsi_replace (&gsi, new_stmt, true);
1439 		      cleanup_eh = true;
1440 		      if (gimple_vdef (stmt))
1441 			release_ssa_name (gimple_vdef (stmt));
1442 		    }
1443 		  break;
1444 
1445 		CASE_FLT_FN (BUILT_IN_POWI):
1446 		  arg0 = gimple_call_arg (stmt, 0);
1447 		  arg1 = gimple_call_arg (stmt, 1);
1448 		  if (!host_integerp (arg1, 0))
1449 		    break;
1450 
1451 		  n = TREE_INT_CST_LOW (arg1);
1452 		  loc = gimple_location (stmt);
1453 		  result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
1454 
1455 		  if (result)
1456 		    {
1457 		      tree lhs = gimple_get_lhs (stmt);
1458 		      gimple new_stmt = gimple_build_assign (lhs, result);
1459 		      gimple_set_location (new_stmt, loc);
1460 		      unlink_stmt_vdef (stmt);
1461 		      gsi_replace (&gsi, new_stmt, true);
1462 		      cleanup_eh = true;
1463 		      if (gimple_vdef (stmt))
1464 			release_ssa_name (gimple_vdef (stmt));
1465 		    }
1466 		  break;
1467 
1468 		CASE_FLT_FN (BUILT_IN_CABS):
1469 		  arg0 = gimple_call_arg (stmt, 0);
1470 		  loc = gimple_location (stmt);
1471 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
1472 
1473 		  if (result)
1474 		    {
1475 		      tree lhs = gimple_get_lhs (stmt);
1476 		      gimple new_stmt = gimple_build_assign (lhs, result);
1477 		      gimple_set_location (new_stmt, loc);
1478 		      unlink_stmt_vdef (stmt);
1479 		      gsi_replace (&gsi, new_stmt, true);
1480 		      cleanup_eh = true;
1481 		      if (gimple_vdef (stmt))
1482 			release_ssa_name (gimple_vdef (stmt));
1483 		    }
1484 		  break;
1485 
1486 		default:;
1487 		}
1488 	    }
1489 	}
1490       if (cleanup_eh)
1491 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
1492     }
1493 
1494   statistics_counter_event (cfun, "sincos statements inserted",
1495 			    sincos_stats.inserted);
1496 
1497   free_dominance_info (CDI_DOMINATORS);
1498   return cfg_changed ? TODO_cleanup_cfg : 0;
1499 }
1500 
1501 static bool
1502 gate_cse_sincos (void)
1503 {
1504   /* We no longer require either sincos or cexp, since powi expansion
1505      piggybacks on this pass.  */
1506   return optimize;
1507 }
1508 
1509 struct gimple_opt_pass pass_cse_sincos =
1510 {
1511  {
1512   GIMPLE_PASS,
1513   "sincos",				/* name */
1514   OPTGROUP_NONE,                        /* optinfo_flags */
1515   gate_cse_sincos,			/* gate */
1516   execute_cse_sincos,			/* execute */
1517   NULL,					/* sub */
1518   NULL,					/* next */
1519   0,					/* static_pass_number */
1520   TV_NONE,				/* tv_id */
1521   PROP_ssa,				/* properties_required */
1522   0,					/* properties_provided */
1523   0,					/* properties_destroyed */
1524   0,					/* todo_flags_start */
1525   TODO_update_ssa | TODO_verify_ssa
1526     | TODO_verify_stmts                 /* todo_flags_finish */
1527  }
1528 };
1529 
1530 /* A symbolic number is used to detect byte permutation and selection
1531    patterns.  Therefore the field N contains an artificial number
1532    consisting of byte size markers:
1533 
1534    0    - byte has the value 0
1535    1..size - byte contains the content of the byte
1536    number indexed with that value minus one  */
1537 
1538 struct symbolic_number {
1539   unsigned HOST_WIDEST_INT n;
1540   tree type;
1541 };
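
/* For example (a sketch, assuming 8-bit bytes and a 32-bit source
   operand): the freshly initialized symbolic number is 0x04030201, i.e.
   byte i of the value carries marker i+1.  The shifts and rotates handled
   below permute these markers, a BIT_AND_EXPR with a whole-byte mask
   zeroes some of them, and a byte swap is recognized when the final
   number equals the byte-reversed constant 0x01020304.  */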
1542 
1543 /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic
1544    number N.  Return false if the requested operation is not permitted
1545    on a symbolic number.  */
1546 
1547 static inline bool
1548 do_shift_rotate (enum tree_code code,
1549 		 struct symbolic_number *n,
1550 		 int count)
1551 {
1552   int bitsize = TYPE_PRECISION (n->type);
1553 
1554   if (count % 8 != 0)
1555     return false;
1556 
1557   /* Zero out the extra bits of N in order to avoid them being shifted
1558      into the significant bits.  */
1559   if (bitsize < 8 * (int)sizeof (HOST_WIDEST_INT))
1560     n->n &= ((unsigned HOST_WIDEST_INT)1 << bitsize) - 1;
1561 
1562   switch (code)
1563     {
1564     case LSHIFT_EXPR:
1565       n->n <<= count;
1566       break;
1567     case RSHIFT_EXPR:
1568       /* Arithmetic shift of signed type: result is dependent on the value.  */
1569       if (!TYPE_UNSIGNED (n->type)
1570 	  && (n->n & ((unsigned HOST_WIDEST_INT) 0xff << (bitsize - 8))))
1571 	return false;
1572       n->n >>= count;
1573       break;
1574     case LROTATE_EXPR:
1575       n->n = (n->n << count) | (n->n >> (bitsize - count));
1576       break;
1577     case RROTATE_EXPR:
1578       n->n = (n->n >> count) | (n->n << (bitsize - count));
1579       break;
1580     default:
1581       return false;
1582     }
1583   /* Zero unused bits for size.  */
1584   if (bitsize < 8 * (int)sizeof (HOST_WIDEST_INT))
1585     n->n &= ((unsigned HOST_WIDEST_INT)1 << bitsize) - 1;
1586   return true;
1587 }
1588 
1589 /* Perform sanity checking for the symbolic number N and the gimple
1590    statement STMT.  */
1591 
1592 static inline bool
1593 verify_symbolic_number_p (struct symbolic_number *n, gimple stmt)
1594 {
1595   tree lhs_type;
1596 
1597   lhs_type = gimple_expr_type (stmt);
1598 
1599   if (TREE_CODE (lhs_type) != INTEGER_TYPE)
1600     return false;
1601 
1602   if (TYPE_PRECISION (lhs_type) != TYPE_PRECISION (n->type))
1603     return false;
1604 
1605   return true;
1606 }
1607 
1608 /* find_bswap_1 invokes itself recursively with N and tries to perform
1609    the operation given by the rhs of STMT on the result.  If the
1610    operation could successfully be executed, the function returns the
1611    tree expression of the source operand, or NULL otherwise.  */
1612 
1613 static tree
1614 find_bswap_1 (gimple stmt, struct symbolic_number *n, int limit)
1615 {
1616   enum tree_code code;
1617   tree rhs1, rhs2 = NULL;
1618   gimple rhs1_stmt, rhs2_stmt;
1619   tree source_expr1;
1620   enum gimple_rhs_class rhs_class;
1621 
1622   if (!limit || !is_gimple_assign (stmt))
1623     return NULL_TREE;
1624 
1625   rhs1 = gimple_assign_rhs1 (stmt);
1626 
1627   if (TREE_CODE (rhs1) != SSA_NAME)
1628     return NULL_TREE;
1629 
1630   code = gimple_assign_rhs_code (stmt);
1631   rhs_class = gimple_assign_rhs_class (stmt);
1632   rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
1633 
1634   if (rhs_class == GIMPLE_BINARY_RHS)
1635     rhs2 = gimple_assign_rhs2 (stmt);
1636 
1637   /* Handle unary rhs and binary rhs with integer constants as second
1638      operand.  */
1639 
1640   if (rhs_class == GIMPLE_UNARY_RHS
1641       || (rhs_class == GIMPLE_BINARY_RHS
1642 	  && TREE_CODE (rhs2) == INTEGER_CST))
1643     {
1644       if (code != BIT_AND_EXPR
1645 	  && code != LSHIFT_EXPR
1646 	  && code != RSHIFT_EXPR
1647 	  && code != LROTATE_EXPR
1648 	  && code != RROTATE_EXPR
1649 	  && code != NOP_EXPR
1650 	  && code != CONVERT_EXPR)
1651 	return NULL_TREE;
1652 
1653       source_expr1 = find_bswap_1 (rhs1_stmt, n, limit - 1);
1654 
1655 	  /* If find_bswap_1 returned NULL, STMT is a leaf node and we have
1656 	 to initialize the symbolic number.  */
1657       if (!source_expr1)
1658 	{
1659 	  int size;
1660 
1661 	  /* Set up the symbolic number N by setting each byte to a
1662 	     value between 1 and the byte size of rhs1.  The highest
1663 	     order byte is set to the byte size of rhs1 and the lowest
1664 	     order byte to 1.  */
1665 	  n->type = TREE_TYPE (rhs1);
1666 	  size = TYPE_PRECISION (n->type);
1667 	  if (size % BITS_PER_UNIT != 0)
1668 	    return NULL_TREE;
1669 	  if (size > HOST_BITS_PER_WIDEST_INT)
1670 	    return NULL_TREE;
1671 	  size /= BITS_PER_UNIT;
1672 	  n->n = (sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1673 		  (unsigned HOST_WIDEST_INT)0x08070605 << 32 | 0x04030201);
1674 
1675 	  if (size < (int)sizeof (HOST_WIDEST_INT))
1676 	    n->n &= ((unsigned HOST_WIDEST_INT)1 <<
1677 		     (size * BITS_PER_UNIT)) - 1;
1678 
1679 	  source_expr1 = rhs1;
1680 	}
1681 
1682       switch (code)
1683 	{
1684 	case BIT_AND_EXPR:
1685 	  {
1686 	    int i, size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1687 	    unsigned HOST_WIDEST_INT val = widest_int_cst_value (rhs2);
1688 	    unsigned HOST_WIDEST_INT tmp = val;
1689 
1690 	    /* Only constants masking full bytes are allowed.  */
1691 	    for (i = 0; i < size; i++, tmp >>= BITS_PER_UNIT)
1692 	      if ((tmp & 0xff) != 0 && (tmp & 0xff) != 0xff)
1693 		return NULL_TREE;
1694 
1695 	    n->n &= val;
1696 	  }
1697 	  break;
1698 	case LSHIFT_EXPR:
1699 	case RSHIFT_EXPR:
1700 	case LROTATE_EXPR:
1701 	case RROTATE_EXPR:
1702 	  if (!do_shift_rotate (code, n, (int)TREE_INT_CST_LOW (rhs2)))
1703 	    return NULL_TREE;
1704 	  break;
1705 	CASE_CONVERT:
1706 	  {
1707 	    int type_size, old_type_size;
1708 	    tree type;
1709 
1710 	    type = gimple_expr_type (stmt);
1711 	    type_size = TYPE_PRECISION (type);
1712 	    if (type_size % BITS_PER_UNIT != 0)
1713 	      return NULL_TREE;
1714 	    if (type_size > (int) HOST_BITS_PER_WIDEST_INT)
1715 	      return NULL_TREE;
1716 
1717 	    /* Sign extension: result is dependent on the value.  */
1718 	    old_type_size = TYPE_PRECISION (n->type);
1719 	    if (!TYPE_UNSIGNED (n->type)
1720 		&& type_size > old_type_size
1721 		&& n->n &
1722 		   ((unsigned HOST_WIDEST_INT) 0xff << (old_type_size - 8)))
1723 	      return NULL_TREE;
1724 
1725 	    if (type_size / BITS_PER_UNIT < (int)(sizeof (HOST_WIDEST_INT)))
1726 	      {
1727 		/* If STMT casts to a smaller type, mask out the bits not
1728 		   belonging to the target type.  */
1729 		n->n &= ((unsigned HOST_WIDEST_INT)1 << type_size) - 1;
1730 	      }
1731 	    n->type = type;
1732 	  }
1733 	  break;
1734 	default:
1735 	  return NULL_TREE;
1736 	};
1737       return verify_symbolic_number_p (n, stmt) ? source_expr1 : NULL;
1738     }
1739 
1740   /* Handle binary rhs.  */
1741 
1742   if (rhs_class == GIMPLE_BINARY_RHS)
1743     {
1744       int i, size;
1745       struct symbolic_number n1, n2;
1746       unsigned HOST_WIDEST_INT mask;
1747       tree source_expr2;
1748 
1749       if (code != BIT_IOR_EXPR)
1750 	return NULL_TREE;
1751 
1752       if (TREE_CODE (rhs2) != SSA_NAME)
1753 	return NULL_TREE;
1754 
1755       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
1756 
1757       switch (code)
1758 	{
1759 	case BIT_IOR_EXPR:
1760 	  source_expr1 = find_bswap_1 (rhs1_stmt, &n1, limit - 1);
1761 
1762 	  if (!source_expr1)
1763 	    return NULL_TREE;
1764 
1765 	  source_expr2 = find_bswap_1 (rhs2_stmt, &n2, limit - 1);
1766 
1767 	  if (source_expr1 != source_expr2
1768 	      || TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type))
1769 	    return NULL_TREE;
1770 
1771 	  n->type = n1.type;
1772 	  size = TYPE_PRECISION (n->type) / BITS_PER_UNIT;
1773 	  for (i = 0, mask = 0xff; i < size; i++, mask <<= BITS_PER_UNIT)
1774 	    {
1775 	      unsigned HOST_WIDEST_INT masked1, masked2;
1776 
1777 	      masked1 = n1.n & mask;
1778 	      masked2 = n2.n & mask;
1779 	      if (masked1 && masked2 && masked1 != masked2)
1780 		return NULL_TREE;
1781 	    }
1782 	  n->n = n1.n | n2.n;
1783 
1784 	  if (!verify_symbolic_number_p (n, stmt))
1785 	    return NULL_TREE;
1786 
1787 	  break;
1788 	default:
1789 	  return NULL_TREE;
1790 	}
1791       return source_expr1;
1792     }
1793   return NULL_TREE;
1794 }
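
/* Illustrative example of the BIT_IOR_EXPR case above (assumed
   values): combining n1.n == 0x01000000 (only the highest order byte
   selected) with n2.n == 0x00020304 is accepted because no byte
   position carries a nonzero marker in both operands, giving
   n->n == 0x01020304; had the same byte position carried different
   markers in n1 and n2, the candidate would have been rejected.  */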
1795 
1796 /* Check if STMT completes a bswap implementation consisting of ORs,
1797    SHIFTs and ANDs.  Return the source tree expression on which the
1798    byte swap is performed, or NULL if no bswap was found.  */
1799 
1800 static tree
1801 find_bswap (gimple stmt)
1802 {
1803 /* The number which the find_bswap result should match in order to
1804    have a full byte swap.  The number is shifted to the right according
1805    to the size of the symbolic number before using it.  */
1806   unsigned HOST_WIDEST_INT cmp =
1807     sizeof (HOST_WIDEST_INT) < 8 ? 0 :
1808     (unsigned HOST_WIDEST_INT)0x01020304 << 32 | 0x05060708;
1809 
1810   struct symbolic_number n;
1811   tree source_expr;
1812   int limit, bitsize;
1813 
1814   /* The last parameter determines the depth search limit.  It usually
1815      correlates directly to the number n of bytes to be touched.  We
1816      increase that number by log2(n) + 1 here in order to also
1817      cover signed -> unsigned conversions of the src operand as can be seen
1818      in libgcc, and for initial shift/and operation of the src operand.  */
1819   limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt)));
1820   limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit);
1821   source_expr =  find_bswap_1 (stmt, &n, limit);
1822 
1823   if (!source_expr)
1824     return NULL_TREE;
1825 
1826   /* Zero out the extra bits of N and CMP.  */
1827   bitsize = TYPE_PRECISION (n.type);
1828   if (bitsize < 8 * (int)sizeof (HOST_WIDEST_INT))
1829     {
1830       unsigned HOST_WIDEST_INT mask =
1831 	((unsigned HOST_WIDEST_INT)1 << bitsize) - 1;
1832 
1833       n.n &= mask;
1834       cmp >>= sizeof (HOST_WIDEST_INT) * BITS_PER_UNIT - bitsize;
1835     }
1836 
1837   /* A complete byte swap should make the symbolic number start
1838      with the largest digit in the highest order byte.  */
1839   if (cmp != n.n)
1840     return NULL_TREE;
1841 
1842   return source_expr;
1843 }
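
/* As a source-level illustration (a hypothetical example, not taken
   from this file), the pass below recognizes manual byte swaps such as

	y = ((x & 0x000000ff) << 24)
	  | ((x & 0x0000ff00) << 8)
	  | ((x & 0x00ff0000) >> 8)
	  | ((x & 0xff000000) >> 24);

   for a 32-bit unsigned X, and replaces the final BIT_IOR_EXPR
   statement with a call to __builtin_bswap32.  */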
1844 
1845 /* Find manual byte swap implementations and turn them into a bswap
1846    builtin invocation.  */
1847 
1848 static unsigned int
1849 execute_optimize_bswap (void)
1850 {
1851   basic_block bb;
1852   bool bswap16_p, bswap32_p, bswap64_p;
1853   bool changed = false;
1854   tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = NULL_TREE;
1855 
1856   if (BITS_PER_UNIT != 8 || CHAR_BIT != 8)
1857     return 0;
1858 
1859   if (sizeof (HOST_WIDEST_INT) < 8)
1860     return 0;
1861 
1862   bswap16_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP16)
1863 	       && optab_handler (bswap_optab, HImode) != CODE_FOR_nothing);
1864   bswap32_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP32)
1865 	       && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing);
1866   bswap64_p = (builtin_decl_explicit_p (BUILT_IN_BSWAP64)
1867 	       && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing
1868 		   || (bswap32_p && word_mode == SImode)));
1869 
1870   if (!bswap16_p && !bswap32_p && !bswap64_p)
1871     return 0;
1872 
1873   /* Determine the argument type of the builtins.  The code later on
1874      assumes that the return and argument type are the same.  */
1875   if (bswap16_p)
1876     {
1877       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1878       bswap16_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1879     }
1880 
1881   if (bswap32_p)
1882     {
1883       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1884       bswap32_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1885     }
1886 
1887   if (bswap64_p)
1888     {
1889       tree fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1890       bswap64_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl)));
1891     }
1892 
1893   memset (&bswap_stats, 0, sizeof (bswap_stats));
1894 
1895   FOR_EACH_BB (bb)
1896     {
1897       gimple_stmt_iterator gsi;
1898 
1899       /* We do a reverse scan for bswap patterns to make sure we get the
1900 	 widest match. As bswap pattern matching doesn't handle
1901 	 previously inserted smaller bswap replacements as sub-
1902 	 patterns, the wider variant wouldn't be detected.  */
1903       for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
1904         {
1905 	  gimple stmt = gsi_stmt (gsi);
1906 	  tree bswap_src, bswap_type;
1907 	  tree bswap_tmp;
1908 	  tree fndecl = NULL_TREE;
1909 	  int type_size;
1910 	  gimple call;
1911 
1912 	  if (!is_gimple_assign (stmt)
1913 	      || gimple_assign_rhs_code (stmt) != BIT_IOR_EXPR)
1914 	    continue;
1915 
1916 	  type_size = TYPE_PRECISION (gimple_expr_type (stmt));
1917 
1918 	  switch (type_size)
1919 	    {
1920 	    case 16:
1921 	      if (bswap16_p)
1922 		{
1923 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP16);
1924 		  bswap_type = bswap16_type;
1925 		}
1926 	      break;
1927 	    case 32:
1928 	      if (bswap32_p)
1929 		{
1930 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32);
1931 		  bswap_type = bswap32_type;
1932 		}
1933 	      break;
1934 	    case 64:
1935 	      if (bswap64_p)
1936 		{
1937 		  fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64);
1938 		  bswap_type = bswap64_type;
1939 		}
1940 	      break;
1941 	    default:
1942 	      continue;
1943 	    }
1944 
1945 	  if (!fndecl)
1946 	    continue;
1947 
1948 	  bswap_src = find_bswap (stmt);
1949 
1950 	  if (!bswap_src)
1951 	    continue;
1952 
1953 	  changed = true;
1954 	  if (type_size == 16)
1955 	    bswap_stats.found_16bit++;
1956 	  else if (type_size == 32)
1957 	    bswap_stats.found_32bit++;
1958 	  else
1959 	    bswap_stats.found_64bit++;
1960 
1961 	  bswap_tmp = bswap_src;
1962 
1963 	  /* Convert the src expression if necessary.  */
1964 	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1965 	    {
1966 	      gimple convert_stmt;
1967 	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapsrc");
1968 	      convert_stmt = gimple_build_assign_with_ops
1969 		  		(NOP_EXPR, bswap_tmp, bswap_src, NULL);
1970 	      gsi_insert_before (&gsi, convert_stmt, GSI_SAME_STMT);
1971 	    }
1972 
1973 	  call = gimple_build_call (fndecl, 1, bswap_tmp);
1974 
1975 	  bswap_tmp = gimple_assign_lhs (stmt);
1976 
1977 	  /* Convert the result if necessary.  */
1978 	  if (!useless_type_conversion_p (TREE_TYPE (bswap_tmp), bswap_type))
1979 	    {
1980 	      gimple convert_stmt;
1981 	      bswap_tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
1982 	      convert_stmt = gimple_build_assign_with_ops
1983 			(NOP_EXPR, gimple_assign_lhs (stmt), bswap_tmp, NULL);
1984 	      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
1985 	    }
1986 
1987 	  gimple_call_set_lhs (call, bswap_tmp);
1988 
1989 	  if (dump_file)
1990 	    {
1991 	      fprintf (dump_file, "%d bit bswap implementation found at: ",
1992 		       (int)type_size);
1993 	      print_gimple_stmt (dump_file, stmt, 0, 0);
1994 	    }
1995 
1996 	  gsi_insert_after (&gsi, call, GSI_SAME_STMT);
1997 	  gsi_remove (&gsi, true);
1998 	}
1999     }
2000 
2001   statistics_counter_event (cfun, "16-bit bswap implementations found",
2002 			    bswap_stats.found_16bit);
2003   statistics_counter_event (cfun, "32-bit bswap implementations found",
2004 			    bswap_stats.found_32bit);
2005   statistics_counter_event (cfun, "64-bit bswap implementations found",
2006 			    bswap_stats.found_64bit);
2007 
2008   return (changed ? TODO_update_ssa | TODO_verify_ssa
2009 	  | TODO_verify_stmts : 0);
2010 }
2011 
2012 static bool
2013 gate_optimize_bswap (void)
2014 {
2015   return flag_expensive_optimizations && optimize;
2016 }
2017 
2018 struct gimple_opt_pass pass_optimize_bswap =
2019 {
2020  {
2021   GIMPLE_PASS,
2022   "bswap",				/* name */
2023   OPTGROUP_NONE,                        /* optinfo_flags */
2024   gate_optimize_bswap,                  /* gate */
2025   execute_optimize_bswap,		/* execute */
2026   NULL,					/* sub */
2027   NULL,					/* next */
2028   0,					/* static_pass_number */
2029   TV_NONE,				/* tv_id */
2030   PROP_ssa,				/* properties_required */
2031   0,					/* properties_provided */
2032   0,					/* properties_destroyed */
2033   0,					/* todo_flags_start */
2034   0                                     /* todo_flags_finish */
2035  }
2036 };
2037 
2038 /* Return true if stmt is a type conversion operation that can be stripped
2039    when used in a widening multiply operation.  */
2040 static bool
2041 widening_mult_conversion_strippable_p (tree result_type, gimple stmt)
2042 {
2043   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2044 
2045   if (TREE_CODE (result_type) == INTEGER_TYPE)
2046     {
2047       tree op_type;
2048       tree inner_op_type;
2049 
2050       if (!CONVERT_EXPR_CODE_P (rhs_code))
2051 	return false;
2052 
2053       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2054 
2055       /* If the type of OP has the same precision as the result, then
2056 	 we can strip this conversion.  The multiply operation will be
2057 	 selected to create the correct extension as a by-product.  */
2058       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2059 	return true;
2060 
2061       /* We can also strip a conversion if it preserves the signed-ness of
2062 	 the operation and doesn't narrow the range.  */
2063       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2064 
2065       /* If the inner-most type is unsigned, then we can strip any
2066 	 intermediate widening operation.  If it's signed, then the
2067 	 intermediate widening operation must also be signed.  */
2068       if ((TYPE_UNSIGNED (inner_op_type)
2069 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2070 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2071 	return true;
2072 
2073       return false;
2074     }
2075 
2076   return rhs_code == FIXED_CONVERT_EXPR;
2077 }
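
/* For example (illustrative): if the multiplication result type is
   "long long" and STMT is the conversion "int tmp = (int) c" with C of
   type "unsigned char", the conversion is strippable: the innermost
   type is unsigned and strictly narrower than the intermediate type,
   so the widening multiply can extend directly from the unsigned char
   operand.  */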
2078 
2079 /* Return true if RHS is a suitable operand for a widening multiplication,
2080    assuming a target type of TYPE.
2081    There are two cases:
2082 
2083      - RHS makes some value at least twice as wide.  Store that value
2084        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2085 
2086      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2087        and set *TYPE_OUT to NULL.  */
2088 
2089 static bool
2090 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2091 			tree *new_rhs_out)
2092 {
2093   gimple stmt;
2094   tree type1, rhs1;
2095 
2096   if (TREE_CODE (rhs) == SSA_NAME)
2097     {
2098       stmt = SSA_NAME_DEF_STMT (rhs);
2099       if (is_gimple_assign (stmt))
2100 	{
2101 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2102 	    rhs1 = rhs;
2103 	  else
2104 	    {
2105 	      rhs1 = gimple_assign_rhs1 (stmt);
2106 
2107 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2108 		{
2109 		  *new_rhs_out = rhs1;
2110 		  *type_out = NULL;
2111 		  return true;
2112 		}
2113 	    }
2114 	}
2115       else
2116 	rhs1 = rhs;
2117 
2118       type1 = TREE_TYPE (rhs1);
2119 
2120       if (TREE_CODE (type1) != TREE_CODE (type)
2121 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2122 	return false;
2123 
2124       *new_rhs_out = rhs1;
2125       *type_out = type1;
2126       return true;
2127     }
2128 
2129   if (TREE_CODE (rhs) == INTEGER_CST)
2130     {
2131       *new_rhs_out = rhs;
2132       *type_out = NULL;
2133       return true;
2134     }
2135 
2136   return false;
2137 }
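
/* For example (illustrative, assumed types): with TYPE being
   "long long", an RHS defined by "(long long) a" for an "int" A
   matches the first case (*NEW_RHS_OUT = A, *TYPE_OUT = int), while
   an RHS that is the integer constant 1000 matches the second case
   (*NEW_RHS_OUT = 1000, *TYPE_OUT = NULL).  */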
2138 
2139 /* Return true if STMT performs a widening multiplication, assuming the
2140    output type is TYPE.  If so, store the unwidened types of the operands
2141    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2142    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2143    and *TYPE2_OUT would give the operands of the multiplication.  */
2144 
2145 static bool
2146 is_widening_mult_p (gimple stmt,
2147 		    tree *type1_out, tree *rhs1_out,
2148 		    tree *type2_out, tree *rhs2_out)
2149 {
2150   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2151 
2152   if (TREE_CODE (type) != INTEGER_TYPE
2153       && TREE_CODE (type) != FIXED_POINT_TYPE)
2154     return false;
2155 
2156   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2157 			       rhs1_out))
2158     return false;
2159 
2160   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2161 			       rhs2_out))
2162     return false;
2163 
2164   if (*type1_out == NULL)
2165     {
2166       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2167 	return false;
2168       *type1_out = *type2_out;
2169     }
2170 
2171   if (*type2_out == NULL)
2172     {
2173       if (!int_fits_type_p (*rhs2_out, *type1_out))
2174 	return false;
2175       *type2_out = *type1_out;
2176     }
2177 
2178   /* Ensure that the larger of the two operands comes first. */
2179   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2180     {
2181       tree tmp;
2182       tmp = *type1_out;
2183       *type1_out = *type2_out;
2184       *type2_out = tmp;
2185       tmp = *rhs1_out;
2186       *rhs1_out = *rhs2_out;
2187       *rhs2_out = tmp;
2188     }
2189 
2190   return true;
2191 }
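
/* Illustrative example (assumed types): for the source expression

	long long z = (long long) a * (long long) b;

   where a and b have type "int", is_widening_mult_p strips both
   conversions and reports *type1_out == *type2_out == int together
   with the unwidened operands in *rhs1_out and *rhs2_out.  If one
   operand is an integer constant, its type is taken from the other
   operand, provided the constant fits that type.  */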
2192 
2193 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2194    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2195    value is true iff we converted the statement.  */
2196 
2197 static bool
2198 convert_mult_to_widen (gimple stmt, gimple_stmt_iterator *gsi)
2199 {
2200   tree lhs, rhs1, rhs2, type, type1, type2;
2201   enum insn_code handler;
2202   enum machine_mode to_mode, from_mode, actual_mode;
2203   optab op;
2204   int actual_precision;
2205   location_t loc = gimple_location (stmt);
2206   bool from_unsigned1, from_unsigned2;
2207 
2208   lhs = gimple_assign_lhs (stmt);
2209   type = TREE_TYPE (lhs);
2210   if (TREE_CODE (type) != INTEGER_TYPE)
2211     return false;
2212 
2213   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2214     return false;
2215 
2216   to_mode = TYPE_MODE (type);
2217   from_mode = TYPE_MODE (type1);
2218   from_unsigned1 = TYPE_UNSIGNED (type1);
2219   from_unsigned2 = TYPE_UNSIGNED (type2);
2220 
2221   if (from_unsigned1 && from_unsigned2)
2222     op = umul_widen_optab;
2223   else if (!from_unsigned1 && !from_unsigned2)
2224     op = smul_widen_optab;
2225   else
2226     op = usmul_widen_optab;
2227 
2228   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2229 						  0, &actual_mode);
2230 
2231   if (handler == CODE_FOR_nothing)
2232     {
2233       if (op != smul_widen_optab)
2234 	{
2235 	  /* We can use a signed multiply with unsigned types as long as
2236 	     there is a wider mode to use, or it is the smaller of the two
2237 	     types that is unsigned.  Note that type1 >= type2, always.  */
2238 	  if ((TYPE_UNSIGNED (type1)
2239 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2240 	      || (TYPE_UNSIGNED (type2)
2241 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2242 	    {
2243 	      from_mode = GET_MODE_WIDER_MODE (from_mode);
2244 	      if (GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2245 		return false;
2246 	    }
2247 
2248 	  op = smul_widen_optab;
2249 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2250 							  from_mode, 0,
2251 							  &actual_mode);
2252 
2253 	  if (handler == CODE_FOR_nothing)
2254 	    return false;
2255 
2256 	  from_unsigned1 = from_unsigned2 = false;
2257 	}
2258       else
2259 	return false;
2260     }
2261 
2262   /* Ensure that the inputs to the handler are in the correct precision
2263      for the opcode.  This will be the full mode size.  */
2264   actual_precision = GET_MODE_PRECISION (actual_mode);
2265   if (2 * actual_precision > TYPE_PRECISION (type))
2266     return false;
2267   if (actual_precision != TYPE_PRECISION (type1)
2268       || from_unsigned1 != TYPE_UNSIGNED (type1))
2269     rhs1 = build_and_insert_cast (gsi, loc,
2270 				  build_nonstandard_integer_type
2271 				    (actual_precision, from_unsigned1), rhs1);
2272   if (actual_precision != TYPE_PRECISION (type2)
2273       || from_unsigned2 != TYPE_UNSIGNED (type2))
2274     rhs2 = build_and_insert_cast (gsi, loc,
2275 				  build_nonstandard_integer_type
2276 				    (actual_precision, from_unsigned2), rhs2);
2277 
2278   /* Handle constants.  */
2279   if (TREE_CODE (rhs1) == INTEGER_CST)
2280     rhs1 = fold_convert (type1, rhs1);
2281   if (TREE_CODE (rhs2) == INTEGER_CST)
2282     rhs2 = fold_convert (type2, rhs2);
2283 
2284   gimple_assign_set_rhs1 (stmt, rhs1);
2285   gimple_assign_set_rhs2 (stmt, rhs2);
2286   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2287   update_stmt (stmt);
2288   widen_mul_stats.widen_mults_inserted++;
2289   return true;
2290 }
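
/* A sketch of the transformation (illustrative, assumed SSA names and
   dump syntax):

	_1 = (long long int) a_2(D);
	_3 = (long long int) b_4(D);
	z_5 = _1 * _3;

   becomes, when the target has a suitable widening multiply pattern,

	z_5 = a_2(D) w* b_4(D);

   where "w*" denotes WIDEN_MULT_EXPR; the now-dead conversion
   statements are left for later DCE to remove.  */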
2291 
2292 /* Process a single gimple statement STMT, which is found at the
2293    iterator GSI and has either a PLUS_EXPR or a MINUS_EXPR as its
2294    rhs (given by CODE), and try to convert it into a
2295    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2296    is true iff we converted the statement.  */
2297 
2298 static bool
2299 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple stmt,
2300 			    enum tree_code code)
2301 {
2302   gimple rhs1_stmt = NULL, rhs2_stmt = NULL;
2303   gimple conv1_stmt = NULL, conv2_stmt = NULL, conv_stmt;
2304   tree type, type1, type2, optype;
2305   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2306   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2307   optab this_optab;
2308   enum tree_code wmult_code;
2309   enum insn_code handler;
2310   enum machine_mode to_mode, from_mode, actual_mode;
2311   location_t loc = gimple_location (stmt);
2312   int actual_precision;
2313   bool from_unsigned1, from_unsigned2;
2314 
2315   lhs = gimple_assign_lhs (stmt);
2316   type = TREE_TYPE (lhs);
2317   if (TREE_CODE (type) != INTEGER_TYPE
2318       && TREE_CODE (type) != FIXED_POINT_TYPE)
2319     return false;
2320 
2321   if (code == MINUS_EXPR)
2322     wmult_code = WIDEN_MULT_MINUS_EXPR;
2323   else
2324     wmult_code = WIDEN_MULT_PLUS_EXPR;
2325 
2326   rhs1 = gimple_assign_rhs1 (stmt);
2327   rhs2 = gimple_assign_rhs2 (stmt);
2328 
2329   if (TREE_CODE (rhs1) == SSA_NAME)
2330     {
2331       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2332       if (is_gimple_assign (rhs1_stmt))
2333 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2334     }
2335 
2336   if (TREE_CODE (rhs2) == SSA_NAME)
2337     {
2338       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2339       if (is_gimple_assign (rhs2_stmt))
2340 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2341     }
2342 
2343   /* Allow for one conversion statement between the multiply
2344      and addition/subtraction statement.  If there are multiple
2345      conversions then we assume they would invalidate this
2346      transformation.  If that's not the case then they should have
2347      been folded before now.  */
2348   if (CONVERT_EXPR_CODE_P (rhs1_code))
2349     {
2350       conv1_stmt = rhs1_stmt;
2351       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2352       if (TREE_CODE (rhs1) == SSA_NAME)
2353 	{
2354 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2355 	  if (is_gimple_assign (rhs1_stmt))
2356 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2357 	}
2358       else
2359 	return false;
2360     }
2361   if (CONVERT_EXPR_CODE_P (rhs2_code))
2362     {
2363       conv2_stmt = rhs2_stmt;
2364       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2365       if (TREE_CODE (rhs2) == SSA_NAME)
2366 	{
2367 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2368 	  if (is_gimple_assign (rhs2_stmt))
2369 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2370 	}
2371       else
2372 	return false;
2373     }
2374 
2375   /* If the rhs code is already WIDEN_MULT_EXPR, it would seem unnecessary
2376      to call is_widening_mult_p, but we still need the operands it returns.
2377 
2378      It might also appear that it would be sufficient to use the existing
2379      operands of the widening multiply, but that would limit the choice of
2380      multiply-and-accumulate instructions.  */
2381   if (code == PLUS_EXPR
2382       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2383     {
2384       if (!is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2385 			       &type2, &mult_rhs2))
2386 	return false;
2387       add_rhs = rhs2;
2388       conv_stmt = conv1_stmt;
2389     }
2390   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2391     {
2392       if (!is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2393 			       &type2, &mult_rhs2))
2394 	return false;
2395       add_rhs = rhs1;
2396       conv_stmt = conv2_stmt;
2397     }
2398   else
2399     return false;
2400 
2401   to_mode = TYPE_MODE (type);
2402   from_mode = TYPE_MODE (type1);
2403   from_unsigned1 = TYPE_UNSIGNED (type1);
2404   from_unsigned2 = TYPE_UNSIGNED (type2);
2405   optype = type1;
2406 
2407   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2408   if (from_unsigned1 != from_unsigned2)
2409     {
2410       if (!INTEGRAL_TYPE_P (type))
2411 	return false;
2412       /* We can use a signed multiply with unsigned types as long as
2413 	 there is a wider mode to use, or it is the smaller of the two
2414 	 types that is unsigned.  Note that type1 >= type2, always.  */
2415       if ((from_unsigned1
2416 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2417 	  || (from_unsigned2
2418 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2419 	{
2420 	  from_mode = GET_MODE_WIDER_MODE (from_mode);
2421 	  if (GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2422 	    return false;
2423 	}
2424 
2425       from_unsigned1 = from_unsigned2 = false;
2426       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2427 					       false);
2428     }
2429 
2430   /* If there was a conversion between the multiply and addition
2431      then we need to make sure it fits a multiply-and-accumulate.
2432      It should be a single mode change which does not change the
2433      value.  */
2434   if (conv_stmt)
2435     {
2436       /* We use the original, unmodified data types for this.  */
2437       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2438       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2439       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2440       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2441 
2442       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2443 	{
2444 	  /* Conversion is a truncate.  */
2445 	  if (TYPE_PRECISION (to_type) < data_size)
2446 	    return false;
2447 	}
2448       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2449 	{
2450 	  /* Conversion is an extend.  Check it's the right sort.  */
2451 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
2452 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2453 	    return false;
2454 	}
2455       /* else convert is a no-op for our purposes.  */
2456     }
2457 
2458   /* Verify that the machine can perform a widening multiply
2459      accumulate in this mode/signedness combination, otherwise
2460      this transformation is likely to pessimize code.  */
2461   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2462   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2463 						  from_mode, 0, &actual_mode);
2464 
2465   if (handler == CODE_FOR_nothing)
2466     return false;
2467 
2468   /* Ensure that the inputs to the handler are in the correct precision
2469      for the opcode.  This will be the full mode size.  */
2470   actual_precision = GET_MODE_PRECISION (actual_mode);
2471   if (actual_precision != TYPE_PRECISION (type1)
2472       || from_unsigned1 != TYPE_UNSIGNED (type1))
2473     mult_rhs1 = build_and_insert_cast (gsi, loc,
2474 				       build_nonstandard_integer_type
2475 				         (actual_precision, from_unsigned1),
2476 				       mult_rhs1);
2477   if (actual_precision != TYPE_PRECISION (type2)
2478       || from_unsigned2 != TYPE_UNSIGNED (type2))
2479     mult_rhs2 = build_and_insert_cast (gsi, loc,
2480 				       build_nonstandard_integer_type
2481 					 (actual_precision, from_unsigned2),
2482 				       mult_rhs2);
2483 
2484   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2485     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2486 
2487   /* Handle constants.  */
2488   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2489     mult_rhs1 = fold_convert (type1, mult_rhs1);
2490   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2491     mult_rhs2 = fold_convert (type2, mult_rhs2);
2492 
2493   gimple_assign_set_rhs_with_ops_1 (gsi, wmult_code, mult_rhs1, mult_rhs2,
2494 				    add_rhs);
2495   update_stmt (gsi_stmt (*gsi));
2496   widen_mul_stats.maccs_inserted++;
2497   return true;
2498 }
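
/* A sketch of the intended result (illustrative, assumed SSA names):
   an addition fed by a (possibly widening) multiplication, e.g.

	_1 = a_2(D) w* b_3(D);
	acc_4 = _1 + c_5(D);

   is rewritten in place into a single WIDEN_MULT_PLUS_EXPR statement
   that multiplies a_2 and b_3 and accumulates into c_5, provided the
   target has a usable multiply-and-accumulate pattern for the mode
   and signedness involved.  */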
2499 
2500 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
2501    with uses in additions and subtractions to form fused multiply-add
2502    operations.  Returns true if successful and MUL_STMT should be removed.  */
2503 
2504 static bool
2505 convert_mult_to_fma (gimple mul_stmt, tree op1, tree op2)
2506 {
2507   tree mul_result = gimple_get_lhs (mul_stmt);
2508   tree type = TREE_TYPE (mul_result);
2509   gimple use_stmt, neguse_stmt, fma_stmt;
2510   use_operand_p use_p;
2511   imm_use_iterator imm_iter;
2512 
2513   if (FLOAT_TYPE_P (type)
2514       && flag_fp_contract_mode == FP_CONTRACT_OFF)
2515     return false;
2516 
2517   /* We don't want to do bitfield reduction ops.  */
2518   if (INTEGRAL_TYPE_P (type)
2519       && (TYPE_PRECISION (type)
2520 	  != GET_MODE_PRECISION (TYPE_MODE (type))))
2521     return false;
2522 
2523   /* If the target doesn't support it, don't generate it.  We assume that
2524      if fma isn't available then fms, fnma or fnms are not either.  */
2525   if (optab_handler (fma_optab, TYPE_MODE (type)) == CODE_FOR_nothing)
2526     return false;
2527 
2528   /* If the multiplication has zero uses, it is kept around probably because
2529      of -fnon-call-exceptions.  Don't optimize it away in that case;
2530      that is DCE's job.  */
2531   if (has_zero_uses (mul_result))
2532     return false;
2533 
2534   /* Make sure that the multiplication statement becomes dead after
2535      the transformation, i.e. that all uses are transformed to FMAs.
2536      This means we assume that an FMA operation has the same cost
2537      as an addition.  */
2538   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
2539     {
2540       enum tree_code use_code;
2541       tree result = mul_result;
2542       bool negate_p = false;
2543 
2544       use_stmt = USE_STMT (use_p);
2545 
2546       if (is_gimple_debug (use_stmt))
2547 	continue;
2548 
2549       /* For now restrict this operation to single basic blocks.  In theory
2550 	 we would want to support sinking the multiplication in
2551 	 m = a*b;
2552 	 if ()
2553 	   ma = m + c;
2554 	 else
2555 	   d = m;
2556 	 to form a fma in the then block and sink the multiplication to the
2557 	 else block.  */
2558       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2559 	return false;
2560 
2561       if (!is_gimple_assign (use_stmt))
2562 	return false;
2563 
2564       use_code = gimple_assign_rhs_code (use_stmt);
2565 
2566       /* A negate on the multiplication leads to FNMA.  */
2567       if (use_code == NEGATE_EXPR)
2568 	{
2569 	  ssa_op_iter iter;
2570 	  use_operand_p usep;
2571 
2572 	  result = gimple_assign_lhs (use_stmt);
2573 
2574 	  /* Make sure the negate statement becomes dead with this
2575 	     single transformation.  */
2576 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
2577 			       &use_p, &neguse_stmt))
2578 	    return false;
2579 
2580 	  /* Make sure the multiplication isn't also used on that stmt.  */
2581 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
2582 	    if (USE_FROM_PTR (usep) == mul_result)
2583 	      return false;
2584 
2585 	  /* Re-validate.  */
2586 	  use_stmt = neguse_stmt;
2587 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
2588 	    return false;
2589 	  if (!is_gimple_assign (use_stmt))
2590 	    return false;
2591 
2592 	  use_code = gimple_assign_rhs_code (use_stmt);
2593 	  negate_p = true;
2594 	}
2595 
2596       switch (use_code)
2597 	{
2598 	case MINUS_EXPR:
2599 	  if (gimple_assign_rhs2 (use_stmt) == result)
2600 	    negate_p = !negate_p;
2601 	  break;
2602 	case PLUS_EXPR:
2603 	  break;
2604 	default:
2605 	  /* FMA can only be formed from PLUS and MINUS.  */
2606 	  return false;
2607 	}
2608 
2609       /* We can't handle a * b + a * b.  */
2610       if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
2611 	return false;
2612 
2613       /* While it is possible to validate whether or not the exact form
2614 	 that we've recognized is available in the backend, the assumption
2615 	 is that the transformation is never a loss.  For instance, suppose
2616 	 the target only has the plain FMA pattern available.  Consider
2617 	 a*b-c -> fma(a,b,-c): we've exchanged MUL+SUB for FMA+NEG, which
2618 	 is still two operations.  Consider -(a*b)-c -> fma(-a,b,-c): we
2619 	 still have 3 operations, but in the FMA form the two NEGs are
2620 	 independent and could be run in parallel.  */
2621     }
2622 
2623   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2624     {
2625       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2626       enum tree_code use_code;
2627       tree addop, mulop1 = op1, result = mul_result;
2628       bool negate_p = false;
2629 
2630       if (is_gimple_debug (use_stmt))
2631 	continue;
2632 
2633       use_code = gimple_assign_rhs_code (use_stmt);
2634       if (use_code == NEGATE_EXPR)
2635 	{
2636 	  result = gimple_assign_lhs (use_stmt);
2637 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2638 	  gsi_remove (&gsi, true);
2639 	  release_defs (use_stmt);
2640 
2641 	  use_stmt = neguse_stmt;
2642 	  gsi = gsi_for_stmt (use_stmt);
2643 	  use_code = gimple_assign_rhs_code (use_stmt);
2644 	  negate_p = true;
2645 	}
2646 
2647       if (gimple_assign_rhs1 (use_stmt) == result)
2648 	{
2649 	  addop = gimple_assign_rhs2 (use_stmt);
2650 	  /* a * b - c -> a * b + (-c)  */
2651 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2652 	    addop = force_gimple_operand_gsi (&gsi,
2653 					      build1 (NEGATE_EXPR,
2654 						      type, addop),
2655 					      true, NULL_TREE, true,
2656 					      GSI_SAME_STMT);
2657 	}
2658       else
2659 	{
2660 	  addop = gimple_assign_rhs1 (use_stmt);
2661 	  /* a - b * c -> (-b) * c + a */
2662 	  if (gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2663 	    negate_p = !negate_p;
2664 	}
2665 
2666       if (negate_p)
2667 	mulop1 = force_gimple_operand_gsi (&gsi,
2668 					   build1 (NEGATE_EXPR,
2669 						   type, mulop1),
2670 					   true, NULL_TREE, true,
2671 					   GSI_SAME_STMT);
2672 
2673       fma_stmt = gimple_build_assign_with_ops (FMA_EXPR,
2674 					       gimple_assign_lhs (use_stmt),
2675 					       mulop1, op2,
2676 					       addop);
2677       gsi_replace (&gsi, fma_stmt, true);
2678       widen_mul_stats.fmas_inserted++;
2679     }
2680 
2681   return true;
2682 }
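
/* A sketch of the basic case handled above (illustrative, assumed SSA
   names): for

	tmp_1 = a_2(D) * b_3(D);
	res_4 = tmp_1 + c_5(D);

   every use of tmp_1 is rewritten into an FMA_EXPR computing
   a_2 * b_3 + c_5 as one fused operation, and the now-dead
   multiplication is removed by the caller.  A negate of the
   multiplication result, or a MINUS_EXPR use, is handled by negating
   the corresponding FMA operand, as described above.  */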
2683 
2684 /* Find integer multiplications where the operands are extended from
2685    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
2686    where appropriate.  */
2687 
2688 static unsigned int
2689 execute_optimize_widening_mul (void)
2690 {
2691   basic_block bb;
2692   bool cfg_changed = false;
2693 
2694   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
2695 
2696   FOR_EACH_BB (bb)
2697     {
2698       gimple_stmt_iterator gsi;
2699 
2700       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
2701         {
2702 	  gimple stmt = gsi_stmt (gsi);
2703 	  enum tree_code code;
2704 
2705 	  if (is_gimple_assign (stmt))
2706 	    {
2707 	      code = gimple_assign_rhs_code (stmt);
2708 	      switch (code)
2709 		{
2710 		case MULT_EXPR:
2711 		  if (!convert_mult_to_widen (stmt, &gsi)
2712 		      && convert_mult_to_fma (stmt,
2713 					      gimple_assign_rhs1 (stmt),
2714 					      gimple_assign_rhs2 (stmt)))
2715 		    {
2716 		      gsi_remove (&gsi, true);
2717 		      release_defs (stmt);
2718 		      continue;
2719 		    }
2720 		  break;
2721 
2722 		case PLUS_EXPR:
2723 		case MINUS_EXPR:
2724 		  convert_plusminus_to_widen (&gsi, stmt, code);
2725 		  break;
2726 
2727 		default:;
2728 		}
2729 	    }
2730 	  else if (is_gimple_call (stmt)
2731 		   && gimple_call_lhs (stmt))
2732 	    {
2733 	      tree fndecl = gimple_call_fndecl (stmt);
2734 	      if (fndecl
2735 		  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
2736 		{
2737 		  switch (DECL_FUNCTION_CODE (fndecl))
2738 		    {
2739 		      case BUILT_IN_POWF:
2740 		      case BUILT_IN_POW:
2741 		      case BUILT_IN_POWL:
2742 			if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
2743 			    && REAL_VALUES_EQUAL
2744 			         (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
2745 				  dconst2)
2746 			    && convert_mult_to_fma (stmt,
2747 						    gimple_call_arg (stmt, 0),
2748 						    gimple_call_arg (stmt, 0)))
2749 			  {
2750 			    unlink_stmt_vdef (stmt);
2751 			    if (gsi_remove (&gsi, true)
2752 				&& gimple_purge_dead_eh_edges (bb))
2753 			      cfg_changed = true;
2754 			    release_defs (stmt);
2755 			    continue;
2756 			  }
2757 			  break;
2758 
2759 		      default:;
2760 		    }
2761 		}
2762 	    }
2763 	  gsi_next (&gsi);
2764 	}
2765     }
2766 
2767   statistics_counter_event (cfun, "widening multiplications inserted",
2768 			    widen_mul_stats.widen_mults_inserted);
2769   statistics_counter_event (cfun, "widening maccs inserted",
2770 			    widen_mul_stats.maccs_inserted);
2771   statistics_counter_event (cfun, "fused multiply-adds inserted",
2772 			    widen_mul_stats.fmas_inserted);
2773 
2774   return cfg_changed ? TODO_cleanup_cfg : 0;
2775 }
2776 
2777 static bool
2778 gate_optimize_widening_mul (void)
2779 {
2780   return flag_expensive_optimizations && optimize;
2781 }
2782 
2783 struct gimple_opt_pass pass_optimize_widening_mul =
2784 {
2785  {
2786   GIMPLE_PASS,
2787   "widening_mul",			/* name */
2788   OPTGROUP_NONE,                        /* optinfo_flags */
2789   gate_optimize_widening_mul,		/* gate */
2790   execute_optimize_widening_mul,	/* execute */
2791   NULL,					/* sub */
2792   NULL,					/* next */
2793   0,					/* static_pass_number */
2794   TV_NONE,				/* tv_id */
2795   PROP_ssa,				/* properties_required */
2796   0,					/* properties_provided */
2797   0,					/* properties_destroyed */
2798   0,					/* todo_flags_start */
2799   TODO_verify_ssa
2800   | TODO_verify_stmts
2801   | TODO_update_ssa                     /* todo_flags_finish */
2802  }
2803 };
2804