/* Global, SSA-based optimizations using mathematical identities.
   Copyright (C) 2005-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* Currently, the only mini-pass in this file tries to CSE reciprocal
   operations.  These are common in sequences such as this one:

	modulus = sqrt(x*x + y*y + z*z);
	x = x / modulus;
	y = y / modulus;
	z = z / modulus;

   that can be optimized to

	modulus = sqrt(x*x + y*y + z*z);
	rmodulus = 1.0 / modulus;
	x = x * rmodulus;
	y = y * rmodulus;
	z = z * rmodulus;

   We do this for loop-invariant divisors, and with this pass whenever
   we notice that a division has the same divisor multiple times.

   Of course, like in PRE, we don't insert a division if a dominator
   already has one.  However, this cannot be done as an extension of
   PRE for several reasons.

   First of all, experiments have shown that the transformation is not
   always useful if there are only two divisions by the same divisor.
   This is probably because modern processors can pipeline the
   divisions; on older, in-order processors it should still be
   effective to optimize two divisions by the same number.  We make
   this threshold a param; call it N in the remainder of this comment.

   Second, if trapping math is active, we have less freedom about where
   to insert divisions: we can only do so in basic blocks that already
   contain one.  (If divisions don't trap, we can instead insert them
   elsewhere, namely in blocks that are common dominators of those
   containing the division.)

   We really don't want to compute the reciprocal unless a division will
   be found.  To do this, we won't insert the division in a basic block
   that has fewer than N divisions *post-dominating* it.

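   For example, with N == 2 and non-trapping math, an if-else whose two
   arms each contain a single division by X is left alone: neither arm
   post-dominates the other or the block defining X, so no block has
   two divisions post-dominating it.  Two divisions by X in
   straight-line code, instead, do post-dominate the point where X
   becomes available, so a single 1.0 / X is inserted there and both
   divisions become multiplications.
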
   The algorithm constructs a subset of the dominator tree, holding the
   blocks containing the divisions and their common dominators, and
   walks it twice.  The first walk is in post-order, and it annotates
   each block with the number of divisions that post-dominate it: this
   gives information on where divisions can be inserted profitably.
   The second walk is in pre-order, and it inserts divisions as explained
   above, and replaces divisions by multiplications.

   In the best case, the cost of the pass is O(n_statements).  In the
   worst case, the cost is due to creating the dominator tree subset,
   with a cost of O(n_basic_blocks ^ 2); however, this can only happen
   for n_statements / n_basic_blocks statements.  So, the amortized cost
   of creating the dominator tree subset is O(n_basic_blocks) and the
   worst-case cost of the pass is O(n_statements * n_basic_blocks).

   More practically, the cost will be small because there are few
   divisions, and they tend to be in the same basic block, so insert_bb
   is called very few times.

   If we did this using domwalk.c, an efficient implementation would have
   to work on all the variables in a single pass, because we could not
   work on just a subset of the dominator tree, as we do now, and the
   cost would also be something like O(n_statements * n_basic_blocks).
   The data structures would be more complex in order to work on all the
   variables in a single pass.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "predict.h"
#include "alloc-pool.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "gimple-pretty-print.h"
#include "alias.h"
#include "fold-const.h"
#include "gimple-fold.h"
#include "gimple-iterator.h"
#include "gimplify.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-dfa.h"
#include "tree-ssa.h"
#include "builtins.h"
#include "internal-fn.h"
#include "case-cfn-macros.h"
#include "optabs-libfuncs.h"
#include "tree-eh.h"
#include "targhooks.h"
#include "domwalk.h"

/* This structure represents one basic block that either computes a
   division, or is a common dominator for basic blocks that compute a
   division.  */
struct occurrence {
  /* The basic block represented by this structure.  */
  basic_block bb;

  /* If non-NULL, the SSA_NAME holding the definition for a reciprocal
     inserted in BB.  */
  tree recip_def;

  /* If non-NULL, the SSA_NAME holding the definition for a squared
     reciprocal inserted in BB.  */
  tree square_recip_def;

  /* If non-NULL, the GIMPLE_ASSIGN for a reciprocal computation that
     was inserted in BB.  */
  gimple *recip_def_stmt;

  /* Pointer to a list of "struct occurrence"s for blocks dominated
     by BB.  */
  struct occurrence *children;

  /* Pointer to the next "struct occurrence" in the list of blocks
     sharing a common dominator.  */
  struct occurrence *next;

  /* The number of divisions that are in BB before compute_merit.  The
     number of divisions that are in BB or post-dominate it after
     compute_merit.  */
  int num_divisions;

  /* True if the basic block has a division, false if it is a common
     dominator for basic blocks that do.  If it is false and trapping
     math is active, BB is not a candidate for inserting a reciprocal.  */
  bool bb_has_division;
};

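/* For instance, if the two arms of a conditional each contain a
   division by X, the tree built from these nodes holds one occurrence
   per arm, linked through NEXT as children of a third occurrence for
   their common dominator, the block ending in the condition.  */
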
static struct
{
  /* Number of 1.0/X ops inserted.  */
  int rdivs_inserted;

  /* Number of 1.0/FUNC ops inserted.  */
  int rfuncs_inserted;
} reciprocal_stats;

static struct
{
  /* Number of cexpi calls inserted.  */
  int inserted;
} sincos_stats;

static struct
{
  /* Number of widening multiplication ops inserted.  */
  int widen_mults_inserted;

  /* Number of integer multiply-and-accumulate ops inserted.  */
  int maccs_inserted;

  /* Number of fp fused multiply-add ops inserted.  */
  int fmas_inserted;

  /* Number of divmod calls inserted.  */
  int divmod_calls_inserted;
} widen_mul_stats;

/* The instance of "struct occurrence" representing the highest
   interesting block in the dominator tree.  */
static struct occurrence *occ_head;

/* Allocation pool for getting instances of "struct occurrence".  */
static object_allocator<occurrence> *occ_pool;



/* Allocate and return a new struct occurrence for basic block BB, whose
   children list is headed by CHILDREN.  */
static struct occurrence *
occ_new (basic_block bb, struct occurrence *children)
{
  struct occurrence *occ;

  bb->aux = occ = occ_pool->allocate ();
  memset (occ, 0, sizeof (struct occurrence));

  occ->bb = bb;
  occ->children = children;
  return occ;
}


/* Insert NEW_OCC into our subset of the dominator tree.  P_HEAD points to a
   list of "struct occurrence"s, one per basic block, having IDOM as
   their common dominator.

   We try to insert NEW_OCC as deep as possible in the tree, and we also
   insert any other block that is a common dominator for BB and one
   block already in the tree.  */
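/* For example, if the list at *P_HEAD holds only B1 and NEW_OCC is for
   a block B2, where the nearest common dominator D of B1 and B2 differs
   from both of them and from IDOM, a new occurrence is created for D
   with B1 and B2 as its children, and D then takes B1's place in the
   list.  */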

static void
insert_bb (struct occurrence *new_occ, basic_block idom,
	   struct occurrence **p_head)
{
  struct occurrence *occ, **p_occ;

  for (p_occ = p_head; (occ = *p_occ) != NULL; )
    {
      basic_block bb = new_occ->bb, occ_bb = occ->bb;
      basic_block dom = nearest_common_dominator (CDI_DOMINATORS, occ_bb, bb);
      if (dom == bb)
	{
	  /* BB dominates OCC_BB.  OCC becomes NEW_OCC's child: remove OCC
	     from its list.  */
	  *p_occ = occ->next;
	  occ->next = new_occ->children;
	  new_occ->children = occ;

	  /* Try the next block (it too may be dominated by BB).  */
	}

      else if (dom == occ_bb)
	{
	  /* OCC_BB dominates BB.  Tail recurse to look deeper.  */
	  insert_bb (new_occ, dom, &occ->children);
	  return;
	}

      else if (dom != idom)
	{
	  gcc_assert (!dom->aux);

	  /* There is a dominator between IDOM and BB, add it and make
	     two children out of NEW_OCC and OCC.  First, remove OCC from
	     its list.  */
	  *p_occ = occ->next;
	  new_occ->next = occ;
	  occ->next = NULL;

	  /* None of the previous blocks has DOM as a dominator: if we tail
	     recursed, we would reexamine them uselessly.  Just switch BB with
	     DOM, and go on looking for blocks dominated by DOM.  */
	  new_occ = occ_new (dom, new_occ);
	}

      else
	{
	  /* Nothing special, go on with the next element.  */
	  p_occ = &occ->next;
	}
    }

  /* No place was found as a child of IDOM.  Make BB a sibling of IDOM.  */
  new_occ->next = *p_head;
  *p_head = new_occ;
}

/* Register that we found a division in BB.
   IMPORTANCE is a measure of how much weighting to give
   that division.  Use IMPORTANCE = 2 to register a single
   division.  If the division is going to be found multiple
   times use 1 (as it is with squares).  */
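/* (A division by X * X is discovered twice by the use walk in
   execute_cse_reciprocals_1, so registering it with IMPORTANCE == 1
   still contributes 2 to NUM_DIVISIONS, just like a plain division
   by X registered once with IMPORTANCE == 2.)  */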

static inline void
register_division_in (basic_block bb, int importance)
{
  struct occurrence *occ;

  occ = (struct occurrence *) bb->aux;
  if (!occ)
    {
      occ = occ_new (bb, NULL);
      insert_bb (occ, ENTRY_BLOCK_PTR_FOR_FN (cfun), &occ_head);
    }

  occ->bb_has_division = true;
  occ->num_divisions += importance;
}


/* Compute the number of divisions that postdominate each block in OCC and
   its children.  */

static void
compute_merit (struct occurrence *occ)
{
  struct occurrence *occ_child;
  basic_block dom = occ->bb;

  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    {
      basic_block bb;
      if (occ_child->children)
	compute_merit (occ_child);

      if (flag_exceptions)
	bb = single_noncomplex_succ (dom);
      else
	bb = dom;

      if (dominated_by_p (CDI_POST_DOMINATORS, bb, occ_child->bb))
	occ->num_divisions += occ_child->num_divisions;
    }
}


/* Return whether USE_STMT is a floating-point division by DEF.  */
static inline bool
is_division_by (gimple *use_stmt, tree def)
{
  return is_gimple_assign (use_stmt)
	 && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
	 && gimple_assign_rhs2 (use_stmt) == def
	 /* Do not recognize x / x as a valid division, as we are getting
	    confused later by replacing all immediate uses of x in such
	    a stmt.  */
	 && gimple_assign_rhs1 (use_stmt) != def
	 && !stmt_can_throw_internal (cfun, use_stmt);
}

/* Return TRUE if USE_STMT is a multiplication of DEF by A.  */
static inline bool
is_mult_by (gimple *use_stmt, tree def, tree a)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == MULT_EXPR)
    {
      tree op0 = gimple_assign_rhs1 (use_stmt);
      tree op1 = gimple_assign_rhs2 (use_stmt);

      return (op0 == def && op1 == a)
	      || (op0 == a && op1 == def);
    }
  return false;
}

/* Return whether USE_STMT is DEF * DEF.  */
static inline bool
is_square_of (gimple *use_stmt, tree def)
{
  return is_mult_by (use_stmt, def, def);
}

/* Return whether USE_STMT is a floating-point division by
   DEF * DEF.  */
static inline bool
is_division_by_square (gimple *use_stmt, tree def)
{
  if (gimple_code (use_stmt) == GIMPLE_ASSIGN
      && gimple_assign_rhs_code (use_stmt) == RDIV_EXPR
      && gimple_assign_rhs1 (use_stmt) != gimple_assign_rhs2 (use_stmt)
      && !stmt_can_throw_internal (cfun, use_stmt))
    {
      tree denominator = gimple_assign_rhs2 (use_stmt);
      if (TREE_CODE (denominator) == SSA_NAME)
	return is_square_of (SSA_NAME_DEF_STMT (denominator), def);
    }
  return false;
}

/* Walk the subset of the dominator tree rooted at OCC, setting the
   RECIP_DEF field to a definition of 1.0 / DEF that can be used in
   the given basic block.  The field may be left NULL, of course,
   if it is not possible or profitable to do the optimization.

   DEF_BSI is an iterator pointing at the statement defining DEF.
   If RECIP_DEF is set, a dominator already has a computation that can
   be used.

   If SHOULD_INSERT_SQUARE_RECIP is set, then this also inserts
   the square of the reciprocal immediately after the definition
   of the reciprocal.  */

static void
insert_reciprocals (gimple_stmt_iterator *def_gsi, struct occurrence *occ,
		    tree def, tree recip_def, tree square_recip_def,
		    int should_insert_square_recip, int threshold)
{
  tree type;
  gassign *new_stmt, *new_square_stmt;
  gimple_stmt_iterator gsi;
  struct occurrence *occ_child;

  if (!recip_def
      && (occ->bb_has_division || !flag_trapping_math)
      /* Divide by two as all divisions are counted twice in
	 the costing loop.  */
      && occ->num_divisions / 2 >= threshold)
    {
      /* Make a variable with the replacement and substitute it.  */
      type = TREE_TYPE (def);
      recip_def = create_tmp_reg (type, "reciptmp");
      new_stmt = gimple_build_assign (recip_def, RDIV_EXPR,
				      build_one_cst (type), def);

      if (should_insert_square_recip)
	{
	  square_recip_def = create_tmp_reg (type, "powmult_reciptmp");
	  new_square_stmt = gimple_build_assign (square_recip_def, MULT_EXPR,
						 recip_def, recip_def);
	}

      if (occ->bb_has_division)
	{
	  /* Case 1: insert before an existing division.  */
	  gsi = gsi_after_labels (occ->bb);
	  while (!gsi_end_p (gsi)
		 && (!is_division_by (gsi_stmt (gsi), def))
		 && (!is_division_by_square (gsi_stmt (gsi), def)))
	    gsi_next (&gsi);

	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}
      else if (def_gsi && occ->bb == def_gsi->bb)
	{
	  /* Case 2: insert right after the definition.  Note that this will
	     never happen if the definition statement can throw, because in
	     that case the sole successor of the statement's basic block will
	     dominate all the uses as well.  */
	  gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_after (def_gsi, new_square_stmt, GSI_NEW_STMT);
	}
      else
	{
	  /* Case 3: insert in a basic block not containing defs/uses.  */
	  gsi = gsi_after_labels (occ->bb);
	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
	  if (should_insert_square_recip)
	    gsi_insert_before (&gsi, new_square_stmt, GSI_SAME_STMT);
	}

      reciprocal_stats.rdivs_inserted++;

      occ->recip_def_stmt = new_stmt;
    }

  occ->recip_def = recip_def;
  occ->square_recip_def = square_recip_def;
  for (occ_child = occ->children; occ_child; occ_child = occ_child->next)
    insert_reciprocals (def_gsi, occ_child, def, recip_def,
			square_recip_def, should_insert_square_recip,
			threshold);
}

/* Replace occurrences of expr / (x * x) with expr * ((1 / x) * (1 / x)).
   Take as argument the use for (x * x).  */
static inline void
replace_reciprocal_squares (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb) && occ->square_recip_def
      && occ->recip_def)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      gimple_assign_set_rhs2 (use_stmt, occ->square_recip_def);
      SET_USE (use_p, occ->square_recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Replace the division at USE_P with a multiplication by the reciprocal, if
   possible.  */

static inline void
replace_reciprocal (use_operand_p use_p)
{
  gimple *use_stmt = USE_STMT (use_p);
  basic_block bb = gimple_bb (use_stmt);
  struct occurrence *occ = (struct occurrence *) bb->aux;

  if (optimize_bb_for_speed_p (bb)
      && occ->recip_def && use_stmt != occ->recip_def_stmt)
    {
      gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
      gimple_assign_set_rhs_code (use_stmt, MULT_EXPR);
      SET_USE (use_p, occ->recip_def);
      fold_stmt_inplace (&gsi);
      update_stmt (use_stmt);
    }
}


/* Free OCC and return one more "struct occurrence" to be freed.  */

static struct occurrence *
free_bb (struct occurrence *occ)
{
  struct occurrence *child, *next;

  /* First get the two pointers hanging off OCC.  */
  next = occ->next;
  child = occ->children;
  occ->bb->aux = NULL;
  occ_pool->remove (occ);

  /* Now ensure that we don't recurse unless it is necessary.  */
  if (!child)
    return next;
  else
    {
      while (next)
	next = free_bb (next);

      return child;
    }
}

/* Transform sequences like
   t = sqrt (a)
   x = 1.0 / t;
   r1 = x * x;
   r2 = a * x;
   into:
   t = sqrt (a)
   r1 = 1.0 / a;
   r2 = t;
   x = r1 * r2;
   depending on the uses of x, r1, r2.  This removes one multiplication and
   allows the sqrt and division operations to execute in parallel.
   DEF_GSI is the gsi of the initial division by sqrt that defines
   DEF (x in the example above).  */

static void
optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def)
{
  gimple *use_stmt;
  imm_use_iterator use_iter;
  gimple *stmt = gsi_stmt (*def_gsi);
  tree x = def;
  tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt);
  tree div_rhs1 = gimple_assign_rhs1 (stmt);

  if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME
      || TREE_CODE (div_rhs1) != REAL_CST
      || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1))
    return;

  gcall *sqrt_stmt
    = dyn_cast <gcall *> (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name));

  if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt))
    return;

  switch (gimple_call_combined_fn (sqrt_stmt))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      break;

    default:
      return;
    }
  tree a = gimple_call_arg (sqrt_stmt, 0);

  /* We have 'a' and 'x'.  Now analyze the uses of 'x'.  */

  /* Statements that use x in x * x.  */
  auto_vec<gimple *> sqr_stmts;
  /* Statements that use x in a * x.  */
  auto_vec<gimple *> mult_stmts;
  bool has_other_use = false;
  bool mult_on_main_path = false;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x)
    {
      if (is_gimple_debug (use_stmt))
	continue;
      if (is_square_of (use_stmt, x))
	{
	  sqr_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else if (is_mult_by (use_stmt, x, a))
	{
	  mult_stmts.safe_push (use_stmt);
	  if (gimple_bb (use_stmt) == gimple_bb (stmt))
	    mult_on_main_path = true;
	}
      else
	has_other_use = true;
    }

  /* In the x * x and a * x cases we just rewire stmt operands or
     remove multiplications.  In the has_other_use case we introduce
     a multiplication so make sure we don't introduce a multiplication
     on a path where there was none.  */
  if (has_other_use && !mult_on_main_path)
    return;

  if (sqr_stmts.is_empty () && mult_stmts.is_empty ())
    return;

  /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want
     to be able to compose it from the sqr and mult cases.  */
  if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ()))
    return;

  if (dump_file)
    {
      fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n");
      print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE);
      print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
      fprintf (dump_file, "\n");
    }

  bool delete_div = !has_other_use;
  tree sqr_ssa_name = NULL_TREE;
  if (!sqr_stmts.is_empty ())
    {
      /* r1 = x * x.  Transform the original
	 x = 1.0 / t
	 into
	 tmp1 = 1.0 / a
	 r1 = tmp1.  */

      sqr_ssa_name
	= make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr");

      if (dump_file)
	{
	  fprintf (dump_file, "Replacing original division\n");
	  print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);
	  fprintf (dump_file, "with new division\n");
	}
      stmt
	= gimple_build_assign (sqr_ssa_name, gimple_assign_rhs_code (stmt),
			       gimple_assign_rhs1 (stmt), a);
      gsi_insert_before (def_gsi, stmt, GSI_SAME_STMT);
      gsi_remove (def_gsi, true);
      *def_gsi = gsi_for_stmt (stmt);
      fold_stmt_inplace (def_gsi);
      update_stmt (stmt);

      if (dump_file)
	print_gimple_stmt (dump_file, stmt, 0, TDF_NONE);

      delete_div = false;
      gimple *sqr_stmt;
      unsigned int i;
      FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt);
	  gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name);
	  update_stmt (sqr_stmt);
	}
    }
  if (!mult_stmts.is_empty ())
    {
      /* r2 = a * x.  Transform this into:
	 r2 = t (The original sqrt (a)).  */
      unsigned int i;
      gimple *mult_stmt = NULL;
      FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt)
	{
	  gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt);

	  if (dump_file)
	    {
	      fprintf (dump_file, "Replacing squaring multiplication\n");
	      print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	      fprintf (dump_file, "with assignment\n");
	    }
	  gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name);
	  fold_stmt_inplace (&gsi2);
	  update_stmt (mult_stmt);
	  if (dump_file)
	    print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE);
	}
    }

  if (has_other_use)
    {
      /* Using the two temporaries tmp1, tmp2 from above
	 the original x is now:
	 x = tmp1 * tmp2.  */
      gcc_assert (orig_sqrt_ssa_name);
      gcc_assert (sqr_ssa_name);

      gimple *new_stmt
	= gimple_build_assign (x, MULT_EXPR,
			       orig_sqrt_ssa_name, sqr_ssa_name);
      gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT);
      update_stmt (stmt);
    }
  else if (delete_div)
    {
      /* Remove the original division.  */
      gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt);
      gsi_remove (&gsi2, true);
      release_defs (stmt);
    }
  else
    release_ssa_name (x);
}

/* Look for floating-point divisions among DEF's uses, and try to
   replace them by multiplications with the reciprocal.  Add
   as many statements computing the reciprocal as needed.

   DEF must be a GIMPLE register of a floating-point type.  */

static void
execute_cse_reciprocals_1 (gimple_stmt_iterator *def_gsi, tree def)
{
  use_operand_p use_p, square_use_p;
  imm_use_iterator use_iter, square_use_iter;
  tree square_def;
  struct occurrence *occ;
  int count = 0;
  int threshold;
  int square_recip_count = 0;
  int sqrt_recip_count = 0;

  gcc_assert (FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME);
  threshold = targetm.min_divisions_for_recip_mul (TYPE_MODE (TREE_TYPE (def)));

  /* If DEF is a square (x * x), count the number of divisions by x.
     If there are more divisions by x than by (DEF * DEF), prefer to optimize
     the reciprocal of x instead of DEF.  This improves cases like:
       def = x * x
       t0 = a / def
       t1 = b / def
       t2 = c / x
     Reciprocal optimization of x results in 1 division rather than 2 or 3.  */
  gimple *def_stmt = SSA_NAME_DEF_STMT (def);

  if (is_gimple_assign (def_stmt)
      && gimple_assign_rhs_code (def_stmt) == MULT_EXPR
      && TREE_CODE (gimple_assign_rhs1 (def_stmt)) == SSA_NAME
      && gimple_assign_rhs1 (def_stmt) == gimple_assign_rhs2 (def_stmt))
    {
      tree op0 = gimple_assign_rhs1 (def_stmt);

      FOR_EACH_IMM_USE_FAST (use_p, use_iter, op0)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_division_by (use_stmt, op0))
	    sqrt_recip_count++;
	}
    }

  FOR_EACH_IMM_USE_FAST (use_p, use_iter, def)
    {
      gimple *use_stmt = USE_STMT (use_p);
      if (is_division_by (use_stmt, def))
	{
	  register_division_in (gimple_bb (use_stmt), 2);
	  count++;
	}

      if (is_square_of (use_stmt, def))
	{
	  square_def = gimple_assign_lhs (use_stmt);
	  FOR_EACH_IMM_USE_FAST (square_use_p, square_use_iter, square_def)
	    {
	      gimple *square_use_stmt = USE_STMT (square_use_p);
	      if (is_division_by (square_use_stmt, square_def))
		{
		  /* This is executed twice for each division by a square.  */
		  register_division_in (gimple_bb (square_use_stmt), 1);
		  square_recip_count++;
		}
	    }
	}
    }

  /* Square reciprocals were counted twice above.  */
  square_recip_count /= 2;

  /* If it is more profitable to optimize 1 / x, don't optimize 1 / (x * x).  */
  if (sqrt_recip_count > square_recip_count)
    goto out;

  /* Do the expensive part only if we can hope to optimize something.  */
  if (count + square_recip_count >= threshold && count >= 1)
    {
      gimple *use_stmt;
      for (occ = occ_head; occ; occ = occ->next)
	{
	  compute_merit (occ);
	  insert_reciprocals (def_gsi, occ, def, NULL, NULL,
			      square_recip_count, threshold);
	}

      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
	{
	  if (is_division_by (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		replace_reciprocal (use_p);
	    }
	  else if (square_recip_count > 0 && is_square_of (use_stmt, def))
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, use_iter)
		{
		  /* Find all uses of the square that are divisions and
		     replace them by multiplications with the inverse.  */
		  imm_use_iterator square_iterator;
		  gimple *powmult_use_stmt = USE_STMT (use_p);
		  tree powmult_def_name = gimple_assign_lhs (powmult_use_stmt);

		  FOR_EACH_IMM_USE_STMT (powmult_use_stmt,
					 square_iterator, powmult_def_name)
		    FOR_EACH_IMM_USE_ON_STMT (square_use_p, square_iterator)
		      {
			gimple *powmult_use_stmt = USE_STMT (square_use_p);
			if (is_division_by (powmult_use_stmt, powmult_def_name))
			  replace_reciprocal_squares (square_use_p);
		      }
		}
	    }
	}
    }

out:
  for (occ = occ_head; occ; )
    occ = free_bb (occ);

  occ_head = NULL;
}

/* Return an internal function that implements the reciprocal of CALL,
   or IFN_LAST if there is no such function that the target supports.  */

internal_fn
internal_fn_reciprocal (gcall *call)
{
  internal_fn ifn;

  switch (gimple_call_combined_fn (call))
    {
    CASE_CFN_SQRT:
    CASE_CFN_SQRT_FN:
      ifn = IFN_RSQRT;
      break;

    default:
      return IFN_LAST;
    }

  tree_pair types = direct_internal_fn_types (ifn, call);
  if (!direct_internal_fn_supported_p (ifn, types, OPTIMIZE_FOR_SPEED))
    return IFN_LAST;

  return ifn;
}

/* Go through all the floating-point SSA_NAMEs, and call
   execute_cse_reciprocals_1 on each of them.  */
namespace {

const pass_data pass_data_cse_reciprocals =
{
  GIMPLE_PASS, /* type */
  "recip", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_TREE_RECIP, /* tv_id */
  PROP_ssa, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_update_ssa, /* todo_flags_finish */
};

class pass_cse_reciprocals : public gimple_opt_pass
{
public:
  pass_cse_reciprocals (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_cse_reciprocals, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return optimize && flag_reciprocal_math; }
  virtual unsigned int execute (function *);

}; // class pass_cse_reciprocals

unsigned int
pass_cse_reciprocals::execute (function *fun)
{
  basic_block bb;
  tree arg;

  occ_pool = new object_allocator<occurrence> ("dominators for recip");

  memset (&reciprocal_stats, 0, sizeof (reciprocal_stats));
  calculate_dominance_info (CDI_DOMINATORS);
  calculate_dominance_info (CDI_POST_DOMINATORS);

  if (flag_checking)
    FOR_EACH_BB_FN (bb, fun)
      gcc_assert (!bb->aux);

  for (arg = DECL_ARGUMENTS (fun->decl); arg; arg = DECL_CHAIN (arg))
    if (FLOAT_TYPE_P (TREE_TYPE (arg))
	&& is_gimple_reg (arg))
      {
	tree name = ssa_default_def (fun, arg);
	if (name)
	  execute_cse_reciprocals_1 (NULL, name);
      }

  FOR_EACH_BB_FN (bb, fun)
    {
      tree def;

      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  def = PHI_RESULT (phi);
	  if (! virtual_operand_p (def)
	      && FLOAT_TYPE_P (TREE_TYPE (def)))
	    execute_cse_reciprocals_1 (NULL, def);
	}

      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (gimple_has_lhs (stmt)
	      && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL
	      && FLOAT_TYPE_P (TREE_TYPE (def))
	      && TREE_CODE (def) == SSA_NAME)
	    {
	      execute_cse_reciprocals_1 (&gsi, def);
	      stmt = gsi_stmt (gsi);
	      if (flag_unsafe_math_optimizations
		  && is_gimple_assign (stmt)
		  && gimple_assign_lhs (stmt) == def
		  && !stmt_can_throw_internal (cfun, stmt)
		  && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
		optimize_recip_sqrt (&gsi, def);
	    }
	}

      if (optimize_bb_for_size_p (bb))
	continue;

      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).  */
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  if (is_gimple_assign (stmt)
	      && gimple_assign_rhs_code (stmt) == RDIV_EXPR)
	    {
	      tree arg1 = gimple_assign_rhs2 (stmt);
	      gimple *stmt1;

	      if (TREE_CODE (arg1) != SSA_NAME)
		continue;

	      stmt1 = SSA_NAME_DEF_STMT (arg1);

	      if (is_gimple_call (stmt1)
		  && gimple_call_lhs (stmt1))
		{
		  bool fail;
		  imm_use_iterator ui;
		  use_operand_p use_p;
		  tree fndecl = NULL_TREE;

		  gcall *call = as_a <gcall *> (stmt1);
		  internal_fn ifn = internal_fn_reciprocal (call);
		  if (ifn == IFN_LAST)
		    {
		      fndecl = gimple_call_fndecl (call);
		      if (!fndecl
			  || !fndecl_built_in_p (fndecl, BUILT_IN_MD))
			continue;
		      fndecl = targetm.builtin_reciprocal (fndecl);
		      if (!fndecl)
			continue;
		    }

		  /* Check that all uses of the SSA name are divisions,
		     otherwise replacing the defining statement will do
		     the wrong thing.  */
		  fail = false;
		  FOR_EACH_IMM_USE_FAST (use_p, ui, arg1)
		    {
		      gimple *stmt2 = USE_STMT (use_p);
		      if (is_gimple_debug (stmt2))
			continue;
		      if (!is_gimple_assign (stmt2)
			  || gimple_assign_rhs_code (stmt2) != RDIV_EXPR
			  || gimple_assign_rhs1 (stmt2) == arg1
			  || gimple_assign_rhs2 (stmt2) != arg1)
			{
			  fail = true;
			  break;
			}
		    }
		  if (fail)
		    continue;

		  gimple_replace_ssa_lhs (call, arg1);
		  if (gimple_call_internal_p (call) != (ifn != IFN_LAST))
		    {
		      auto_vec<tree, 4> args;
		      for (unsigned int i = 0;
			   i < gimple_call_num_args (call); i++)
			args.safe_push (gimple_call_arg (call, i));
		      gcall *stmt2;
		      if (ifn == IFN_LAST)
			stmt2 = gimple_build_call_vec (fndecl, args);
		      else
			stmt2 = gimple_build_call_internal_vec (ifn, args);
		      gimple_call_set_lhs (stmt2, arg1);
		      gimple_move_vops (stmt2, call);
		      gimple_call_set_nothrow (stmt2,
					       gimple_call_nothrow_p (call));
		      gimple_stmt_iterator gsi2 = gsi_for_stmt (call);
		      gsi_replace (&gsi2, stmt2, true);
		    }
		  else
		    {
		      if (ifn == IFN_LAST)
			gimple_call_set_fndecl (call, fndecl);
		      else
			gimple_call_set_internal_fn (call, ifn);
		      update_stmt (call);
		    }
		  reciprocal_stats.rfuncs_inserted++;

		  FOR_EACH_IMM_USE_STMT (stmt, ui, arg1)
		    {
		      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
		      gimple_assign_set_rhs_code (stmt, MULT_EXPR);
		      fold_stmt_inplace (&gsi);
		      update_stmt (stmt);
		    }
		}
	    }
	}
    }

  statistics_counter_event (fun, "reciprocal divs inserted",
			    reciprocal_stats.rdivs_inserted);
  statistics_counter_event (fun, "reciprocal functions inserted",
			    reciprocal_stats.rfuncs_inserted);

  free_dominance_info (CDI_DOMINATORS);
  free_dominance_info (CDI_POST_DOMINATORS);
  delete occ_pool;
  return 0;
}

} // anon namespace

gimple_opt_pass *
make_pass_cse_reciprocals (gcc::context *ctxt)
{
  return new pass_cse_reciprocals (ctxt);
}

/* Records an occurrence at statement USE_STMT in the vector of
   statements STMTS if it is dominated by *TOP_BB or dominates it or
   this basic block is not yet initialized.  Returns true if the
   occurrence was pushed on the vector.  Adjusts *TOP_BB to be the
   basic block dominating all statements in the vector.  */

static bool
maybe_record_sincos (vec<gimple *> *stmts,
		     basic_block *top_bb, gimple *use_stmt)
{
  basic_block use_bb = gimple_bb (use_stmt);
  if (*top_bb
      && (*top_bb == use_bb
	  || dominated_by_p (CDI_DOMINATORS, use_bb, *top_bb)))
    stmts->safe_push (use_stmt);
  else if (!*top_bb
	   || dominated_by_p (CDI_DOMINATORS, *top_bb, use_bb))
    {
      stmts->safe_push (use_stmt);
      *top_bb = use_bb;
    }
  else
    return false;

  return true;
}

/* Look for sin, cos and cexpi calls with the same argument NAME and
   create a single call to cexpi CSEing the result in this case.
   We first walk over all immediate uses of the argument, collecting
   statements that we can CSE in a vector, and in a second pass replace
   the statement rhs with a REALPART or IMAGPART expression on the
   result of the cexpi call we insert before the use statement that
   dominates all other candidates.  */
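/* For instance (with illustrative SSA names),
     c = cos (a);
     s = sin (a);
   becomes
     sincostmp = cexpi (a);
     c = REALPART_EXPR <sincostmp>;
     s = IMAGPART_EXPR <sincostmp>;  */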

static bool
execute_cse_sincos_1 (tree name)
{
  gimple_stmt_iterator gsi;
  imm_use_iterator use_iter;
  tree fndecl, res, type;
  gimple *def_stmt, *use_stmt, *stmt;
  int seen_cos = 0, seen_sin = 0, seen_cexpi = 0;
  auto_vec<gimple *> stmts;
  basic_block top_bb = NULL;
  int i;
  bool cfg_changed = false;

  type = TREE_TYPE (name);
  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, name)
    {
      if (gimple_code (use_stmt) != GIMPLE_CALL
	  || !gimple_call_lhs (use_stmt))
	continue;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  seen_cos |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_SIN:
	  seen_sin |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	CASE_CFN_CEXPI:
	  seen_cexpi |= maybe_record_sincos (&stmts, &top_bb, use_stmt) ? 1 : 0;
	  break;

	default:;
	}
    }

  if (seen_cos + seen_sin + seen_cexpi <= 1)
    return false;

  /* Simply insert cexpi at the beginning of top_bb but not earlier than
     the name def statement.  */
  fndecl = mathfn_built_in (type, BUILT_IN_CEXPI);
  if (!fndecl)
    return false;
  stmt = gimple_build_call (fndecl, 1, name);
  res = make_temp_ssa_name (TREE_TYPE (TREE_TYPE (fndecl)), stmt, "sincostmp");
  gimple_call_set_lhs (stmt, res);

  def_stmt = SSA_NAME_DEF_STMT (name);
  if (!SSA_NAME_IS_DEFAULT_DEF (name)
      && gimple_code (def_stmt) != GIMPLE_PHI
      && gimple_bb (def_stmt) == top_bb)
    {
      gsi = gsi_for_stmt (def_stmt);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
    }
  else
    {
      gsi = gsi_after_labels (top_bb);
      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
    }
  sincos_stats.inserted++;

  /* And adjust the recorded old call sites.  */
  for (i = 0; stmts.iterate (i, &use_stmt); ++i)
    {
      tree rhs = NULL;

      switch (gimple_call_combined_fn (use_stmt))
	{
	CASE_CFN_COS:
	  rhs = fold_build1 (REALPART_EXPR, type, res);
	  break;

	CASE_CFN_SIN:
	  rhs = fold_build1 (IMAGPART_EXPR, type, res);
	  break;

	CASE_CFN_CEXPI:
	  rhs = res;
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Replace call with a copy.  */
      stmt = gimple_build_assign (gimple_call_lhs (use_stmt), rhs);

      gsi = gsi_for_stmt (use_stmt);
      gsi_replace (&gsi, stmt, true);
      if (gimple_purge_dead_eh_edges (gimple_bb (stmt)))
	cfg_changed = true;
    }

  return cfg_changed;
}

/* To evaluate powi(x,n), the floating point value x raised to the
   constant integer exponent n, we use a hybrid algorithm that
   combines the "window method" with look-up tables.  For an
   introduction to exponentiation algorithms and "addition chains",
   see section 4.6.3, "Evaluation of Powers" of Donald E. Knuth,
   "Seminumerical Algorithms", Vol. 2, "The Art of Computer Programming",
   3rd Edition, 1998, and Daniel M. Gordon, "A Survey of Fast Exponentiation
   Methods", Journal of Algorithms, Vol. 27, pp. 129-146, 1998.  */

/* Provide a default value for POWI_MAX_MULTS, the maximum number of
   multiplications to inline before calling the system library's pow
   function.  powi(x,n) requires at worst 2*bits(n)-2 multiplications
   (binary exponentiation needs at most bits(n)-1 squarings plus at
   most bits(n)-1 multiplies by x), so this default never requires
   calling pow, powf or powl.  */

#ifndef POWI_MAX_MULTS
#define POWI_MAX_MULTS  (2*HOST_BITS_PER_WIDE_INT-2)
#endif

/* The size of the "optimal power tree" lookup table.  All
   exponents less than this value are simply looked up in the
   powi_table below.  This threshold is also used to size the
   cache of pseudo registers that hold intermediate results.  */
#define POWI_TABLE_SIZE 256

/* The size, in bits, of the window used in the "window method"
   exponentiation algorithm.  This is equivalent to a radix of
   (1<<POWI_WINDOW_SIZE) in the corresponding "m-ary method".  */
#define POWI_WINDOW_SIZE 3

/* The following table is an efficient representation of an
   "optimal power tree".  For each value, i, the corresponding
   value, j, in the table states that an optimal evaluation
   sequence for calculating pow(x,i) can be found by evaluating
   pow(x,j)*pow(x,i-j).  An optimal power tree for the first
   100 integers is given in Knuth's "Seminumerical algorithms".  */

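/* For example, powi_table[7] == 4, so x**7 is evaluated as
   x**3 * x**4.  With the shared cache of intermediate results this
   takes four multiplications in total:
     x2 = x * x;  x3 = x * x2;  x4 = x2 * x2;  x7 = x3 * x4;
   and indeed powi_cost (7) == 4 below.  */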
static const unsigned char powi_table[POWI_TABLE_SIZE] =
  {
      0,   1,   1,   2,   2,   3,   3,   4,  /*   0 -   7 */
      4,   6,   5,   6,   6,  10,   7,   9,  /*   8 -  15 */
      8,  16,   9,  16,  10,  12,  11,  13,  /*  16 -  23 */
     12,  17,  13,  18,  14,  24,  15,  26,  /*  24 -  31 */
     16,  17,  17,  19,  18,  33,  19,  26,  /*  32 -  39 */
     20,  25,  21,  40,  22,  27,  23,  44,  /*  40 -  47 */
     24,  32,  25,  34,  26,  29,  27,  44,  /*  48 -  55 */
     28,  31,  29,  34,  30,  60,  31,  36,  /*  56 -  63 */
     32,  64,  33,  34,  34,  46,  35,  37,  /*  64 -  71 */
     36,  65,  37,  50,  38,  48,  39,  69,  /*  72 -  79 */
     40,  49,  41,  43,  42,  51,  43,  58,  /*  80 -  87 */
     44,  64,  45,  47,  46,  59,  47,  76,  /*  88 -  95 */
     48,  65,  49,  66,  50,  67,  51,  66,  /*  96 - 103 */
     52,  70,  53,  74,  54, 104,  55,  74,  /* 104 - 111 */
     56,  64,  57,  69,  58,  78,  59,  68,  /* 112 - 119 */
     60,  61,  61,  80,  62,  75,  63,  68,  /* 120 - 127 */
     64,  65,  65, 128,  66, 129,  67,  90,  /* 128 - 135 */
     68,  73,  69, 131,  70,  94,  71,  88,  /* 136 - 143 */
     72, 128,  73,  98,  74, 132,  75, 121,  /* 144 - 151 */
     76, 102,  77, 124,  78, 132,  79, 106,  /* 152 - 159 */
     80,  97,  81, 160,  82,  99,  83, 134,  /* 160 - 167 */
     84,  86,  85,  95,  86, 160,  87, 100,  /* 168 - 175 */
     88, 113,  89,  98,  90, 107,  91, 122,  /* 176 - 183 */
     92, 111,  93, 102,  94, 126,  95, 150,  /* 184 - 191 */
     96, 128,  97, 130,  98, 133,  99, 195,  /* 192 - 199 */
    100, 128, 101, 123, 102, 164, 103, 138,  /* 200 - 207 */
    104, 145, 105, 146, 106, 109, 107, 149,  /* 208 - 215 */
    108, 200, 109, 146, 110, 170, 111, 157,  /* 216 - 223 */
    112, 128, 113, 130, 114, 182, 115, 132,  /* 224 - 231 */
    116, 200, 117, 132, 118, 158, 119, 206,  /* 232 - 239 */
    120, 240, 121, 162, 122, 147, 123, 152,  /* 240 - 247 */
    124, 166, 125, 214, 126, 138, 127, 153,  /* 248 - 255 */
  };


/* Return the number of multiplications required to calculate
   powi(x,n) where n is less than POWI_TABLE_SIZE.  This is a
   subroutine of powi_cost.  CACHE is an array indicating
   which exponents have already been calculated.  */

static int
powi_lookup_cost (unsigned HOST_WIDE_INT n, bool *cache)
{
  /* If we've already calculated this exponent, then this evaluation
     doesn't require any additional multiplications.  */
  if (cache[n])
    return 0;

  cache[n] = true;
  return powi_lookup_cost (n - powi_table[n], cache)
	 + powi_lookup_cost (powi_table[n], cache) + 1;
}

/* Return the number of multiplications required to calculate
   powi(x,n) for an arbitrary x, given the exponent N.  This
   function needs to be kept in sync with powi_as_mults below.  */

static int
powi_cost (HOST_WIDE_INT n)
{
  bool cache[POWI_TABLE_SIZE];
  unsigned HOST_WIDE_INT digit;
  unsigned HOST_WIDE_INT val;
  int result;

  if (n == 0)
    return 0;

  /* Ignore the reciprocal when calculating the cost.  */
  val = absu_hwi (n);

  /* Initialize the exponent cache.  */
  memset (cache, 0, POWI_TABLE_SIZE * sizeof (bool));
  cache[1] = true;

  result = 0;

  while (val >= POWI_TABLE_SIZE)
    {
      if (val & 1)
	{
	  digit = val & ((1 << POWI_WINDOW_SIZE) - 1);
	  result += powi_lookup_cost (digit, cache)
		    + POWI_WINDOW_SIZE + 1;
	  val >>= POWI_WINDOW_SIZE;
	}
      else
	{
	  val >>= 1;
	  result++;
	}
    }

  return result + powi_lookup_cost (val, cache);
}

/* Recursive subroutine of powi_as_mults.  This function takes the
   array, CACHE, of already calculated exponents and an exponent N and
   returns a tree that corresponds to CACHE[1]**N, with type TYPE.  */

static tree
powi_as_mults_1 (gimple_stmt_iterator *gsi, location_t loc, tree type,
		 unsigned HOST_WIDE_INT n, tree *cache)
{
  tree op0, op1, ssa_target;
  unsigned HOST_WIDE_INT digit;
  gassign *mult_stmt;

  if (n < POWI_TABLE_SIZE && cache[n])
    return cache[n];

  ssa_target = make_temp_ssa_name (type, NULL, "powmult");

  if (n < POWI_TABLE_SIZE)
    {
      cache[n] = ssa_target;
      op0 = powi_as_mults_1 (gsi, loc, type, n - powi_table[n], cache);
      op1 = powi_as_mults_1 (gsi, loc, type, powi_table[n], cache);
    }
  else if (n & 1)
    {
      digit = n & ((1 << POWI_WINDOW_SIZE) - 1);
      op0 = powi_as_mults_1 (gsi, loc, type, n - digit, cache);
      op1 = powi_as_mults_1 (gsi, loc, type, digit, cache);
    }
  else
    {
      op0 = powi_as_mults_1 (gsi, loc, type, n >> 1, cache);
      op1 = op0;
    }

  mult_stmt = gimple_build_assign (ssa_target, MULT_EXPR, op0, op1);
  gimple_set_location (mult_stmt, loc);
  gsi_insert_before (gsi, mult_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Convert ARG0**N to a tree of multiplications of ARG0 with itself.
   This function needs to be kept in sync with powi_cost above.  */

static tree
powi_as_mults (gimple_stmt_iterator *gsi, location_t loc,
	       tree arg0, HOST_WIDE_INT n)
{
  tree cache[POWI_TABLE_SIZE], result, type = TREE_TYPE (arg0);
  gassign *div_stmt;
  tree target;

  if (n == 0)
    return build_real (type, dconst1);

  memset (cache, 0, sizeof (cache));
  cache[1] = arg0;

  result = powi_as_mults_1 (gsi, loc, type, absu_hwi (n), cache);
  if (n >= 0)
    return result;

  /* If the original exponent was negative, reciprocate the result.  */
  target = make_temp_ssa_name (type, NULL, "powmult");
  div_stmt = gimple_build_assign (target, RDIV_EXPR,
				  build_real (type, dconst1), result);
  gimple_set_location (div_stmt, loc);
  gsi_insert_before (gsi, div_stmt, GSI_SAME_STMT);

  return target;
}

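/* For example, powi_as_mults (gsi, loc, x, -2) emits (with illustrative
   SSA names)
     powmult1 = x * x;
     powmult2 = 1.0 / powmult1;
   and returns powmult2.  */
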
/* ARG0 and N are the two arguments to a powi builtin in GSI with
   location info LOC.  If the arguments are appropriate, create an
   equivalent sequence of statements prior to GSI using an optimal
   number of multiplications, and return an expression holding the
   result.  */

static tree
gimple_expand_builtin_powi (gimple_stmt_iterator *gsi, location_t loc,
			    tree arg0, HOST_WIDE_INT n)
{
  if ((n >= -1 && n <= 2)
      || (optimize_function_for_speed_p (cfun)
	  && powi_cost (n) <= POWI_MAX_MULTS))
    return powi_as_mults (gsi, loc, arg0, n);

  return NULL_TREE;
}

/* Build a gimple call statement that calls FN with argument ARG.
   Set the lhs of the call statement to a fresh SSA name.  Insert the
   statement prior to GSI's current position, and return the fresh
   SSA name.  */

static tree
build_and_insert_call (gimple_stmt_iterator *gsi, location_t loc,
		       tree fn, tree arg)
{
  gcall *call_stmt;
  tree ssa_target;

  call_stmt = gimple_build_call (fn, 1, arg);
  ssa_target = make_temp_ssa_name (TREE_TYPE (arg), NULL, "powroot");
  gimple_set_lhs (call_stmt, ssa_target);
  gimple_set_location (call_stmt, loc);
  gsi_insert_before (gsi, call_stmt, GSI_SAME_STMT);

  return ssa_target;
}

/* Build a gimple binary operation with the given CODE and arguments
   ARG0, ARG1, assigning the result to a new SSA name built from NAME.
   Insert the statement prior to GSI's current position, and
   return the fresh SSA name.  */

static tree
build_and_insert_binop (gimple_stmt_iterator *gsi, location_t loc,
			const char *name, enum tree_code code,
			tree arg0, tree arg1)
{
  tree result = make_temp_ssa_name (TREE_TYPE (arg0), NULL, name);
  gassign *stmt = gimple_build_assign (result, code, arg0, arg1);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple reference operation with the given CODE and argument
   ARG, assigning the result to a new SSA name of TYPE with NAME.
   Insert the statement prior to GSI's current position, and return
   the fresh SSA name.  */

static inline tree
build_and_insert_ref (gimple_stmt_iterator *gsi, location_t loc, tree type,
		      const char *name, enum tree_code code, tree arg0)
{
  tree result = make_temp_ssa_name (type, NULL, name);
  gimple *stmt = gimple_build_assign (result, build1 (code, type, arg0));
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

/* Build a gimple assignment to cast VAL to TYPE.  Insert the statement
   prior to GSI's current position, and return the fresh SSA name.  */

static tree
build_and_insert_cast (gimple_stmt_iterator *gsi, location_t loc,
		       tree type, tree val)
{
  tree result = make_ssa_name (type);
  gassign *stmt = gimple_build_assign (result, NOP_EXPR, val);
  gimple_set_location (stmt, loc);
  gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
  return result;
}

struct pow_synth_sqrt_info
{
  bool *factors;
  unsigned int deepest;
  unsigned int num_mults;
};

/* Return true iff the real value C can be represented as a
   sum of powers of 0.5 up to N.  That is:
   C == SUM<i from 1..N> (a[i]*(0.5**i)) where a[i] is either 0 or 1.
   Record in INFO the various parameters of the synthesis algorithm such
   as the factors a[i], the maximum 0.5 power and the number of
   multiplications that will be required.  */
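/* For example, 0.625 == 0.5**1 + 0.5**3, so for C == 0.625 and N >= 3
   the loop below returns true with FACTORS == { 1, 0, 1 }, DEEPEST == 3
   and NUM_MULTS == 1: x**0.625 == sqrt (x) * sqrt (sqrt (sqrt (x))),
   a single multiplication joining the two sqrt chains.  */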

bool
representable_as_half_series_p (REAL_VALUE_TYPE c, unsigned n,
				struct pow_synth_sqrt_info *info)
{
  REAL_VALUE_TYPE factor = dconsthalf;
  REAL_VALUE_TYPE remainder = c;

  info->deepest = 0;
  info->num_mults = 0;
  memset (info->factors, 0, n * sizeof (bool));

  for (unsigned i = 0; i < n; i++)
    {
      REAL_VALUE_TYPE res;

      /* If something inexact happened bail out now.  */
      if (real_arithmetic (&res, MINUS_EXPR, &remainder, &factor))
	return false;

      /* We have hit zero.  The number is representable as a sum
	 of powers of 0.5.  */
      if (real_equal (&res, &dconst0))
	{
	  info->factors[i] = true;
	  info->deepest = i + 1;
	  return true;
	}
      else if (!REAL_VALUE_NEGATIVE (res))
	{
	  remainder = res;
	  info->factors[i] = true;
	  info->num_mults++;
	}
      else
	info->factors[i] = false;

      real_arithmetic (&factor, MULT_EXPR, &factor, &dconsthalf);
    }
  return false;
}
1571 
1572 /* Return the tree corresponding to FN being applied
1573    to ARG N times at GSI and LOC.
1574    Look up previous results from CACHE if need be.
1575    cache[0] should contain just plain ARG i.e. FN applied to ARG 0 times.  */
1576 
1577 static tree
get_fn_chain(tree arg,unsigned int n,gimple_stmt_iterator * gsi,tree fn,location_t loc,tree * cache)1578 get_fn_chain (tree arg, unsigned int n, gimple_stmt_iterator *gsi,
1579 	      tree fn, location_t loc, tree *cache)
1580 {
1581   tree res = cache[n];
1582   if (!res)
1583     {
1584       tree prev = get_fn_chain (arg, n - 1, gsi, fn, loc, cache);
1585       res = build_and_insert_call (gsi, loc, fn, prev);
1586       cache[n] = res;
1587     }
1588 
1589   return res;
1590 }
1591 
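/* EDITORIAL NOTE (illustrative, not part of the original source): with
   FN = sqrt and only cache[0] = x seeded by the caller, a call
   get_fn_chain (x, 3, ...) recurses down to cache[0] and then emits and
   records

     cache[1] = sqrt (x)
     cache[2] = sqrt (cache[1])
     cache[3] = sqrt (cache[2])

   so any later request for a depth <= 3 reuses the cached SSA names
   instead of emitting duplicate calls.  */
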
1592 /* Print to STREAM the repeated application of function FNAME to ARG
1593    N times.  So, for FNAME = "foo", ARG = "x", N = 2 it would print:
1594    "foo (foo (x))".  */
1595 
1596 static void
1597 print_nested_fn (FILE* stream, const char *fname, const char* arg,
1598 		 unsigned int n)
1599 {
1600   if (n == 0)
1601     fprintf (stream, "%s", arg);
1602   else
1603     {
1604       fprintf (stream, "%s (", fname);
1605       print_nested_fn (stream, fname, arg, n - 1);
1606       fprintf (stream, ")");
1607     }
1608 }
1609 
1610 /* Print to STREAM the fractional sequence of sqrt chains
1611    applied to ARG, described by INFO.  Used for the dump file.  */
1612 
1613 static void
1614 dump_fractional_sqrt_sequence (FILE *stream, const char *arg,
1615 			        struct pow_synth_sqrt_info *info)
1616 {
1617   for (unsigned int i = 0; i < info->deepest; i++)
1618     {
1619       bool is_set = info->factors[i];
1620       if (is_set)
1621 	{
1622 	  print_nested_fn (stream, "sqrt", arg, i + 1);
1623 	  if (i != info->deepest - 1)
1624 	    fprintf (stream, " * ");
1625 	}
1626     }
1627 }
1628 
1629 /* Print to STREAM a representation of raising ARG to an integer
1630    power N.  Used for the dump file.  */
1631 
1632 static void
1633 dump_integer_part (FILE *stream, const char* arg, HOST_WIDE_INT n)
1634 {
1635   if (n > 1)
1636     fprintf (stream, "powi (%s, " HOST_WIDE_INT_PRINT_DEC ")", arg, n);
1637   else if (n == 1)
1638     fprintf (stream, "%s", arg);
1639 }
1640 
1641 /* Attempt to synthesize a POW[F] (ARG0, ARG1) call using chains of
1642    square roots.  Place at GSI and LOC.  Limit the maximum depth
1643    of the sqrt chains to MAX_DEPTH.  Return the tree holding the
1644    result of the expanded sequence or NULL_TREE if the expansion failed.
1645 
1646    This routine assumes that ARG1 is a real number with a fractional part
1647    (the integer exponent case will have been handled earlier in
1648    gimple_expand_builtin_pow).
1649 
1650    For ARG1 > 0.0:
1651    * For ARG1 composed of a whole part WHOLE_PART and a fractional part
1652      FRAC_PART i.e. WHOLE_PART == floor (ARG1) and
1653                     FRAC_PART == ARG1 - WHOLE_PART:
1654      Produce POWI (ARG0, WHOLE_PART) * POW (ARG0, FRAC_PART) where
1655      POW (ARG0, FRAC_PART) is expanded as a product of square root chains
1656      if it can be expressed as such, that is if FRAC_PART satisfies:
1657      FRAC_PART == <SUM from i = 1 until MAX_DEPTH> (a[i] * (0.5**i))
1658      where integer a[i] is either 0 or 1.
1659 
1660      Example:
1661      POW (x, 3.625) == POWI (x, 3) * POW (x, 0.625)
1662        --> POWI (x, 3) * SQRT (x) * SQRT (SQRT (SQRT (x)))
1663 
1664    For ARG1 < 0.0 there are two approaches:
1665    * (A) Expand to 1.0 / POW (ARG0, -ARG1) where POW (ARG0, -ARG1)
1666          is calculated as above.
1667 
1668      Example:
1669      POW (x, -5.625) == 1.0 / POW (x, 5.625)
1670        -->  1.0 / (POWI (x, 5) * SQRT (x) * SQRT (SQRT (SQRT (x))))
1671 
1672    * (B) : WHOLE_PART := - ceil (abs (ARG1))
1673            FRAC_PART  := ARG1 - WHOLE_PART
1674      and expand to POW (x, FRAC_PART) / POWI (x, WHOLE_PART).
1675      Example:
1676      POW (x, -5.875) == POW (x, 0.125) / POWI (X, 6)
1677        --> SQRT (SQRT (SQRT (x))) / (POWI (x, 6))
1678 
1679    For ARG1 < 0.0 we choose between (A) and (B) depending on
1680    how many multiplications we'd have to do.
1681    So, for the example in (B): POW (x, -5.875), if we were to
1682    follow algorithm (A) we would produce:
1683    1.0 / POWI (X, 5) * SQRT (X) * SQRT (SQRT (X)) * SQRT (SQRT (SQRT (X)))
1684    which contains more multiplications than approach (B).
1685 
1686    Hopefully, this approach will eliminate potentially expensive POW library
1687    calls when unsafe floating point math is enabled and allow the compiler to
1688    further optimise the multiplies, square roots and divides produced by this
1689    function.  */
1690 
1691 static tree
1692 expand_pow_as_sqrts (gimple_stmt_iterator *gsi, location_t loc,
1693 		     tree arg0, tree arg1, HOST_WIDE_INT max_depth)
1694 {
1695   tree type = TREE_TYPE (arg0);
1696   machine_mode mode = TYPE_MODE (type);
1697   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1698   bool one_over = true;
1699 
1700   if (!sqrtfn)
1701     return NULL_TREE;
1702 
1703   if (TREE_CODE (arg1) != REAL_CST)
1704     return NULL_TREE;
1705 
1706   REAL_VALUE_TYPE exp_init = TREE_REAL_CST (arg1);
1707 
1708   gcc_assert (max_depth > 0);
1709   tree *cache = XALLOCAVEC (tree, max_depth + 1);
1710 
1711   struct pow_synth_sqrt_info synth_info;
1712   synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1713   synth_info.deepest = 0;
1714   synth_info.num_mults = 0;
1715 
1716   bool neg_exp = REAL_VALUE_NEGATIVE (exp_init);
1717   REAL_VALUE_TYPE exp = real_value_abs (&exp_init);
1718 
1719   /* The whole and fractional parts of exp.  */
1720   REAL_VALUE_TYPE whole_part;
1721   REAL_VALUE_TYPE frac_part;
1722 
1723   real_floor (&whole_part, mode, &exp);
1724   real_arithmetic (&frac_part, MINUS_EXPR, &exp, &whole_part);
1725 
1726 
1727   REAL_VALUE_TYPE ceil_whole = dconst0;
1728   REAL_VALUE_TYPE ceil_fract = dconst0;
1729 
1730   if (neg_exp)
1731     {
1732       real_ceil (&ceil_whole, mode, &exp);
1733       real_arithmetic (&ceil_fract, MINUS_EXPR, &ceil_whole, &exp);
1734     }
1735 
1736   if (!representable_as_half_series_p (frac_part, max_depth, &synth_info))
1737     return NULL_TREE;
1738 
1739   /* Check whether it's more profitable to not use 1.0 / ...  */
1740   if (neg_exp)
1741     {
1742       struct pow_synth_sqrt_info alt_synth_info;
1743       alt_synth_info.factors = XALLOCAVEC (bool, max_depth + 1);
1744       alt_synth_info.deepest = 0;
1745       alt_synth_info.num_mults = 0;
1746 
1747       if (representable_as_half_series_p (ceil_fract, max_depth,
1748 					   &alt_synth_info)
1749 	  && alt_synth_info.deepest <= synth_info.deepest
1750 	  && alt_synth_info.num_mults < synth_info.num_mults)
1751 	{
1752 	  whole_part = ceil_whole;
1753 	  frac_part = ceil_fract;
1754 	  synth_info.deepest = alt_synth_info.deepest;
1755 	  synth_info.num_mults = alt_synth_info.num_mults;
1756 	  memcpy (synth_info.factors, alt_synth_info.factors,
1757 		  (max_depth + 1) * sizeof (bool));
1758 	  one_over = false;
1759 	}
1760     }
1761 
1762   HOST_WIDE_INT n = real_to_integer (&whole_part);
1763   REAL_VALUE_TYPE cint;
1764   real_from_integer (&cint, VOIDmode, n, SIGNED);
1765 
1766   if (!real_identical (&whole_part, &cint))
1767     return NULL_TREE;
1768 
1769   if (powi_cost (n) + synth_info.num_mults > POWI_MAX_MULTS)
1770     return NULL_TREE;
1771 
1772   memset (cache, 0, (max_depth + 1) * sizeof (tree));
1773 
1774   tree integer_res = n == 0 ? build_real (type, dconst1) : arg0;
1775 
1776   /* Calculate the integer part of the exponent.  */
1777   if (n > 1)
1778     {
1779       integer_res = gimple_expand_builtin_powi (gsi, loc, arg0, n);
1780       if (!integer_res)
1781 	return NULL_TREE;
1782     }
1783 
1784   if (dump_file)
1785     {
1786       char string[64];
1787 
1788       real_to_decimal (string, &exp_init, sizeof (string), 0, 1);
1789       fprintf (dump_file, "synthesizing pow (x, %s) as:\n", string);
1790 
1791       if (neg_exp)
1792 	{
1793 	  if (one_over)
1794 	    {
1795 	      fprintf (dump_file, "1.0 / (");
1796 	      dump_integer_part (dump_file, "x", n);
1797 	      if (n > 0)
1798 	        fprintf (dump_file, " * ");
1799 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1800 	      fprintf (dump_file, ")");
1801 	    }
1802 	  else
1803 	    {
1804 	      dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1805 	      fprintf (dump_file, " / (");
1806 	      dump_integer_part (dump_file, "x", n);
1807 	      fprintf (dump_file, ")");
1808 	    }
1809 	}
1810       else
1811 	{
1812 	  dump_fractional_sqrt_sequence (dump_file, "x", &synth_info);
1813 	  if (n > 0)
1814 	    fprintf (dump_file, " * ");
1815 	  dump_integer_part (dump_file, "x", n);
1816 	}
1817 
1818       fprintf (dump_file, "\ndeepest sqrt chain: %d\n", synth_info.deepest);
1819     }
1820 
1821 
1822   tree fract_res = NULL_TREE;
1823   cache[0] = arg0;
1824 
1825   /* Calculate the fractional part of the exponent.  */
1826   for (unsigned i = 0; i < synth_info.deepest; i++)
1827     {
1828       if (synth_info.factors[i])
1829 	{
1830 	  tree sqrt_chain = get_fn_chain (arg0, i + 1, gsi, sqrtfn, loc, cache);
1831 
1832 	  if (!fract_res)
1833 	    fract_res = sqrt_chain;
1834 
1835 	  else
1836 	    fract_res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1837 					   fract_res, sqrt_chain);
1838 	}
1839     }
1840 
1841   tree res = NULL_TREE;
1842 
1843   if (neg_exp)
1844     {
1845       if (one_over)
1846 	{
1847 	  if (n > 0)
1848 	    res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1849 					   fract_res, integer_res);
1850 	  else
1851 	    res = fract_res;
1852 
1853 	  res = build_and_insert_binop (gsi, loc, "powrootrecip", RDIV_EXPR,
1854 					  build_real (type, dconst1), res);
1855 	}
1856       else
1857 	{
1858 	  res = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
1859 					 fract_res, integer_res);
1860 	}
1861     }
1862   else
1863     res = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
1864 				   fract_res, integer_res);
1865   return res;
1866 }
1867 
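/* EDITORIAL NOTE (worked example, not part of the original source):
   with -funsafe-math-optimizations and a hardware sqrt, pow (x, 1.25)
   gives WHOLE_PART = 1 and FRAC_PART = 0.25, so the routine above emits
   roughly (temporary names are made up):

     _1 = __builtin_sqrt (x);
     _2 = __builtin_sqrt (_1);
     res = _2 * x;

   i.e. sqrt (sqrt (x)) * x, with no call to the pow library routine.  */
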
1868 /* ARG0 and ARG1 are the two arguments to a pow builtin call in GSI
1869    with location info LOC.  If possible, create an equivalent and
1870    less expensive sequence of statements prior to GSI, and return an
1871    expression holding the result.  */
1872 
1873 static tree
1874 gimple_expand_builtin_pow (gimple_stmt_iterator *gsi, location_t loc,
1875 			   tree arg0, tree arg1)
1876 {
1877   REAL_VALUE_TYPE c, cint, dconst1_3, dconst1_4, dconst1_6;
1878   REAL_VALUE_TYPE c2, dconst3;
1879   HOST_WIDE_INT n;
1880   tree type, sqrtfn, cbrtfn, sqrt_arg0, result, cbrt_x, powi_cbrt_x;
1881   machine_mode mode;
1882   bool speed_p = optimize_bb_for_speed_p (gsi_bb (*gsi));
1883   bool hw_sqrt_exists, c_is_int, c2_is_int;
1884 
1885   dconst1_4 = dconst1;
1886   SET_REAL_EXP (&dconst1_4, REAL_EXP (&dconst1_4) - 2);
1887 
1888   /* If the exponent isn't a constant, there's nothing of interest
1889      to be done.  */
1890   if (TREE_CODE (arg1) != REAL_CST)
1891     return NULL_TREE;
1892 
1893   /* Don't perform the operation if flag_signaling_nans is on
1894      and the operand is a signaling NaN.  */
1895   if (HONOR_SNANS (TYPE_MODE (TREE_TYPE (arg1)))
1896       && ((TREE_CODE (arg0) == REAL_CST
1897 	   && REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg0)))
1898 	  || REAL_VALUE_ISSIGNALING_NAN (TREE_REAL_CST (arg1))))
1899     return NULL_TREE;
1900 
1901   /* If the exponent is equivalent to an integer, expand to an optimal
1902      multiplication sequence when profitable.  */
1903   c = TREE_REAL_CST (arg1);
1904   n = real_to_integer (&c);
1905   real_from_integer (&cint, VOIDmode, n, SIGNED);
1906   c_is_int = real_identical (&c, &cint);
1907 
1908   if (c_is_int
1909       && ((n >= -1 && n <= 2)
1910 	  || (flag_unsafe_math_optimizations
1911 	      && speed_p
1912 	      && powi_cost (n) <= POWI_MAX_MULTS)))
1913     return gimple_expand_builtin_powi (gsi, loc, arg0, n);
1914 
1915   /* Attempt various optimizations using sqrt and cbrt.  */
1916   type = TREE_TYPE (arg0);
1917   mode = TYPE_MODE (type);
1918   sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
1919 
1920   /* Optimize pow(x,0.5) = sqrt(x).  This replacement is always safe
1921      unless signed zeros must be maintained.  pow(-0,0.5) = +0, while
1922      sqrt(-0) = -0.  */
1923   if (sqrtfn
1924       && real_equal (&c, &dconsthalf)
1925       && !HONOR_SIGNED_ZEROS (mode))
1926     return build_and_insert_call (gsi, loc, sqrtfn, arg0);
1927 
1928   hw_sqrt_exists = optab_handler (sqrt_optab, mode) != CODE_FOR_nothing;
1929 
1930   /* Optimize pow(x,1./3.) = cbrt(x).  This requires unsafe math
1931      optimizations since 1./3. is not exactly representable.  If x
1932      is negative and finite, the correct value of pow(x,1./3.) is
1933      a NaN with the "invalid" exception raised, because the value
1934      of 1./3. actually has an even denominator.  The correct value
1935      of cbrt(x) is a negative real value.  */
1936   cbrtfn = mathfn_built_in (type, BUILT_IN_CBRT);
1937   dconst1_3 = real_value_truncate (mode, dconst_third ());
1938 
1939   if (flag_unsafe_math_optimizations
1940       && cbrtfn
1941       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1942       && real_equal (&c, &dconst1_3))
1943     return build_and_insert_call (gsi, loc, cbrtfn, arg0);
1944 
1945   /* Optimize pow(x,1./6.) = cbrt(sqrt(x)).  Don't do this optimization
1946      if we don't have a hardware sqrt insn.  */
1947   dconst1_6 = dconst1_3;
1948   SET_REAL_EXP (&dconst1_6, REAL_EXP (&dconst1_6) - 1);
1949 
1950   if (flag_unsafe_math_optimizations
1951       && sqrtfn
1952       && cbrtfn
1953       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
1954       && speed_p
1955       && hw_sqrt_exists
1956       && real_equal (&c, &dconst1_6))
1957     {
1958       /* sqrt(x)  */
1959       sqrt_arg0 = build_and_insert_call (gsi, loc, sqrtfn, arg0);
1960 
1961       /* cbrt(sqrt(x))  */
1962       return build_and_insert_call (gsi, loc, cbrtfn, sqrt_arg0);
1963     }
1964 
1965 
1966   /* Attempt to expand the POW as a product of square root chains.
1967      Expand the 0.25 case even when optimising for size.  */
1968   if (flag_unsafe_math_optimizations
1969       && sqrtfn
1970       && hw_sqrt_exists
1971       && (speed_p || real_equal (&c, &dconst1_4))
1972       && !HONOR_SIGNED_ZEROS (mode))
1973     {
1974       unsigned int max_depth = speed_p
1975 				? param_max_pow_sqrt_depth
1976 				: 2;
1977 
1978       tree expand_with_sqrts
1979 	= expand_pow_as_sqrts (gsi, loc, arg0, arg1, max_depth);
1980 
1981       if (expand_with_sqrts)
1982 	return expand_with_sqrts;
1983     }
1984 
1985   real_arithmetic (&c2, MULT_EXPR, &c, &dconst2);
1986   n = real_to_integer (&c2);
1987   real_from_integer (&cint, VOIDmode, n, SIGNED);
1988   c2_is_int = real_identical (&c2, &cint);
1989 
1990   /* Optimize pow(x,c), where 3c = n for some nonzero integer n, into
1991 
1992      powi(x, n/3) * powi(cbrt(x), n%3),                    n > 0;
1993      1.0 / (powi(x, abs(n)/3) * powi(cbrt(x), abs(n)%3)),  n < 0.
1994 
1995      Do not calculate the first factor when n/3 = 0.  As cbrt(x) is
1996      different from pow(x, 1./3.) due to rounding and behavior with
1997      negative x, we need to constrain this transformation to unsafe
1998      math and positive x or finite math.  */
1999   real_from_integer (&dconst3, VOIDmode, 3, SIGNED);
2000   real_arithmetic (&c2, MULT_EXPR, &c, &dconst3);
2001   real_round (&c2, mode, &c2);
2002   n = real_to_integer (&c2);
2003   real_from_integer (&cint, VOIDmode, n, SIGNED);
2004   real_arithmetic (&c2, RDIV_EXPR, &cint, &dconst3);
2005   real_convert (&c2, mode, &c2);
2006 
2007   if (flag_unsafe_math_optimizations
2008       && cbrtfn
2009       && (!HONOR_NANS (mode) || tree_expr_nonnegative_p (arg0))
2010       && real_identical (&c2, &c)
2011       && !c2_is_int
2012       && optimize_function_for_speed_p (cfun)
2013       && powi_cost (n / 3) <= POWI_MAX_MULTS)
2014     {
2015       tree powi_x_ndiv3 = NULL_TREE;
2016 
2017       /* Attempt to fold powi(arg0, abs(n/3)) into multiplies.  If not
2018          possible or profitable, give up.  Skip the degenerate case when
2019          abs(n) < 3, where the result is always 1.  */
2020       if (absu_hwi (n) >= 3)
2021 	{
2022 	  powi_x_ndiv3 = gimple_expand_builtin_powi (gsi, loc, arg0,
2023 						     abs_hwi (n / 3));
2024 	  if (!powi_x_ndiv3)
2025 	    return NULL_TREE;
2026 	}
2027 
2028       /* Calculate powi(cbrt(x), n%3).  Don't use gimple_expand_builtin_powi
2029          as that creates an unnecessary variable.  Instead, just produce
2030          either cbrt(x) or cbrt(x) * cbrt(x).  */
2031       cbrt_x = build_and_insert_call (gsi, loc, cbrtfn, arg0);
2032 
2033       if (absu_hwi (n) % 3 == 1)
2034 	powi_cbrt_x = cbrt_x;
2035       else
2036 	powi_cbrt_x = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2037 					      cbrt_x, cbrt_x);
2038 
2039       /* Multiply the two subexpressions, unless powi(x,abs(n)/3) = 1.  */
2040       if (absu_hwi (n) < 3)
2041 	result = powi_cbrt_x;
2042       else
2043 	result = build_and_insert_binop (gsi, loc, "powroot", MULT_EXPR,
2044 					 powi_x_ndiv3, powi_cbrt_x);
2045 
2046       /* If n is negative, reciprocate the result.  */
2047       if (n < 0)
2048 	result = build_and_insert_binop (gsi, loc, "powroot", RDIV_EXPR,
2049 					 build_real (type, dconst1), result);
2050 
2051       return result;
2052     }
2053 
2054   /* No optimizations succeeded.  */
2055   return NULL_TREE;
2056 }
2057 
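/* EDITORIAL NOTE (not part of the original source): a summary of the
   expansions gimple_expand_builtin_pow attempts above.  Which ones fire
   depends on -funsafe-math-optimizations, the HONOR_* flags, whether we
   optimize for speed, and the available optabs:

     pow (x, n), n integral   -> powi-style multiplication chain
     pow (x, 0.5)             -> sqrt (x)
     pow (x, 1./3.)           -> cbrt (x)
     pow (x, 1./6.)           -> cbrt (sqrt (x))
     pow (x, c), c fractional -> product of sqrt chains (see above)
     pow (x, n/3.)            -> powi (x, n/3) * powi (cbrt (x), n%3)  */
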
2058 /* ARG is the argument to a cabs builtin call in GSI with location info
2059    LOC.  Create a sequence of statements prior to GSI that calculates
2060    sqrt(R*R + I*I), where R and I are the real and imaginary components
2061    of ARG, respectively.  Return an expression holding the result.  */
2062 
2063 static tree
2064 gimple_expand_builtin_cabs (gimple_stmt_iterator *gsi, location_t loc, tree arg)
2065 {
2066   tree real_part, imag_part, addend1, addend2, sum, result;
2067   tree type = TREE_TYPE (TREE_TYPE (arg));
2068   tree sqrtfn = mathfn_built_in (type, BUILT_IN_SQRT);
2069   machine_mode mode = TYPE_MODE (type);
2070 
2071   if (!flag_unsafe_math_optimizations
2072       || !optimize_bb_for_speed_p (gimple_bb (gsi_stmt (*gsi)))
2073       || !sqrtfn
2074       || optab_handler (sqrt_optab, mode) == CODE_FOR_nothing)
2075     return NULL_TREE;
2076 
2077   real_part = build_and_insert_ref (gsi, loc, type, "cabs",
2078 				    REALPART_EXPR, arg);
2079   addend1 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2080 				    real_part, real_part);
2081   imag_part = build_and_insert_ref (gsi, loc, type, "cabs",
2082 				    IMAGPART_EXPR, arg);
2083   addend2 = build_and_insert_binop (gsi, loc, "cabs", MULT_EXPR,
2084 				    imag_part, imag_part);
2085   sum = build_and_insert_binop (gsi, loc, "cabs", PLUS_EXPR, addend1, addend2);
2086   result = build_and_insert_call (gsi, loc, sqrtfn, sum);
2087 
2088   return result;
2089 }
2090 
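/* EDITORIAL NOTE (illustrative, not part of the original source): for
   RES = cabs (Z) with Z of type _Complex double, the helper above emits
   roughly the following GIMPLE (temporary names are made up):

     cabs_r = REALPART_EXPR <z>;
     cabs_1 = cabs_r * cabs_r;
     cabs_i = IMAGPART_EXPR <z>;
     cabs_2 = cabs_i * cabs_i;
     cabs_3 = cabs_1 + cabs_2;
     res    = __builtin_sqrt (cabs_3);  */
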
2091 /* Go through all calls to sin, cos and cexpi and call execute_cse_sincos_1
2092    on the SSA_NAME argument of each of them.  Also expand powi(x,n) into
2093    an optimal number of multiplies, when n is a constant.  */
2094 
2095 namespace {
2096 
2097 const pass_data pass_data_cse_sincos =
2098 {
2099   GIMPLE_PASS, /* type */
2100   "sincos", /* name */
2101   OPTGROUP_NONE, /* optinfo_flags */
2102   TV_TREE_SINCOS, /* tv_id */
2103   PROP_ssa, /* properties_required */
2104   PROP_gimple_opt_math, /* properties_provided */
2105   0, /* properties_destroyed */
2106   0, /* todo_flags_start */
2107   TODO_update_ssa, /* todo_flags_finish */
2108 };
2109 
2110 class pass_cse_sincos : public gimple_opt_pass
2111 {
2112 public:
2113   pass_cse_sincos (gcc::context *ctxt)
2114     : gimple_opt_pass (pass_data_cse_sincos, ctxt)
2115   {}
2116 
2117   /* opt_pass methods: */
2118   virtual bool gate (function *)
2119     {
2120       /* We no longer require either sincos or cexp, since powi expansion
2121 	 piggybacks on this pass.  */
2122       return optimize;
2123     }
2124 
2125   virtual unsigned int execute (function *);
2126 
2127 }; // class pass_cse_sincos
2128 
2129 unsigned int
2130 pass_cse_sincos::execute (function *fun)
2131 {
2132   basic_block bb;
2133   bool cfg_changed = false;
2134 
2135   calculate_dominance_info (CDI_DOMINATORS);
2136   memset (&sincos_stats, 0, sizeof (sincos_stats));
2137 
2138   FOR_EACH_BB_FN (bb, fun)
2139     {
2140       gimple_stmt_iterator gsi;
2141       bool cleanup_eh = false;
2142 
2143       for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2144         {
2145 	  gimple *stmt = gsi_stmt (gsi);
2146 
2147 	  /* Only the last stmt in a bb could throw; no need to call
2148 	     gimple_purge_dead_eh_edges if we change something in the middle
2149 	     of a basic block.  */
2150 	  cleanup_eh = false;
2151 
2152 	  if (is_gimple_call (stmt)
2153 	      && gimple_call_lhs (stmt))
2154 	    {
2155 	      tree arg, arg0, arg1, result;
2156 	      HOST_WIDE_INT n;
2157 	      location_t loc;
2158 
2159 	      switch (gimple_call_combined_fn (stmt))
2160 		{
2161 		CASE_CFN_COS:
2162 		CASE_CFN_SIN:
2163 		CASE_CFN_CEXPI:
2164 		  /* Make sure we have either sincos or cexp.  */
2165 		  if (!targetm.libc_has_function (function_c99_math_complex)
2166 		      && !targetm.libc_has_function (function_sincos))
2167 		    break;
2168 
2169 		  arg = gimple_call_arg (stmt, 0);
2170 		  if (TREE_CODE (arg) == SSA_NAME)
2171 		    cfg_changed |= execute_cse_sincos_1 (arg);
2172 		  break;
2173 
2174 		CASE_CFN_POW:
2175 		  arg0 = gimple_call_arg (stmt, 0);
2176 		  arg1 = gimple_call_arg (stmt, 1);
2177 
2178 		  loc = gimple_location (stmt);
2179 		  result = gimple_expand_builtin_pow (&gsi, loc, arg0, arg1);
2180 
2181 		  if (result)
2182 		    {
2183 		      tree lhs = gimple_get_lhs (stmt);
2184 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2185 		      gimple_set_location (new_stmt, loc);
2186 		      unlink_stmt_vdef (stmt);
2187 		      gsi_replace (&gsi, new_stmt, true);
2188 		      cleanup_eh = true;
2189 		      if (gimple_vdef (stmt))
2190 			release_ssa_name (gimple_vdef (stmt));
2191 		    }
2192 		  break;
2193 
2194 		CASE_CFN_POWI:
2195 		  arg0 = gimple_call_arg (stmt, 0);
2196 		  arg1 = gimple_call_arg (stmt, 1);
2197 		  loc = gimple_location (stmt);
2198 
2199 		  if (real_minus_onep (arg0))
2200 		    {
2201                       tree t0, t1, cond, one, minus_one;
2202 		      gassign *stmt;
2203 
2204 		      t0 = TREE_TYPE (arg0);
2205 		      t1 = TREE_TYPE (arg1);
2206 		      one = build_real (t0, dconst1);
2207 		      minus_one = build_real (t0, dconstm1);
2208 
2209 		      cond = make_temp_ssa_name (t1, NULL, "powi_cond");
2210 		      stmt = gimple_build_assign (cond, BIT_AND_EXPR,
2211 						  arg1, build_int_cst (t1, 1));
2212 		      gimple_set_location (stmt, loc);
2213 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2214 
2215 		      result = make_temp_ssa_name (t0, NULL, "powi");
2216 		      stmt = gimple_build_assign (result, COND_EXPR, cond,
2217 						  minus_one, one);
2218 		      gimple_set_location (stmt, loc);
2219 		      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
2220 		    }
2221 		  else
2222 		    {
2223 		      if (!tree_fits_shwi_p (arg1))
2224 			break;
2225 
2226 		      n = tree_to_shwi (arg1);
2227 		      result = gimple_expand_builtin_powi (&gsi, loc, arg0, n);
2228 		    }
2229 
2230 		  if (result)
2231 		    {
2232 		      tree lhs = gimple_get_lhs (stmt);
2233 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2234 		      gimple_set_location (new_stmt, loc);
2235 		      unlink_stmt_vdef (stmt);
2236 		      gsi_replace (&gsi, new_stmt, true);
2237 		      cleanup_eh = true;
2238 		      if (gimple_vdef (stmt))
2239 			release_ssa_name (gimple_vdef (stmt));
2240 		    }
2241 		  break;
2242 
2243 		CASE_CFN_CABS:
2244 		  arg0 = gimple_call_arg (stmt, 0);
2245 		  loc = gimple_location (stmt);
2246 		  result = gimple_expand_builtin_cabs (&gsi, loc, arg0);
2247 
2248 		  if (result)
2249 		    {
2250 		      tree lhs = gimple_get_lhs (stmt);
2251 		      gassign *new_stmt = gimple_build_assign (lhs, result);
2252 		      gimple_set_location (new_stmt, loc);
2253 		      unlink_stmt_vdef (stmt);
2254 		      gsi_replace (&gsi, new_stmt, true);
2255 		      cleanup_eh = true;
2256 		      if (gimple_vdef (stmt))
2257 			release_ssa_name (gimple_vdef (stmt));
2258 		    }
2259 		  break;
2260 
2261 		default:;
2262 		}
2263 	    }
2264 	}
2265       if (cleanup_eh)
2266 	cfg_changed |= gimple_purge_dead_eh_edges (bb);
2267     }
2268 
2269   statistics_counter_event (fun, "sincos statements inserted",
2270 			    sincos_stats.inserted);
2271 
2272   return cfg_changed ? TODO_cleanup_cfg : 0;
2273 }
2274 
2275 } // anon namespace
2276 
2277 gimple_opt_pass *
2278 make_pass_cse_sincos (gcc::context *ctxt)
2279 {
2280   return new pass_cse_sincos (ctxt);
2281 }
2282 
2283 /* Return true if stmt is a type conversion operation that can be stripped
2284    when used in a widening multiply operation.  */
2285 static bool
2286 widening_mult_conversion_strippable_p (tree result_type, gimple *stmt)
2287 {
2288   enum tree_code rhs_code = gimple_assign_rhs_code (stmt);
2289 
2290   if (TREE_CODE (result_type) == INTEGER_TYPE)
2291     {
2292       tree op_type;
2293       tree inner_op_type;
2294 
2295       if (!CONVERT_EXPR_CODE_P (rhs_code))
2296 	return false;
2297 
2298       op_type = TREE_TYPE (gimple_assign_lhs (stmt));
2299 
2300       /* If the type of OP has the same precision as the result, then
2301 	 we can strip this conversion.  The multiply operation will be
2302 	 selected to create the correct extension as a by-product.  */
2303       if (TYPE_PRECISION (result_type) == TYPE_PRECISION (op_type))
2304 	return true;
2305 
2306       /* We can also strip a conversion if it preserves the signed-ness of
2307 	 the operation and doesn't narrow the range.  */
2308       inner_op_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
2309 
2310       /* If the inner-most type is unsigned, then we can strip any
2311 	 intermediate widening operation.  If it's signed, then the
2312 	 intermediate widening operation must also be signed.  */
2313       if ((TYPE_UNSIGNED (inner_op_type)
2314 	   || TYPE_UNSIGNED (op_type) == TYPE_UNSIGNED (inner_op_type))
2315 	  && TYPE_PRECISION (op_type) > TYPE_PRECISION (inner_op_type))
2316 	return true;
2317 
2318       return false;
2319     }
2320 
2321   return rhs_code == FIXED_CONVERT_EXPR;
2322 }
2323 
2324 /* Return true if RHS is a suitable operand for a widening multiplication,
2325    assuming a target type of TYPE.
2326    There are two cases:
2327 
2328      - RHS makes some value at least twice as wide.  Store that value
2329        in *NEW_RHS_OUT if so, and store its type in *TYPE_OUT.
2330 
2331      - RHS is an integer constant.  Store that value in *NEW_RHS_OUT if so,
2332        but leave *TYPE_OUT untouched.  */
2333 
2334 static bool
2335 is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
2336 			tree *new_rhs_out)
2337 {
2338   gimple *stmt;
2339   tree type1, rhs1;
2340 
2341   if (TREE_CODE (rhs) == SSA_NAME)
2342     {
2343       stmt = SSA_NAME_DEF_STMT (rhs);
2344       if (is_gimple_assign (stmt))
2345 	{
2346 	  if (! widening_mult_conversion_strippable_p (type, stmt))
2347 	    rhs1 = rhs;
2348 	  else
2349 	    {
2350 	      rhs1 = gimple_assign_rhs1 (stmt);
2351 
2352 	      if (TREE_CODE (rhs1) == INTEGER_CST)
2353 		{
2354 		  *new_rhs_out = rhs1;
2355 		  *type_out = NULL;
2356 		  return true;
2357 		}
2358 	    }
2359 	}
2360       else
2361 	rhs1 = rhs;
2362 
2363       type1 = TREE_TYPE (rhs1);
2364 
2365       if (TREE_CODE (type1) != TREE_CODE (type)
2366 	  || TYPE_PRECISION (type1) * 2 > TYPE_PRECISION (type))
2367 	return false;
2368 
2369       *new_rhs_out = rhs1;
2370       *type_out = type1;
2371       return true;
2372     }
2373 
2374   if (TREE_CODE (rhs) == INTEGER_CST)
2375     {
2376       *new_rhs_out = rhs;
2377       *type_out = NULL;
2378       return true;
2379     }
2380 
2381   return false;
2382 }
2383 
2384 /* Return true if STMT performs a widening multiplication, assuming the
2385    output type is TYPE.  If so, store the unwidened types of the operands
2386    in *TYPE1_OUT and *TYPE2_OUT respectively.  Also fill *RHS1_OUT and
2387    *RHS2_OUT such that converting those operands to types *TYPE1_OUT
2388    and *TYPE2_OUT would give the operands of the multiplication.  */
2389 
2390 static bool
2391 is_widening_mult_p (gimple *stmt,
2392 		    tree *type1_out, tree *rhs1_out,
2393 		    tree *type2_out, tree *rhs2_out)
2394 {
2395   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
2396 
2397   if (TREE_CODE (type) == INTEGER_TYPE)
2398     {
2399       if (TYPE_OVERFLOW_TRAPS (type))
2400 	return false;
2401     }
2402   else if (TREE_CODE (type) != FIXED_POINT_TYPE)
2403     return false;
2404 
2405   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs1 (stmt), type1_out,
2406 			       rhs1_out))
2407     return false;
2408 
2409   if (!is_widening_mult_rhs_p (type, gimple_assign_rhs2 (stmt), type2_out,
2410 			       rhs2_out))
2411     return false;
2412 
2413   if (*type1_out == NULL)
2414     {
2415       if (*type2_out == NULL || !int_fits_type_p (*rhs1_out, *type2_out))
2416 	return false;
2417       *type1_out = *type2_out;
2418     }
2419 
2420   if (*type2_out == NULL)
2421     {
2422       if (!int_fits_type_p (*rhs2_out, *type1_out))
2423 	return false;
2424       *type2_out = *type1_out;
2425     }
2426 
2427   /* Ensure that the larger of the two operands comes first. */
2428   if (TYPE_PRECISION (*type1_out) < TYPE_PRECISION (*type2_out))
2429     {
2430       std::swap (*type1_out, *type2_out);
2431       std::swap (*rhs1_out, *rhs2_out);
2432     }
2433 
2434   return true;
2435 }
2436 
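/* EDITORIAL NOTE (illustrative, not part of the original source): a
   typical widening multiply recognized by the helpers above, assuming a
   target with a 16x16->32 multiply.  For the C source

     short a, b;
     int c = (int) a * (int) b;

   the conversions feeding the MULT_EXPR are strippable, so

     _1 = (int) a_2;
     _3 = (int) b_4;
     c_5 = _1 * _3;

   can be rewritten as c_5 = a_2 w* b_4 (a WIDEN_MULT_EXPR).  */
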
2437 /* Check to see if the CALL statement is an invocation of copysign
2438    with 1. being the first argument.  */
2439 static bool
2440 is_copysign_call_with_1 (gimple *call)
2441 {
2442   gcall *c = dyn_cast <gcall *> (call);
2443   if (! c)
2444     return false;
2445 
2446   enum combined_fn code = gimple_call_combined_fn (c);
2447 
2448   if (code == CFN_LAST)
2449     return false;
2450 
2451   if (builtin_fn_p (code))
2452     {
2453       switch (as_builtin_fn (code))
2454 	{
2455 	CASE_FLT_FN (BUILT_IN_COPYSIGN):
2456 	CASE_FLT_FN_FLOATN_NX (BUILT_IN_COPYSIGN):
2457 	  return real_onep (gimple_call_arg (c, 0));
2458 	default:
2459 	  return false;
2460 	}
2461     }
2462 
2463   if (internal_fn_p (code))
2464     {
2465       switch (as_internal_fn (code))
2466 	{
2467 	case IFN_COPYSIGN:
2468 	  return real_onep (gimple_call_arg (c, 0));
2469 	default:
2470 	  return false;
2471 	}
2472     }
2473 
2474    return false;
2475 }
2476 
2477 /* Try to expand the pattern x * copysign (1, y) into xorsign (x, y).
2478    This only happens when the xorsign optab is defined; if the
2479    pattern is not a xorsign pattern or if expansion fails, FALSE is
2480    returned, otherwise TRUE is returned.  */
2481 static bool
2482 convert_expand_mult_copysign (gimple *stmt, gimple_stmt_iterator *gsi)
2483 {
2484   tree treeop0, treeop1, lhs, type;
2485   location_t loc = gimple_location (stmt);
2486   lhs = gimple_assign_lhs (stmt);
2487   treeop0 = gimple_assign_rhs1 (stmt);
2488   treeop1 = gimple_assign_rhs2 (stmt);
2489   type = TREE_TYPE (lhs);
2490   machine_mode mode = TYPE_MODE (type);
2491 
2492   if (HONOR_SNANS (type))
2493     return false;
2494 
2495   if (TREE_CODE (treeop0) == SSA_NAME && TREE_CODE (treeop1) == SSA_NAME)
2496     {
2497       gimple *call0 = SSA_NAME_DEF_STMT (treeop0);
2498       if (!has_single_use (treeop0) || !is_copysign_call_with_1 (call0))
2499 	{
2500 	  call0 = SSA_NAME_DEF_STMT (treeop1);
2501 	  if (!has_single_use (treeop1) || !is_copysign_call_with_1 (call0))
2502 	    return false;
2503 
2504 	  treeop1 = treeop0;
2505 	}
2506 	if (optab_handler (xorsign_optab, mode) == CODE_FOR_nothing)
2507 	  return false;
2508 
2509 	gcall *c = as_a<gcall*> (call0);
2510 	treeop0 = gimple_call_arg (c, 1);
2511 
2512 	gcall *call_stmt
2513 	  = gimple_build_call_internal (IFN_XORSIGN, 2, treeop1, treeop0);
2514 	gimple_set_lhs (call_stmt, lhs);
2515 	gimple_set_location (call_stmt, loc);
2516 	gsi_replace (gsi, call_stmt, true);
2517 	return true;
2518     }
2519 
2520   return false;
2521 }
2522 
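/* EDITORIAL NOTE (illustrative, not part of the original source): for
   the transformation above, given

     _1 = __builtin_copysign (1.0, y_2);
     x_4 = x_3 * _1;

   the multiplication only transfers the sign bit of y onto x, so when
   the xorsign optab is available the pair is replaced by the cheaper

     x_4 = .XORSIGN (x_3, y_2);  */
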
2523 /* Process a single gimple statement STMT, which has a MULT_EXPR as
2524    its rhs, and try to convert it into a WIDEN_MULT_EXPR.  The return
2525    value is true iff we converted the statement.  */
2526 
2527 static bool
2528 convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
2529 {
2530   tree lhs, rhs1, rhs2, type, type1, type2;
2531   enum insn_code handler;
2532   scalar_int_mode to_mode, from_mode, actual_mode;
2533   optab op;
2534   int actual_precision;
2535   location_t loc = gimple_location (stmt);
2536   bool from_unsigned1, from_unsigned2;
2537 
2538   lhs = gimple_assign_lhs (stmt);
2539   type = TREE_TYPE (lhs);
2540   if (TREE_CODE (type) != INTEGER_TYPE)
2541     return false;
2542 
2543   if (!is_widening_mult_p (stmt, &type1, &rhs1, &type2, &rhs2))
2544     return false;
2545 
2546   to_mode = SCALAR_INT_TYPE_MODE (type);
2547   from_mode = SCALAR_INT_TYPE_MODE (type1);
2548   if (to_mode == from_mode)
2549     return false;
2550 
2551   from_unsigned1 = TYPE_UNSIGNED (type1);
2552   from_unsigned2 = TYPE_UNSIGNED (type2);
2553 
2554   if (from_unsigned1 && from_unsigned2)
2555     op = umul_widen_optab;
2556   else if (!from_unsigned1 && !from_unsigned2)
2557     op = smul_widen_optab;
2558   else
2559     op = usmul_widen_optab;
2560 
2561   handler = find_widening_optab_handler_and_mode (op, to_mode, from_mode,
2562 						  &actual_mode);
2563 
2564   if (handler == CODE_FOR_nothing)
2565     {
2566       if (op != smul_widen_optab)
2567 	{
2568 	  /* We can use a signed multiply with unsigned types as long as
2569 	     there is a wider mode to use, or it is the smaller of the two
2570 	     types that is unsigned.  Note that type1 >= type2, always.  */
2571 	  if ((TYPE_UNSIGNED (type1)
2572 	       && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2573 	      || (TYPE_UNSIGNED (type2)
2574 		  && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2575 	    {
2576 	      if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2577 		  || GET_MODE_SIZE (to_mode) <= GET_MODE_SIZE (from_mode))
2578 		return false;
2579 	    }
2580 
2581 	  op = smul_widen_optab;
2582 	  handler = find_widening_optab_handler_and_mode (op, to_mode,
2583 							  from_mode,
2584 							  &actual_mode);
2585 
2586 	  if (handler == CODE_FOR_nothing)
2587 	    return false;
2588 
2589 	  from_unsigned1 = from_unsigned2 = false;
2590 	}
2591       else
2592 	return false;
2593     }
2594 
2595   /* Ensure that the inputs to the handler are in the correct precision
2596      for the opcode.  This will be the full mode size.  */
2597   actual_precision = GET_MODE_PRECISION (actual_mode);
2598   if (2 * actual_precision > TYPE_PRECISION (type))
2599     return false;
2600   if (actual_precision != TYPE_PRECISION (type1)
2601       || from_unsigned1 != TYPE_UNSIGNED (type1))
2602     rhs1 = build_and_insert_cast (gsi, loc,
2603 				  build_nonstandard_integer_type
2604 				    (actual_precision, from_unsigned1), rhs1);
2605   if (actual_precision != TYPE_PRECISION (type2)
2606       || from_unsigned2 != TYPE_UNSIGNED (type2))
2607     rhs2 = build_and_insert_cast (gsi, loc,
2608 				  build_nonstandard_integer_type
2609 				    (actual_precision, from_unsigned2), rhs2);
2610 
2611   /* Handle constants.  */
2612   if (TREE_CODE (rhs1) == INTEGER_CST)
2613     rhs1 = fold_convert (type1, rhs1);
2614   if (TREE_CODE (rhs2) == INTEGER_CST)
2615     rhs2 = fold_convert (type2, rhs2);
2616 
2617   gimple_assign_set_rhs1 (stmt, rhs1);
2618   gimple_assign_set_rhs2 (stmt, rhs2);
2619   gimple_assign_set_rhs_code (stmt, WIDEN_MULT_EXPR);
2620   update_stmt (stmt);
2621   widen_mul_stats.widen_mults_inserted++;
2622   return true;
2623 }
2624 
2625 /* Process a single gimple statement STMT, which is found at the
2626    iterator GSI and has a either a PLUS_EXPR or a MINUS_EXPR as its
2627    rhs (given by CODE), and try to convert it into a
2628    WIDEN_MULT_PLUS_EXPR or a WIDEN_MULT_MINUS_EXPR.  The return value
2629    is true iff we converted the statement.  */
2630 
2631 static bool
2632 convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
2633 			    enum tree_code code)
2634 {
2635   gimple *rhs1_stmt = NULL, *rhs2_stmt = NULL;
2636   gimple *conv1_stmt = NULL, *conv2_stmt = NULL, *conv_stmt;
2637   tree type, type1, type2, optype;
2638   tree lhs, rhs1, rhs2, mult_rhs1, mult_rhs2, add_rhs;
2639   enum tree_code rhs1_code = ERROR_MARK, rhs2_code = ERROR_MARK;
2640   optab this_optab;
2641   enum tree_code wmult_code;
2642   enum insn_code handler;
2643   scalar_mode to_mode, from_mode, actual_mode;
2644   location_t loc = gimple_location (stmt);
2645   int actual_precision;
2646   bool from_unsigned1, from_unsigned2;
2647 
2648   lhs = gimple_assign_lhs (stmt);
2649   type = TREE_TYPE (lhs);
2650   if (TREE_CODE (type) != INTEGER_TYPE
2651       && TREE_CODE (type) != FIXED_POINT_TYPE)
2652     return false;
2653 
2654   if (code == MINUS_EXPR)
2655     wmult_code = WIDEN_MULT_MINUS_EXPR;
2656   else
2657     wmult_code = WIDEN_MULT_PLUS_EXPR;
2658 
2659   rhs1 = gimple_assign_rhs1 (stmt);
2660   rhs2 = gimple_assign_rhs2 (stmt);
2661 
2662   if (TREE_CODE (rhs1) == SSA_NAME)
2663     {
2664       rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2665       if (is_gimple_assign (rhs1_stmt))
2666 	rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2667     }
2668 
2669   if (TREE_CODE (rhs2) == SSA_NAME)
2670     {
2671       rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2672       if (is_gimple_assign (rhs2_stmt))
2673 	rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2674     }
2675 
2676   /* Allow for one conversion statement between the multiply
2677      and addition/subtraction statement.  If there is more than
2678      one conversion then we assume it would invalidate this
2679      transformation.  If that's not the case then they should have
2680      been folded before now.  */
2681   if (CONVERT_EXPR_CODE_P (rhs1_code))
2682     {
2683       conv1_stmt = rhs1_stmt;
2684       rhs1 = gimple_assign_rhs1 (rhs1_stmt);
2685       if (TREE_CODE (rhs1) == SSA_NAME)
2686 	{
2687 	  rhs1_stmt = SSA_NAME_DEF_STMT (rhs1);
2688 	  if (is_gimple_assign (rhs1_stmt))
2689 	    rhs1_code = gimple_assign_rhs_code (rhs1_stmt);
2690 	}
2691       else
2692 	return false;
2693     }
2694   if (CONVERT_EXPR_CODE_P (rhs2_code))
2695     {
2696       conv2_stmt = rhs2_stmt;
2697       rhs2 = gimple_assign_rhs1 (rhs2_stmt);
2698       if (TREE_CODE (rhs2) == SSA_NAME)
2699 	{
2700 	  rhs2_stmt = SSA_NAME_DEF_STMT (rhs2);
2701 	  if (is_gimple_assign (rhs2_stmt))
2702 	    rhs2_code = gimple_assign_rhs_code (rhs2_stmt);
2703 	}
2704       else
2705 	return false;
2706     }
2707 
2708   /* If code is WIDEN_MULT_EXPR then it would seem unnecessary to call
2709      is_widening_mult_p, but we still need the operands it returns.
2710 
2711      It might also appear that it would be sufficient to use the existing
2712      operands of the widening multiply, but that would limit the choice of
2713      multiply-and-accumulate instructions.
2714 
2715      If the widened-multiplication result has more than one use, it is
2716      probably wiser not to do the conversion.  Also restrict this operation
2717      to a single basic block to avoid moving the multiply to a different block
2718      with a higher execution frequency.  */
2719   if (code == PLUS_EXPR
2720       && (rhs1_code == MULT_EXPR || rhs1_code == WIDEN_MULT_EXPR))
2721     {
2722       if (!has_single_use (rhs1)
2723 	  || gimple_bb (rhs1_stmt) != gimple_bb (stmt)
2724 	  || !is_widening_mult_p (rhs1_stmt, &type1, &mult_rhs1,
2725 				  &type2, &mult_rhs2))
2726 	return false;
2727       add_rhs = rhs2;
2728       conv_stmt = conv1_stmt;
2729     }
2730   else if (rhs2_code == MULT_EXPR || rhs2_code == WIDEN_MULT_EXPR)
2731     {
2732       if (!has_single_use (rhs2)
2733 	  || gimple_bb (rhs2_stmt) != gimple_bb (stmt)
2734 	  || !is_widening_mult_p (rhs2_stmt, &type1, &mult_rhs1,
2735 				  &type2, &mult_rhs2))
2736 	return false;
2737       add_rhs = rhs1;
2738       conv_stmt = conv2_stmt;
2739     }
2740   else
2741     return false;
2742 
2743   to_mode = SCALAR_TYPE_MODE (type);
2744   from_mode = SCALAR_TYPE_MODE (type1);
2745   if (to_mode == from_mode)
2746     return false;
2747 
2748   from_unsigned1 = TYPE_UNSIGNED (type1);
2749   from_unsigned2 = TYPE_UNSIGNED (type2);
2750   optype = type1;
2751 
2752   /* There's no such thing as a mixed sign madd yet, so use a wider mode.  */
2753   if (from_unsigned1 != from_unsigned2)
2754     {
2755       if (!INTEGRAL_TYPE_P (type))
2756 	return false;
2757       /* We can use a signed multiply with unsigned types as long as
2758 	 there is a wider mode to use, or it is the smaller of the two
2759 	 types that is unsigned.  Note that type1 >= type2, always.  */
2760       if ((from_unsigned1
2761 	   && TYPE_PRECISION (type1) == GET_MODE_PRECISION (from_mode))
2762 	  || (from_unsigned2
2763 	      && TYPE_PRECISION (type2) == GET_MODE_PRECISION (from_mode)))
2764 	{
2765 	  if (!GET_MODE_WIDER_MODE (from_mode).exists (&from_mode)
2766 	      || GET_MODE_SIZE (from_mode) >= GET_MODE_SIZE (to_mode))
2767 	    return false;
2768 	}
2769 
2770       from_unsigned1 = from_unsigned2 = false;
2771       optype = build_nonstandard_integer_type (GET_MODE_PRECISION (from_mode),
2772 					       false);
2773     }
2774 
2775   /* If there was a conversion between the multiply and addition
2776      then we need to make sure it fits a multiply-and-accumulate.
2777      There should be a single mode change which does not change the
2778      value.  */
2779   if (conv_stmt)
2780     {
2781       /* We use the original, unmodified data types for this.  */
2782       tree from_type = TREE_TYPE (gimple_assign_rhs1 (conv_stmt));
2783       tree to_type = TREE_TYPE (gimple_assign_lhs (conv_stmt));
2784       int data_size = TYPE_PRECISION (type1) + TYPE_PRECISION (type2);
2785       bool is_unsigned = TYPE_UNSIGNED (type1) && TYPE_UNSIGNED (type2);
2786 
2787       if (TYPE_PRECISION (from_type) > TYPE_PRECISION (to_type))
2788 	{
2789 	  /* Conversion is a truncate.  */
2790 	  if (TYPE_PRECISION (to_type) < data_size)
2791 	    return false;
2792 	}
2793       else if (TYPE_PRECISION (from_type) < TYPE_PRECISION (to_type))
2794 	{
2795 	  /* Conversion is an extend.  Check it's the right sort.  */
2796 	  if (TYPE_UNSIGNED (from_type) != is_unsigned
2797 	      && !(is_unsigned && TYPE_PRECISION (from_type) > data_size))
2798 	    return false;
2799 	}
2800       /* else convert is a no-op for our purposes.  */
2801     }
2802 
2803   /* Verify that the machine can perform a widening multiply
2804      accumulate in this mode/signedness combination, otherwise
2805      this transformation is likely to pessimize code.  */
2806   this_optab = optab_for_tree_code (wmult_code, optype, optab_default);
2807   handler = find_widening_optab_handler_and_mode (this_optab, to_mode,
2808 						  from_mode, &actual_mode);
2809 
2810   if (handler == CODE_FOR_nothing)
2811     return false;
2812 
2813   /* Ensure that the inputs to the handler are in the correct precision
2814      for the opcode.  This will be the full mode size.  */
2815   actual_precision = GET_MODE_PRECISION (actual_mode);
2816   if (actual_precision != TYPE_PRECISION (type1)
2817       || from_unsigned1 != TYPE_UNSIGNED (type1))
2818     mult_rhs1 = build_and_insert_cast (gsi, loc,
2819 				       build_nonstandard_integer_type
2820 				         (actual_precision, from_unsigned1),
2821 				       mult_rhs1);
2822   if (actual_precision != TYPE_PRECISION (type2)
2823       || from_unsigned2 != TYPE_UNSIGNED (type2))
2824     mult_rhs2 = build_and_insert_cast (gsi, loc,
2825 				       build_nonstandard_integer_type
2826 					 (actual_precision, from_unsigned2),
2827 				       mult_rhs2);
2828 
2829   if (!useless_type_conversion_p (type, TREE_TYPE (add_rhs)))
2830     add_rhs = build_and_insert_cast (gsi, loc, type, add_rhs);
2831 
2832   /* Handle constants.  */
2833   if (TREE_CODE (mult_rhs1) == INTEGER_CST)
2834     mult_rhs1 = fold_convert (type1, mult_rhs1);
2835   if (TREE_CODE (mult_rhs2) == INTEGER_CST)
2836     mult_rhs2 = fold_convert (type2, mult_rhs2);
2837 
2838   gimple_assign_set_rhs_with_ops (gsi, wmult_code, mult_rhs1, mult_rhs2,
2839 				  add_rhs);
2840   update_stmt (gsi_stmt (*gsi));
2841   widen_mul_stats.maccs_inserted++;
2842   return true;
2843 }
2844 
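/* EDITORIAL NOTE (illustrative, not part of the original source): on a
   target with a widening multiply-accumulate pattern, the function above
   turns, e.g. with a and b of type short and acc of type int,

     _1 = (int) a_2;
     _3 = (int) b_4;
     _5 = _1 * _3;
     acc_6 = _5 + acc_7;

   into roughly acc_6 = WIDEN_MULT_PLUS_EXPR <a_2, b_4, acc_7>.  */
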
2845 /* Given a result MUL_RESULT which is a result of a multiplication of OP1 and
2846    OP2 and which we know is used in statements that can be, together with the
2847    multiplication, converted to FMAs, perform the transformation.  */
2848 
2849 static void
2850 convert_mult_to_fma_1 (tree mul_result, tree op1, tree op2)
2851 {
2852   tree type = TREE_TYPE (mul_result);
2853   gimple *use_stmt;
2854   imm_use_iterator imm_iter;
2855   gcall *fma_stmt;
2856 
2857   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, mul_result)
2858     {
2859       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
2860       tree addop, mulop1 = op1, result = mul_result;
2861       bool negate_p = false;
2862       gimple_seq seq = NULL;
2863 
2864       if (is_gimple_debug (use_stmt))
2865 	continue;
2866 
2867       if (is_gimple_assign (use_stmt)
2868 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
2869 	{
2870 	  result = gimple_assign_lhs (use_stmt);
2871 	  use_operand_p use_p;
2872 	  gimple *neguse_stmt;
2873 	  single_imm_use (gimple_assign_lhs (use_stmt), &use_p, &neguse_stmt);
2874 	  gsi_remove (&gsi, true);
2875 	  release_defs (use_stmt);
2876 
2877 	  use_stmt = neguse_stmt;
2878 	  gsi = gsi_for_stmt (use_stmt);
2879 	  negate_p = true;
2880 	}
2881 
2882       tree cond, else_value, ops[3];
2883       tree_code code;
2884       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code,
2885 					      ops, &else_value))
2886 	gcc_unreachable ();
2887       addop = ops[0] == result ? ops[1] : ops[0];
2888 
2889       if (code == MINUS_EXPR)
2890 	{
2891 	  if (ops[0] == result)
2892 	    /* a * b - c -> a * b + (-c)  */
2893 	    addop = gimple_build (&seq, NEGATE_EXPR, type, addop);
2894 	  else
2895 	    /* a - b * c -> (-b) * c + a */
2896 	    negate_p = !negate_p;
2897 	}
2898 
2899       if (negate_p)
2900 	mulop1 = gimple_build (&seq, NEGATE_EXPR, type, mulop1);
2901 
2902       if (seq)
2903 	gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
2904 
2905       if (cond)
2906 	fma_stmt = gimple_build_call_internal (IFN_COND_FMA, 5, cond, mulop1,
2907 					       op2, addop, else_value);
2908       else
2909 	fma_stmt = gimple_build_call_internal (IFN_FMA, 3, mulop1, op2, addop);
2910       gimple_set_lhs (fma_stmt, gimple_get_lhs (use_stmt));
2911       gimple_call_set_nothrow (fma_stmt, !stmt_can_throw_internal (cfun,
2912 								   use_stmt));
2913       gsi_replace (&gsi, fma_stmt, true);
2914       /* Follow all SSA edges so that we generate FMS, FNMA and FNMS
2915 	 regardless of where the negation occurs.  */
2916       gimple *orig_stmt = gsi_stmt (gsi);
2917       if (fold_stmt (&gsi, follow_all_ssa_edges))
2918 	{
2919 	  if (maybe_clean_or_replace_eh_stmt (orig_stmt, gsi_stmt (gsi)))
2920 	    gcc_unreachable ();
2921 	  update_stmt (gsi_stmt (gsi));
2922 	}
2923 
2924       if (dump_file && (dump_flags & TDF_DETAILS))
2925 	{
2926 	  fprintf (dump_file, "Generated FMA ");
2927 	  print_gimple_stmt (dump_file, gsi_stmt (gsi), 0, TDF_NONE);
2928 	  fprintf (dump_file, "\n");
2929 	}
2930 
2931       widen_mul_stats.fmas_inserted++;
2932     }
2933 }
2934 
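/* EDITORIAL NOTE (not part of the original source): a summary of the
   shapes convert_mult_to_fma_1 produces.  It always emits IFN_FMA (or
   IFN_COND_FMA) and relies on the fold at the end to follow SSA edges
   through the negates:

     a * b + c     -> .FMA (a, b, c)
     a * b - c     -> a * b + (-c)  -> folds towards .FMS
     -(a * b) + c  -> (-a) * b + c  -> folds towards .FNMA
     conditional   -> .COND_FMA (cond, a, b, c, else_value)  */
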
2935 /* Data necessary to perform the actual transformation from a multiplication
2936    and an addition to an FMA after decision is taken it should be done and to
2937    then delete the multiplication statement from the function IL.  */
2938 
2939 struct fma_transformation_info
2940 {
2941   gimple *mul_stmt;
2942   tree mul_result;
2943   tree op1;
2944   tree op2;
2945 };
2946 
2947 /* Structure containing the current state of FMA deferring, i.e. whether we are
2948    deferring, whether to continue deferring, and all data necessary to come
2949    back and perform all deferred transformations.  */
2950 
2951 class fma_deferring_state
2952 {
2953 public:
2954   /* Class constructor.  Pass true as PERFORM_DEFERRING in order to actually
2955      do any deferring.  */
2956 
2957   fma_deferring_state (bool perform_deferring)
2958     : m_candidates (), m_mul_result_set (), m_initial_phi (NULL),
2959       m_last_result (NULL_TREE), m_deferring_p (perform_deferring) {}
2960 
2961   /* List of FMA candidates for which the transformation has been determined
2962      possible but which, at this point in the BB analysis, we do not consider
2963      beneficial.  */
2964   auto_vec<fma_transformation_info, 8> m_candidates;
2965 
2966   /* Set of results of multiplication that are part of already deferred FMA
2967      candidates.  */
2968   hash_set<tree> m_mul_result_set;
2969 
2970   /* The PHI that supposedly feeds back the result of one FMA to another over
2971      a loop boundary.  */
2972   gphi *m_initial_phi;
2973 
2974   /* Result of the last produced FMA candidate or NULL if there has not been
2975      one.  */
2976   tree m_last_result;
2977 
2978   /* If true, deferring might still be profitable.  If false, transform all
2979      candidates and no longer defer.  */
2980   bool m_deferring_p;
2981 };
2982 
2983 /* Transform all deferred FMA candidates and mark STATE as no longer
2984    deferring.  */
2985 
2986 static void
2987 cancel_fma_deferring (fma_deferring_state *state)
2988 {
2989   if (!state->m_deferring_p)
2990     return;
2991 
2992   for (unsigned i = 0; i < state->m_candidates.length (); i++)
2993     {
2994       if (dump_file && (dump_flags & TDF_DETAILS))
2995 	fprintf (dump_file, "Generating deferred FMA\n");
2996 
2997       const fma_transformation_info &fti = state->m_candidates[i];
2998       convert_mult_to_fma_1 (fti.mul_result, fti.op1, fti.op2);
2999 
3000       gimple_stmt_iterator gsi = gsi_for_stmt (fti.mul_stmt);
3001       gsi_remove (&gsi, true);
3002       release_defs (fti.mul_stmt);
3003     }
3004   state->m_deferring_p = false;
3005 }
3006 
3007 /* If OP is an SSA name defined by a PHI node, return the PHI statement.
3008    Otherwise return NULL.  */
3009 
3010 static gphi *
3011 result_of_phi (tree op)
3012 {
3013   if (TREE_CODE (op) != SSA_NAME)
3014     return NULL;
3015 
3016   return dyn_cast <gphi *> (SSA_NAME_DEF_STMT (op));
3017 }
3018 
3019 /* After processing statements of a BB and recording STATE, return true if the
3020    initial phi is fed by the last FMA candidate result or one such result from
3021    previously processed BBs marked in LAST_RESULT_SET.  */
3022 
3023 static bool
3024 last_fma_candidate_feeds_initial_phi (fma_deferring_state *state,
3025 				      hash_set<tree> *last_result_set)
3026 {
3027   ssa_op_iter iter;
3028   use_operand_p use;
3029   FOR_EACH_PHI_ARG (use, state->m_initial_phi, iter, SSA_OP_USE)
3030     {
3031       tree t = USE_FROM_PTR (use);
3032       if (t == state->m_last_result
3033 	  || last_result_set->contains (t))
3034 	return true;
3035     }
3036 
3037   return false;
3038 }
3039 
3040 /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
3041    with uses in additions and subtractions to form fused multiply-add
3042    operations.  Returns true if successful and MUL_STMT should be removed.
3043    If MUL_COND is nonnull, the multiplication in MUL_STMT is conditional
3044    on MUL_COND, otherwise it is unconditional.
3045 
3046    If STATE indicates that we are deferring FMA transformation, that means
3047    that we do not produce FMAs for basic blocks which look like:
3048 
3049     <bb 6>
3050     # accumulator_111 = PHI <0.0(5), accumulator_66(6)>
3051     _65 = _14 * _16;
3052     accumulator_66 = _65 + accumulator_111;
3053 
3054   or its unrolled version, i.e. with several FMA candidates that feed the result
3055   of one into the addend of another.  Instead, we add them to a list in STATE
3056   and if we later discover an FMA candidate that is not part of such a chain,
3057   we go back and perform all deferred past candidates.  */
3058 
3059 static bool
3060 convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
3061 		     fma_deferring_state *state, tree mul_cond = NULL_TREE)
3062 {
3063   tree mul_result = gimple_get_lhs (mul_stmt);
3064   tree type = TREE_TYPE (mul_result);
3065   gimple *use_stmt, *neguse_stmt;
3066   use_operand_p use_p;
3067   imm_use_iterator imm_iter;
3068 
3069   if (FLOAT_TYPE_P (type)
3070       && flag_fp_contract_mode == FP_CONTRACT_OFF)
3071     return false;
3072 
3073   /* We don't want to do bitfield reduction ops.  */
3074   if (INTEGRAL_TYPE_P (type)
3075       && (!type_has_mode_precision_p (type) || TYPE_OVERFLOW_TRAPS (type)))
3076     return false;
3077 
3078   /* If the target doesn't support it, don't generate it.  We assume that
3079      if fma isn't available then fms, fnma or fnms are not either.  */
3080   optimization_type opt_type = bb_optimization_type (gimple_bb (mul_stmt));
3081   if (!direct_internal_fn_supported_p (IFN_FMA, type, opt_type))
3082     return false;
3083 
3084   /* If the multiplication has zero uses, it is kept around probably because
3085      of -fnon-call-exceptions.  Don't optimize it away in that case,
3086      it is DCE job.  */
3087      it is DCE's job.  */
3088     return false;
3089 
3090   bool check_defer
3091     = (state->m_deferring_p
3092        && maybe_le (tree_to_poly_int64 (TYPE_SIZE (type)),
3093 		    param_avoid_fma_max_bits));
3094   bool defer = check_defer;
3095   bool seen_negate_p = false;
3096   /* Make sure that the multiplication statement becomes dead after
3097      the transformation, thus that all uses are transformed to FMAs.
3098      This means we assume that an FMA operation has the same cost
3099      as an addition.  */
3100   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, mul_result)
3101     {
3102       tree result = mul_result;
3103       bool negate_p = false;
3104 
3105       use_stmt = USE_STMT (use_p);
3106 
3107       if (is_gimple_debug (use_stmt))
3108 	continue;
3109 
3110       /* For now restrict this operation to single basic blocks.  In theory
3111 	 we would want to support sinking the multiplication in
3112 	 m = a*b;
3113 	 if ()
3114 	   ma = m + c;
3115 	 else
3116 	   d = m;
3117 	 to form a fma in the then block and sink the multiplication to the
3118 	 else block.  */
3119       if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3120 	return false;
3121 
3122       /* A negate on the multiplication leads to FNMA.  */
3123       if (is_gimple_assign (use_stmt)
3124 	  && gimple_assign_rhs_code (use_stmt) == NEGATE_EXPR)
3125 	{
3126 	  ssa_op_iter iter;
3127 	  use_operand_p usep;
3128 
3129 	  /* If (due to earlier missed optimizations) we have two
3130 	     negates of the same value, treat them as equivalent
3131 	     to a single negate with multiple uses.  */
3132 	  if (seen_negate_p)
3133 	    return false;
3134 
3135 	  result = gimple_assign_lhs (use_stmt);
3136 
3137 	  /* Make sure the negate statement becomes dead with this
3138 	     single transformation.  */
3139 	  if (!single_imm_use (gimple_assign_lhs (use_stmt),
3140 			       &use_p, &neguse_stmt))
3141 	    return false;
3142 
3143 	  /* Make sure the multiplication isn't also used on that stmt.  */
3144 	  FOR_EACH_PHI_OR_STMT_USE (usep, neguse_stmt, iter, SSA_OP_USE)
3145 	    if (USE_FROM_PTR (usep) == mul_result)
3146 	      return false;
3147 
3148 	  /* Re-validate.  */
3149 	  use_stmt = neguse_stmt;
3150 	  if (gimple_bb (use_stmt) != gimple_bb (mul_stmt))
3151 	    return false;
3152 
3153 	  negate_p = seen_negate_p = true;
3154 	}
3155 
3156       tree cond, else_value, ops[3];
3157       tree_code code;
3158       if (!can_interpret_as_conditional_op_p (use_stmt, &cond, &code, ops,
3159 					      &else_value))
3160 	return false;
3161 
3162       switch (code)
3163 	{
3164 	case MINUS_EXPR:
3165 	  if (ops[1] == result)
3166 	    negate_p = !negate_p;
3167 	  break;
3168 	case PLUS_EXPR:
3169 	  break;
3170 	default:
3171 	  /* FMA can only be formed from PLUS and MINUS.  */
3172 	  return false;
3173 	}
3174 
3175       if (mul_cond && cond != mul_cond)
3176 	return false;
3177 
3178       if (cond)
3179 	{
3180 	  if (cond == result || else_value == result)
3181 	    return false;
3182 	  if (!direct_internal_fn_supported_p (IFN_COND_FMA, type, opt_type))
3183 	    return false;
3184 	}
3185 
3186       /* If the subtrahend (OPS[1]) is computed by a MULT_EXPR that
3187 	 we'll visit later, we might be able to get a more profitable
3188 	 match with fnma.
3189 	 OTOH, if we don't, a negate / fma pair likely has lower latency
3190 	 than a mult / subtract pair.  */
3191       if (code == MINUS_EXPR
3192 	  && !negate_p
3193 	  && ops[0] == result
3194 	  && !direct_internal_fn_supported_p (IFN_FMS, type, opt_type)
3195 	  && direct_internal_fn_supported_p (IFN_FNMA, type, opt_type)
3196 	  && TREE_CODE (ops[1]) == SSA_NAME
3197 	  && has_single_use (ops[1]))
3198 	{
3199 	  gimple *stmt2 = SSA_NAME_DEF_STMT (ops[1]);
3200 	  if (is_gimple_assign (stmt2)
3201 	      && gimple_assign_rhs_code (stmt2) == MULT_EXPR)
3202 	    return false;
3203 	}
3204 
3205       /* We can't handle a * b + a * b.  */
3206       if (ops[0] == ops[1])
3207 	return false;
3208       /* If deferring, make sure we are not looking at an instruction that
3209 	 wouldn't have existed if we were not.  */
3210       if (state->m_deferring_p
3211 	  && (state->m_mul_result_set.contains (ops[0])
3212 	      || state->m_mul_result_set.contains (ops[1])))
3213 	return false;
3214 
3215       if (check_defer)
3216 	{
3217 	  tree use_lhs = gimple_get_lhs (use_stmt);
3218 	  if (state->m_last_result)
3219 	    {
3220 	      if (ops[1] == state->m_last_result
3221 		  || ops[0] == state->m_last_result)
3222 		defer = true;
3223 	      else
3224 		defer = false;
3225 	    }
3226 	  else
3227 	    {
3228 	      gcc_checking_assert (!state->m_initial_phi);
3229 	      gphi *phi;
3230 	      if (ops[0] == result)
3231 		phi = result_of_phi (ops[1]);
3232 	      else
3233 		{
3234 		  gcc_assert (ops[1] == result);
3235 		  phi = result_of_phi (ops[0]);
3236 		}
3237 
3238 	      if (phi)
3239 		{
3240 		  state->m_initial_phi = phi;
3241 		  defer = true;
3242 		}
3243 	      else
3244 		defer = false;
3245 	    }
3246 
3247 	  state->m_last_result = use_lhs;
3248 	  check_defer = false;
3249 	}
3250       else
3251 	defer = false;
3252 
3253       /* While it is possible to validate whether or not the exact form that
3254 	 we've recognized is available in the backend, the assumption is that
3255 	 if the deferring logic above did not trigger, the transformation is
3256 	 never a loss.  For instance, suppose the target only has the plain FMA
3257 	 pattern available.  Consider a*b-c -> fma(a,b,-c): we've exchanged
3258 	 MUL+SUB for FMA+NEG, which is still two operations.  Consider
3259          -(a*b)-c -> fma(-a,b,-c): we still have 3 operations, but in the FMA
3260 	 form the two NEGs are independent and could be run in parallel.  */
3261     }
3262 
3263   if (defer)
3264     {
3265       fma_transformation_info fti;
3266       fti.mul_stmt = mul_stmt;
3267       fti.mul_result = mul_result;
3268       fti.op1 = op1;
3269       fti.op2 = op2;
3270       state->m_candidates.safe_push (fti);
3271       state->m_mul_result_set.add (mul_result);
3272 
3273       if (dump_file && (dump_flags & TDF_DETAILS))
3274 	{
3275 	  fprintf (dump_file, "Deferred generating FMA for multiplication ");
3276 	  print_gimple_stmt (dump_file, mul_stmt, 0, TDF_NONE);
3277 	  fprintf (dump_file, "\n");
3278 	}
3279 
3280       return false;
3281     }
3282   else
3283     {
3284       if (state->m_deferring_p)
3285 	cancel_fma_deferring (state);
3286       convert_mult_to_fma_1 (mul_result, op1, op2);
3287       return true;
3288     }
3289 }
3290 
3291 
3292 /* Helper function of match_uaddsub_overflow.  Return 1 if
3293    USE_STMT is an unsigned overflow check (ovf != 0) for STMT,
3294    -1 if USE_STMT is an unsigned no-overflow check (ovf == 0),
3295    and 0 otherwise.  */
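
/* For example (illustrative): given STMT "x_1 = y_2 - z_3", the use
   "if (x_1 > y_2)" is an overflow check (return 1), while
   "if (x_1 <= y_2)" is the inverted check (return -1).  */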
3296 
3297 static int
3298 uaddsub_overflow_check_p (gimple *stmt, gimple *use_stmt)
3299 {
3300   enum tree_code ccode = ERROR_MARK;
3301   tree crhs1 = NULL_TREE, crhs2 = NULL_TREE;
3302   if (gimple_code (use_stmt) == GIMPLE_COND)
3303     {
3304       ccode = gimple_cond_code (use_stmt);
3305       crhs1 = gimple_cond_lhs (use_stmt);
3306       crhs2 = gimple_cond_rhs (use_stmt);
3307     }
3308   else if (is_gimple_assign (use_stmt))
3309     {
3310       if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3311 	{
3312 	  ccode = gimple_assign_rhs_code (use_stmt);
3313 	  crhs1 = gimple_assign_rhs1 (use_stmt);
3314 	  crhs2 = gimple_assign_rhs2 (use_stmt);
3315 	}
3316       else if (gimple_assign_rhs_code (use_stmt) == COND_EXPR)
3317 	{
3318 	  tree cond = gimple_assign_rhs1 (use_stmt);
3319 	  if (COMPARISON_CLASS_P (cond))
3320 	    {
3321 	      ccode = TREE_CODE (cond);
3322 	      crhs1 = TREE_OPERAND (cond, 0);
3323 	      crhs2 = TREE_OPERAND (cond, 1);
3324 	    }
3325 	  else
3326 	    return 0;
3327 	}
3328       else
3329 	return 0;
3330     }
3331   else
3332     return 0;
3333 
3334   if (TREE_CODE_CLASS (ccode) != tcc_comparison)
3335     return 0;
3336 
3337   enum tree_code code = gimple_assign_rhs_code (stmt);
3338   tree lhs = gimple_assign_lhs (stmt);
3339   tree rhs1 = gimple_assign_rhs1 (stmt);
3340   tree rhs2 = gimple_assign_rhs2 (stmt);
3341 
3342   switch (ccode)
3343     {
3344     case GT_EXPR:
3345     case LE_EXPR:
3346       /* r = a - b; r > a or r <= a
3347 	 r = a + b; a > r or a <= r or b > r or b <= r.  */
3348       if ((code == MINUS_EXPR && crhs1 == lhs && crhs2 == rhs1)
3349 	  || (code == PLUS_EXPR && (crhs1 == rhs1 || crhs1 == rhs2)
3350 	      && crhs2 == lhs))
3351 	return ccode == GT_EXPR ? 1 : -1;
3352       break;
3353     case LT_EXPR:
3354     case GE_EXPR:
3355       /* r = a - b; a < r or a >= r
3356 	 r = a + b; r < a or r >= a or r < b or r >= b.  */
3357       if ((code == MINUS_EXPR && crhs1 == rhs1 && crhs2 == lhs)
3358 	  || (code == PLUS_EXPR && crhs1 == lhs
3359 	      && (crhs2 == rhs1 || crhs2 == rhs2)))
3360 	return ccode == LT_EXPR ? 1 : -1;
3361       break;
3362     default:
3363       break;
3364     }
3365   return 0;
3366 }
3367 
3368 /* Recognize for unsigned x
3369    x = y - z;
3370    if (x > y)
3371    where there are other uses of x and replace it with
3372    _7 = SUB_OVERFLOW (y, z);
3373    x = REALPART_EXPR <_7>;
3374    _8 = IMAGPART_EXPR <_7>;
3375    if (_8)
3376    and similarly for addition.  */
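
/* An illustrative (hypothetical) C fragment that matches is

	unsigned sub (unsigned y, unsigned z, unsigned *res)
	{
	  unsigned x = y - z;
	  *res = x;
	  return x > y;
	}

   where x > y is the canonical unsigned-underflow test; note that x must
   also have a non-check use (here the store to *res), since the transform
   below gives up when the overflow check is the only use.  */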
3377 
3378 static bool
3379 match_uaddsub_overflow (gimple_stmt_iterator *gsi, gimple *stmt,
3380 			enum tree_code code)
3381 {
3382   tree lhs = gimple_assign_lhs (stmt);
3383   tree type = TREE_TYPE (lhs);
3384   use_operand_p use_p;
3385   imm_use_iterator iter;
3386   bool use_seen = false;
3387   bool ovf_use_seen = false;
3388   gimple *use_stmt;
3389 
3390   gcc_checking_assert (code == PLUS_EXPR || code == MINUS_EXPR);
3391   if (!INTEGRAL_TYPE_P (type)
3392       || !TYPE_UNSIGNED (type)
3393       || has_zero_uses (lhs)
3394       || has_single_use (lhs)
3395       || optab_handler (code == PLUS_EXPR ? uaddv4_optab : usubv4_optab,
3396 			TYPE_MODE (type)) == CODE_FOR_nothing)
3397     return false;
3398 
3399   FOR_EACH_IMM_USE_FAST (use_p, iter, lhs)
3400     {
3401       use_stmt = USE_STMT (use_p);
3402       if (is_gimple_debug (use_stmt))
3403 	continue;
3404 
3405       if (uaddsub_overflow_check_p (stmt, use_stmt))
3406 	ovf_use_seen = true;
3407       else
3408 	use_seen = true;
3409       if (ovf_use_seen && use_seen)
3410 	break;
3411     }
3412 
3413   if (!ovf_use_seen || !use_seen)
3414     return false;
3415 
3416   tree ctype = build_complex_type (type);
3417   tree rhs1 = gimple_assign_rhs1 (stmt);
3418   tree rhs2 = gimple_assign_rhs2 (stmt);
3419   gcall *g = gimple_build_call_internal (code == PLUS_EXPR
3420 					 ? IFN_ADD_OVERFLOW : IFN_SUB_OVERFLOW,
3421 					 2, rhs1, rhs2);
3422   tree ctmp = make_ssa_name (ctype);
3423   gimple_call_set_lhs (g, ctmp);
3424   gsi_insert_before (gsi, g, GSI_SAME_STMT);
3425   gassign *g2 = gimple_build_assign (lhs, REALPART_EXPR,
3426 				     build1 (REALPART_EXPR, type, ctmp));
3427   gsi_replace (gsi, g2, true);
3428   tree ovf = make_ssa_name (type);
3429   g2 = gimple_build_assign (ovf, IMAGPART_EXPR,
3430 			    build1 (IMAGPART_EXPR, type, ctmp));
3431   gsi_insert_after (gsi, g2, GSI_NEW_STMT);
3432 
3433   FOR_EACH_IMM_USE_STMT (use_stmt, iter, lhs)
3434     {
3435       if (is_gimple_debug (use_stmt))
3436 	continue;
3437 
3438       int ovf_use = uaddsub_overflow_check_p (stmt, use_stmt);
3439       if (ovf_use == 0)
3440 	continue;
3441       if (gimple_code (use_stmt) == GIMPLE_COND)
3442 	{
3443 	  gcond *cond_stmt = as_a <gcond *> (use_stmt);
3444 	  gimple_cond_set_lhs (cond_stmt, ovf);
3445 	  gimple_cond_set_rhs (cond_stmt, build_int_cst (type, 0));
3446 	  gimple_cond_set_code (cond_stmt, ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3447 	}
3448       else
3449 	{
3450 	  gcc_checking_assert (is_gimple_assign (use_stmt));
3451 	  if (gimple_assign_rhs_class (use_stmt) == GIMPLE_BINARY_RHS)
3452 	    {
3453 	      gimple_assign_set_rhs1 (use_stmt, ovf);
3454 	      gimple_assign_set_rhs2 (use_stmt, build_int_cst (type, 0));
3455 	      gimple_assign_set_rhs_code (use_stmt,
3456 					  ovf_use == 1 ? NE_EXPR : EQ_EXPR);
3457 	    }
3458 	  else
3459 	    {
3460 	      gcc_checking_assert (gimple_assign_rhs_code (use_stmt)
3461 				   == COND_EXPR);
3462 	      tree cond = build2 (ovf_use == 1 ? NE_EXPR : EQ_EXPR,
3463 				  boolean_type_node, ovf,
3464 				  build_int_cst (type, 0));
3465 	      gimple_assign_set_rhs1 (use_stmt, cond);
3466 	    }
3467 	}
3468       update_stmt (use_stmt);
3469     }
3470   return true;
3471 }
3472 
3473 /* Return true if the target has support for divmod.  */
3474 
3475 static bool
3476 target_supports_divmod_p (optab divmod_optab, optab div_optab, machine_mode mode)
3477 {
3478   /* If target supports hardware divmod insn, use it for divmod.  */
3479   if (optab_handler (divmod_optab, mode) != CODE_FOR_nothing)
3480     return true;
3481 
3482   /* Check if libfunc for divmod is available.  */
3483   rtx libfunc = optab_libfunc (divmod_optab, mode);
3484   if (libfunc != NULL_RTX)
3485     {
3486       /* If optab_handler exists for div_optab, perhaps in a wider mode,
3487 	 we don't want to use the libfunc even if it exists for the given mode.  */
3488       machine_mode div_mode;
3489       FOR_EACH_MODE_FROM (div_mode, mode)
3490 	if (optab_handler (div_optab, div_mode) != CODE_FOR_nothing)
3491 	  return false;
3492 
3493       return targetm.expand_divmod_libfunc != NULL;
3494     }
3495 
3496   return false;
3497 }
3498 
3499 /* Check if stmt is a candidate for the divmod transform.  */
3500 
3501 static bool
3502 divmod_candidate_p (gassign *stmt)
3503 {
3504   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
3505   machine_mode mode = TYPE_MODE (type);
3506   optab divmod_optab, div_optab;
3507 
3508   if (TYPE_UNSIGNED (type))
3509     {
3510       divmod_optab = udivmod_optab;
3511       div_optab = udiv_optab;
3512     }
3513   else
3514     {
3515       divmod_optab = sdivmod_optab;
3516       div_optab = sdiv_optab;
3517     }
3518 
3519   tree op1 = gimple_assign_rhs1 (stmt);
3520   tree op2 = gimple_assign_rhs2 (stmt);
3521 
3522   /* Disable the transform if either is a constant, since division-by-constant
3523      may have specialized expansion.  */
3524   if (CONSTANT_CLASS_P (op1) || CONSTANT_CLASS_P (op2))
3525     return false;
3526 
3527   /* Exclude the case where TYPE_OVERFLOW_TRAPS (type) as that should
3528      expand using the [su]divv optabs.  */
3529   if (TYPE_OVERFLOW_TRAPS (type))
3530     return false;
3531 
3532   if (!target_supports_divmod_p (divmod_optab, div_optab, mode))
3533     return false;
3534 
3535   return true;
3536 }
3537 
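/* Note (illustrative): a division by a constant such as x / 7 is usually
   expanded as multiplication by a precomputed "magic" constant followed by
   shifts, which is why constant operands are rejected above instead of
   being folded into a DIVMOD call.  */
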
3538 /* This function looks for:
3539    t1 = a TRUNC_DIV_EXPR b;
3540    t2 = a TRUNC_MOD_EXPR b;
3541    and transforms them into the following sequence:
3542    complex_tmp = DIVMOD (a, b);
3543    t1 = REALPART_EXPR (complex_tmp);
3544    t2 = IMAGPART_EXPR (complex_tmp);
3545    For conditions enabling the transform see divmod_candidate_p().
3546 
3547    The pass has three parts:
3548    1) Find top_stmt which is trunc_div or trunc_mod stmt and dominates all
3549       other trunc_div_expr and trunc_mod_expr stmts.
3550    2) Add top_stmt and all trunc_div and trunc_mod stmts dominated by top_stmt
3551       to stmts vector.
3552    3) Insert DIVMOD call just before top_stmt and update entries in
3553       stmts vector to use the return value of DIVMOD (REALPART_EXPR for div,
3554       IMAGPART_EXPR for mod).  */
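
/* An illustrative (hypothetical) C fragment that triggers the transform is

	void qr (int a, int b, int *q, int *r)
	{
	  *q = a / b;
	  *r = a % b;
	}

   where the division and the modulus share the same non-constant operands,
   so both results can be produced by a single DIVMOD call.  */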
3555 
3556 static bool
3557 convert_to_divmod (gassign *stmt)
3558 {
3559   if (stmt_can_throw_internal (cfun, stmt)
3560       || !divmod_candidate_p (stmt))
3561     return false;
3562 
3563   tree op1 = gimple_assign_rhs1 (stmt);
3564   tree op2 = gimple_assign_rhs2 (stmt);
3565 
3566   imm_use_iterator use_iter;
3567   gimple *use_stmt;
3568   auto_vec<gimple *> stmts;
3569 
3570   gimple *top_stmt = stmt;
3571   basic_block top_bb = gimple_bb (stmt);
3572 
3573   /* Part 1: Try to set top_stmt to the "topmost" stmt that dominates
3574      at least stmt and possibly other trunc_div/trunc_mod stmts
3575      having the same operands as stmt.  */
3576 
3577   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, op1)
3578     {
3579       if (is_gimple_assign (use_stmt)
3580 	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3581 	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3582 	  && operand_equal_p (op1, gimple_assign_rhs1 (use_stmt), 0)
3583 	  && operand_equal_p (op2, gimple_assign_rhs2 (use_stmt), 0))
3584 	{
3585 	  if (stmt_can_throw_internal (cfun, use_stmt))
3586 	    continue;
3587 
3588 	  basic_block bb = gimple_bb (use_stmt);
3589 
3590 	  if (bb == top_bb)
3591 	    {
3592 	      if (gimple_uid (use_stmt) < gimple_uid (top_stmt))
3593 		top_stmt = use_stmt;
3594 	    }
3595 	  else if (dominated_by_p (CDI_DOMINATORS, top_bb, bb))
3596 	    {
3597 	      top_bb = bb;
3598 	      top_stmt = use_stmt;
3599 	    }
3600 	}
3601     }
3602 
3603   tree top_op1 = gimple_assign_rhs1 (top_stmt);
3604   tree top_op2 = gimple_assign_rhs2 (top_stmt);
3605 
3606   stmts.safe_push (top_stmt);
3607   bool div_seen = (gimple_assign_rhs_code (top_stmt) == TRUNC_DIV_EXPR);
3608 
3609   /* Part 2: Add all trunc_div/trunc_mod statements dominated by top_bb
3610      to the stmts vector.  This loop will always add stmt itself, since
3611      gimple_bb (top_stmt) dominates gimple_bb (stmt), so it ends up
3612      adding at least one trunc_mod_expr stmt.  */
3613 
3614   FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, top_op1)
3615     {
3616       if (is_gimple_assign (use_stmt)
3617 	  && (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR
3618 	      || gimple_assign_rhs_code (use_stmt) == TRUNC_MOD_EXPR)
3619 	  && operand_equal_p (top_op1, gimple_assign_rhs1 (use_stmt), 0)
3620 	  && operand_equal_p (top_op2, gimple_assign_rhs2 (use_stmt), 0))
3621 	{
3622 	  if (use_stmt == top_stmt
3623 	      || stmt_can_throw_internal (cfun, use_stmt)
3624 	      || !dominated_by_p (CDI_DOMINATORS, gimple_bb (use_stmt), top_bb))
3625 	    continue;
3626 
3627 	  stmts.safe_push (use_stmt);
3628 	  if (gimple_assign_rhs_code (use_stmt) == TRUNC_DIV_EXPR)
3629 	    div_seen = true;
3630 	}
3631     }
3632 
3633   if (!div_seen)
3634     return false;
3635 
3636   /* Part 3: Create a call to the internal function DIVMOD:
3637      divmod_tmp = DIVMOD (op1, op2).  */
3638 
3639   gcall *call_stmt = gimple_build_call_internal (IFN_DIVMOD, 2, op1, op2);
3640   tree res = make_temp_ssa_name (build_complex_type (TREE_TYPE (op1)),
3641 				 call_stmt, "divmod_tmp");
3642   gimple_call_set_lhs (call_stmt, res);
3643   /* We rejected throwing statements above.  */
3644   gimple_call_set_nothrow (call_stmt, true);
3645 
3646   /* Insert the call before top_stmt.  */
3647   gimple_stmt_iterator top_stmt_gsi = gsi_for_stmt (top_stmt);
3648   gsi_insert_before (&top_stmt_gsi, call_stmt, GSI_SAME_STMT);
3649 
3650   widen_mul_stats.divmod_calls_inserted++;
3651 
3652   /* Update all statements in stmts vector:
3653      lhs = op1 TRUNC_DIV_EXPR op2 -> lhs = REALPART_EXPR<divmod_tmp>
3654      lhs = op1 TRUNC_MOD_EXPR op2 -> lhs = IMAGPART_EXPR<divmod_tmp>.  */
3655 
3656   for (unsigned i = 0; stmts.iterate (i, &use_stmt); ++i)
3657     {
3658       tree new_rhs;
3659 
3660       switch (gimple_assign_rhs_code (use_stmt))
3661 	{
3662 	  case TRUNC_DIV_EXPR:
3663 	    new_rhs = fold_build1 (REALPART_EXPR, TREE_TYPE (op1), res);
3664 	    break;
3665 
3666 	  case TRUNC_MOD_EXPR:
3667 	    new_rhs = fold_build1 (IMAGPART_EXPR, TREE_TYPE (op1), res);
3668 	    break;
3669 
3670 	  default:
3671 	    gcc_unreachable ();
3672 	}
3673 
3674       gimple_stmt_iterator gsi = gsi_for_stmt (use_stmt);
3675       gimple_assign_set_rhs_from_tree (&gsi, new_rhs);
3676       update_stmt (use_stmt);
3677     }
3678 
3679   return true;
3680 }
3681 
3682 /* Find integer multiplications where the operands are extended from
3683    smaller types, and replace the MULT_EXPR with a WIDEN_MULT_EXPR
3684    where appropriate.  */
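
/* For instance (illustrative), on a target with a widening 32x32->64
   multiply instruction the function

	long long wmul (int a, int b)
	{
	  return (long long) a * b;
	}

   multiplies two values extended from int, and the MULT_EXPR on the
   extended operands becomes a single WIDEN_MULT_EXPR.  */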
3685 
3686 namespace {
3687 
3688 const pass_data pass_data_optimize_widening_mul =
3689 {
3690   GIMPLE_PASS, /* type */
3691   "widening_mul", /* name */
3692   OPTGROUP_NONE, /* optinfo_flags */
3693   TV_TREE_WIDEN_MUL, /* tv_id */
3694   PROP_ssa, /* properties_required */
3695   0, /* properties_provided */
3696   0, /* properties_destroyed */
3697   0, /* todo_flags_start */
3698   TODO_update_ssa, /* todo_flags_finish */
3699 };
3700 
3701 class pass_optimize_widening_mul : public gimple_opt_pass
3702 {
3703 public:
3704   pass_optimize_widening_mul (gcc::context *ctxt)
3705     : gimple_opt_pass (pass_data_optimize_widening_mul, ctxt)
3706   {}
3707 
3708   /* opt_pass methods: */
3709   virtual bool gate (function *)
3710     {
3711       return flag_expensive_optimizations && optimize;
3712     }
3713 
3714   virtual unsigned int execute (function *);
3715 
3716 }; // class pass_optimize_widening_mul
3717 
3718 /* Walker class to perform the transformation in reverse dominance order.  */
3719 
3720 class math_opts_dom_walker : public dom_walker
3721 {
3722 public:
3723   /* Constructor.  CFG_CHANGED_P is a pointer to a boolean flag that will be
3724      set if walking modifies the CFG.  */
3725 
3726   math_opts_dom_walker (bool *cfg_changed_p)
3727     : dom_walker (CDI_DOMINATORS), m_last_result_set (),
3728       m_cfg_changed_p (cfg_changed_p) {}
3729 
3730   /* The actual actions performed in the walk.  */
3731 
3732   virtual void after_dom_children (basic_block);
3733 
3734   /* Set of results of chains of multiply and add statement combinations that
3735      were not transformed into FMAs because of active deferring.  */
3736   hash_set<tree> m_last_result_set;
3737 
3738   /* Pointer to a flag of the user that needs to be set if CFG has been
3739      modified.  */
3740   bool *m_cfg_changed_p;
3741 };
3742 
3743 void
3744 math_opts_dom_walker::after_dom_children (basic_block bb)
3745 {
3746   gimple_stmt_iterator gsi;
3747 
3748   fma_deferring_state fma_state (param_avoid_fma_max_bits > 0);
3749 
3750   for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
3751     {
3752       gimple *stmt = gsi_stmt (gsi);
3753       enum tree_code code;
3754 
3755       if (is_gimple_assign (stmt))
3756 	{
3757 	  code = gimple_assign_rhs_code (stmt);
3758 	  switch (code)
3759 	    {
3760 	    case MULT_EXPR:
3761 	      if (!convert_mult_to_widen (stmt, &gsi)
3762 		  && !convert_expand_mult_copysign (stmt, &gsi)
3763 		  && convert_mult_to_fma (stmt,
3764 					  gimple_assign_rhs1 (stmt),
3765 					  gimple_assign_rhs2 (stmt),
3766 					  &fma_state))
3767 		{
3768 		  gsi_remove (&gsi, true);
3769 		  release_defs (stmt);
3770 		  continue;
3771 		}
3772 	      break;
3773 
3774 	    case PLUS_EXPR:
3775 	    case MINUS_EXPR:
3776 	      if (!convert_plusminus_to_widen (&gsi, stmt, code))
3777 		match_uaddsub_overflow (&gsi, stmt, code);
3778 	      break;
3779 
3780 	    case TRUNC_MOD_EXPR:
3781 	      convert_to_divmod (as_a<gassign *> (stmt));
3782 	      break;
3783 
3784 	    default:;
3785 	    }
3786 	}
3787       else if (is_gimple_call (stmt))
3788 	{
3789 	  switch (gimple_call_combined_fn (stmt))
3790 	    {
3791 	    CASE_CFN_POW:
3792 	      if (gimple_call_lhs (stmt)
3793 		  && TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
3794 		  && real_equal (&TREE_REAL_CST (gimple_call_arg (stmt, 1)),
3795 				 &dconst2)
3796 		  && convert_mult_to_fma (stmt,
3797 					  gimple_call_arg (stmt, 0),
3798 					  gimple_call_arg (stmt, 0),
3799 					  &fma_state))
3800 		{
3801 		  unlink_stmt_vdef (stmt);
3802 		  if (gsi_remove (&gsi, true)
3803 		      && gimple_purge_dead_eh_edges (bb))
3804 		    *m_cfg_changed_p = true;
3805 		  release_defs (stmt);
3806 		  continue;
3807 		}
3808 	      break;
3809 
3810 	    case CFN_COND_MUL:
3811 	      if (convert_mult_to_fma (stmt,
3812 				       gimple_call_arg (stmt, 1),
3813 				       gimple_call_arg (stmt, 2),
3814 				       &fma_state,
3815 				       gimple_call_arg (stmt, 0)))
3816 
3817 		{
3818 		  gsi_remove (&gsi, true);
3819 		  release_defs (stmt);
3820 		  continue;
3821 		}
3822 	      break;
3823 
3824 	    case CFN_LAST:
3825 	      cancel_fma_deferring (&fma_state);
3826 	      break;
3827 
3828 	    default:
3829 	      break;
3830 	    }
3831 	}
3832       gsi_next (&gsi);
3833     }
3834   if (fma_state.m_deferring_p
3835       && fma_state.m_initial_phi)
3836     {
3837       gcc_checking_assert (fma_state.m_last_result);
3838       if (!last_fma_candidate_feeds_initial_phi (&fma_state,
3839 						 &m_last_result_set))
3840 	cancel_fma_deferring (&fma_state);
3841       else
3842 	m_last_result_set.add (fma_state.m_last_result);
3843     }
3844 }
3845 
3846 
3847 unsigned int
3848 pass_optimize_widening_mul::execute (function *fun)
3849 {
3850   bool cfg_changed = false;
3851 
3852   memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
3853   calculate_dominance_info (CDI_DOMINATORS);
3854   renumber_gimple_stmt_uids (cfun);
3855 
3856   math_opts_dom_walker (&cfg_changed).walk (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3857 
3858   statistics_counter_event (fun, "widening multiplications inserted",
3859 			    widen_mul_stats.widen_mults_inserted);
3860   statistics_counter_event (fun, "widening maccs inserted",
3861 			    widen_mul_stats.maccs_inserted);
3862   statistics_counter_event (fun, "fused multiply-adds inserted",
3863 			    widen_mul_stats.fmas_inserted);
3864   statistics_counter_event (fun, "divmod calls inserted",
3865 			    widen_mul_stats.divmod_calls_inserted);
3866 
3867   return cfg_changed ? TODO_cleanup_cfg : 0;
3868 }
3869 
3870 } // anon namespace
3871 
3872 gimple_opt_pass *
3873 make_pass_optimize_widening_mul (gcc::context *ctxt)
3874 {
3875   return new pass_optimize_widening_mul (ctxt);
3876 }
3877