/* Lowering and expansion of OpenMP directives for HSA GPU agents.

   Copyright (C) 2013-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "cgraph.h"
#include "pretty-print.h"
#include "fold-const.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimple-walk.h"
#include "tree-inline.h"
#include "langhooks.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-grid.h"
#include "gimple-pretty-print.h"

/* Return the lastprivate predicate for a given gridified loop described by
   FD.  */

tree
omp_grid_lastprivate_predicate (struct omp_for_data *fd)
{
  /* When dealing with a gridified loop, we need to check up to three collapsed
     iteration variables but they are not actually captured in this fd.
     Fortunately, we can easily rely on HSA builtins to get this
     information.  */

  tree id, size;
  if (gimple_omp_for_kind (fd->for_stmt) == GF_OMP_FOR_KIND_GRID_LOOP
      && gimple_omp_for_grid_intra_group (fd->for_stmt))
    {
      id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMID);
      size = builtin_decl_explicit (BUILT_IN_HSA_CURRENTWORKGROUPSIZE);
    }
  else
    {
      id = builtin_decl_explicit (BUILT_IN_HSA_WORKITEMABSID);
      size = builtin_decl_explicit (BUILT_IN_HSA_GRIDSIZE);
    }
  tree cond = NULL;
  for (int dim = 0; dim < fd->collapse; dim++)
    {
      tree dim_tree = build_int_cstu (unsigned_type_node, dim);
      tree u1 = build_int_cstu (unsigned_type_node, 1);
      tree c2
        = build2 (EQ_EXPR, boolean_type_node,
                  build2 (PLUS_EXPR, unsigned_type_node,
                          build_call_expr (id, 1, dim_tree), u1),
                  build_call_expr (size, 1, dim_tree));
      if (cond)
        cond = build2 (TRUTH_AND_EXPR, boolean_type_node, cond, c2);
      else
        cond = c2;
    }
  return cond;
}
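
/* For illustration, for an intra-group loop with collapse (2) the predicate
   built above is, as a sketch,

     (hsa_workitemid (0) + 1 == hsa_currentworkgroupsize (0))
     && (hsa_workitemid (1) + 1 == hsa_currentworkgroupsize (1))

   i.e. it holds only for the work-item with the highest ID in every
   dimension, which executes the logically last iteration.  */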

/* Structure describing the basic properties of the loop we are analyzing
   for possible gridification, and of the loop once it is gridified.  */

struct grid_prop
{
  /* True when we are doing tiling gridification, i.e. when there is a distinct
     distribute loop over groups and a loop construct over work-items.  False
     when distribute and parallel for loops form a combined construct.  */
  bool tiling;
  /* Location of the target construct for optimization information
     messages.  */
  location_t target_loc;
  /* The collapse clause of the involved loops.  Collapse value of all of them
     must be the same for gridification to take place.  */
  size_t collapse;
  /* Group sizes, if requested by the user, or NULL if not requested.  */
  tree group_sizes[3];
};

#define GRID_MISSED_MSG_PREFIX "Will not turn target construct into a " \
  "gridified HSA kernel because "

/* Return true if STMT is an assignment of a register-type into a local
   VAR_DECL.  If GRID is non-NULL, the assignment additionally must not be to
   any of the trees specifying group sizes there.  */

static bool
grid_safe_assignment_p (gimple *stmt, grid_prop *grid)
{
  gassign *assign = dyn_cast <gassign *> (stmt);
  if (!assign)
    return false;
  if (gimple_clobber_p (assign))
    return true;
  tree lhs = gimple_assign_lhs (assign);
  if (!VAR_P (lhs)
      || !is_gimple_reg_type (TREE_TYPE (lhs))
      || is_global_var (lhs))
    return false;
  if (grid)
    for (unsigned i = 0; i < grid->collapse; i++)
      if (lhs == grid->group_sizes[i])
        return false;
  return true;
}
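
/* For example, a statement like "i = n * 4;" assigning to a local
   register-type variable is safe, whereas a store to a global variable, to
   memory, or to one of the trees recorded in grid->group_sizes is not.  */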

/* Return true if all statements in SEQ are assignments to local register-type
   variables that do not hold group size information.  */

static bool
grid_seq_only_contains_local_assignments (gimple_seq seq, grid_prop *grid)
{
  if (!seq)
    return true;

  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    if (!grid_safe_assignment_p (gsi_stmt (gsi), grid))
      return false;
  return true;
}

/* Scan statements in SEQ and call itself recursively on any bind.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  If during the whole search only assignments to
   register-type local variables (that do not overwrite group size
   information) and a single OMP statement are encountered, return true,
   otherwise return false.  RET is where we store any OMP statement
   encountered.  */

static bool
grid_find_single_omp_among_assignments_1 (gimple_seq seq, grid_prop *grid,
                                          const char *name, gimple **ret)
{
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (grid_safe_assignment_p (stmt, grid))
        continue;
      if (gbind *bind = dyn_cast <gbind *> (stmt))
        {
          gimple_seq bind_body = gimple_bind_body (bind);
          if (!grid_find_single_omp_among_assignments_1 (bind_body, grid, name,
                                                         ret))
            return false;
        }
      else if (is_gimple_omp (stmt))
        {
          if (*ret)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                                   GRID_MISSED_MSG_PREFIX "%s construct "
                                   "contains multiple OpenMP constructs\n",
                                   name);
                  dump_printf_loc (MSG_NOTE, gimple_location (*ret),
                                   "The first OpenMP construct within "
                                   "a parallel\n");
                  dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                                   "The second OpenMP construct within "
                                   "a parallel\n");
                }
              return false;
            }
          *ret = stmt;
        }
      else
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                               GRID_MISSED_MSG_PREFIX "%s construct contains "
                               "a complex statement\n", name);
              dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                               "This statement cannot be analyzed for "
                               "gridification\n");
            }
          return false;
        }
    }
  return true;
}

/* Scan statements in SEQ and make sure that it and any binds in it contain
   only assignments to local register-type variables (that do not overwrite
   group size information) and one OMP construct.  If so, return that
   construct, otherwise return NULL.  GRID describes hitherto discovered
   properties of the loop that is evaluated for possible gridification.  If
   dumping is enabled and the function fails, use NAME to dump a note with the
   reason for failure.  */

static gimple *
grid_find_single_omp_among_assignments (gimple_seq seq, grid_prop *grid,
                                        const char *name)
{
  if (!seq)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                         GRID_MISSED_MSG_PREFIX "%s construct has empty body\n",
                         name);
      return NULL;
    }

  gimple *ret = NULL;
  if (grid_find_single_omp_among_assignments_1 (seq, grid, name, &ret))
    {
      if (!ret && dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                         GRID_MISSED_MSG_PREFIX "%s construct does not contain"
                         " any other OpenMP construct\n", name);
      return ret;
    }
  else
    return NULL;
}
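
/* For example, a "target" body of the shape

     D.2345 = n;
     #pragma omp teams ...

   yields the teams statement, whereas a body containing two OMP constructs,
   or any complex statement between them, yields NULL.  */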

/* Walker function looking for statements that there is no point in gridifying
   (and for noreturn function calls, which we cannot handle).  Return non-NULL
   if such a statement is found.  */

static tree
grid_find_ungridifiable_statement (gimple_stmt_iterator *gsi,
                                   bool *handled_ops_p,
                                   struct walk_stmt_info *wi)
{
  *handled_ops_p = false;
  gimple *stmt = gsi_stmt (*gsi);
  switch (gimple_code (stmt))
    {
    case GIMPLE_CALL:
      if (gimple_call_noreturn_p (as_a <gcall *> (stmt)))
        {
          *handled_ops_p = true;
          wi->info = stmt;
          return error_mark_node;
        }
      break;

    /* We may reduce the following list if we find a way to implement the
       clauses, but for now there is no point in trying further.  */
    case GIMPLE_OMP_CRITICAL:
    case GIMPLE_OMP_TASKGROUP:
    case GIMPLE_OMP_TASK:
    case GIMPLE_OMP_SECTION:
    case GIMPLE_OMP_SECTIONS:
    case GIMPLE_OMP_SECTIONS_SWITCH:
    case GIMPLE_OMP_TARGET:
    case GIMPLE_OMP_ORDERED:
      *handled_ops_p = true;
      wi->info = stmt;
      return error_mark_node;
    default:
      break;
    }
  return NULL;
}

/* Examine clauses of omp parallel statement PAR and if any prevents
   gridification, issue a missed-optimization diagnostic and return false,
   otherwise return true.  TLOC is the location of the enclosing target
   construct, used for diagnostics.  */

static bool
grid_parallel_clauses_gridifiable (gomp_parallel *par, location_t tloc)
{
  tree clauses = gimple_omp_parallel_clauses (par);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
        {
        case OMP_CLAUSE_NUM_THREADS:
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                               GRID_MISSED_MSG_PREFIX "there is "
                               "a num_threads clause of the parallel "
                               "construct\n");
              dump_printf_loc (MSG_NOTE, gimple_location (par),
                               "Parallel construct has a num_threads clause\n");
            }
          return false;

        case OMP_CLAUSE_REDUCTION:
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                               GRID_MISSED_MSG_PREFIX "a reduction clause "
                               "is present\n");
              dump_printf_loc (MSG_NOTE, gimple_location (par),
                               "Parallel construct has a reduction clause\n");
            }
          return false;

        default:
          break;
        }
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }
  return true;
}

/* Examine clauses and the body of omp loop statement GFOR and if something
   prevents gridification, issue a missed-optimization diagnostic and return
   false, otherwise return true.  GRID describes hitherto discovered properties
   of the loop that is evaluated for possible gridification.  */

static bool
grid_inner_loop_gridifiable_p (gomp_for *gfor, grid_prop *grid)
{
  if (!grid_seq_only_contains_local_assignments (gimple_omp_for_pre_body (gfor),
                                                 grid))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                           GRID_MISSED_MSG_PREFIX "the inner loop "
                           "bounds computation contains a complex "
                           "statement\n");
          dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                           "Loop construct cannot be analyzed for "
                           "gridification\n");
        }
      return false;
    }

  tree clauses = gimple_omp_for_clauses (gfor);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
        {
        case OMP_CLAUSE_SCHEDULE:
          if (OMP_CLAUSE_SCHEDULE_KIND (clauses) != OMP_CLAUSE_SCHEDULE_AUTO)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                                   GRID_MISSED_MSG_PREFIX "the inner loop "
                                   "has a non-automatic schedule clause\n");
                  dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                                   "Loop construct has a non-automatic "
                                   "schedule clause\n");
                }
              return false;
            }
          break;

        case OMP_CLAUSE_REDUCTION:
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                               GRID_MISSED_MSG_PREFIX "a reduction "
                               "clause is present\n");
              dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                               "Loop construct has a reduction "
                               "clause\n");
            }
          return false;

        default:
          break;
        }
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (wi));
  if (walk_gimple_seq (gimple_omp_body (gfor),
                       grid_find_ungridifiable_statement,
                       NULL, &wi))
    {
      gimple *bad = (gimple *) wi.info;
      if (dump_enabled_p ())
        {
          if (is_gimple_call (bad))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                             GRID_MISSED_MSG_PREFIX "the inner loop contains "
                             "a call to a noreturn function\n");
          else
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                             GRID_MISSED_MSG_PREFIX "the inner loop contains "
                             "statement %s which cannot be transformed\n",
                             gimple_code_name[(int) gimple_code (bad)]);
          dump_printf_loc (MSG_NOTE, gimple_location (bad),
                           "This statement cannot be analyzed for "
                           "gridification\n");
        }
      return false;
    }
  return true;
}

/* Given a distribute omp construct represented by DIST, which in the original
   source forms a compound construct with a looping construct, return true if
   it can be turned into a gridified HSA kernel.  Otherwise return false.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  */

static bool
grid_dist_follows_simple_pattern (gomp_for *dist, grid_prop *grid)
{
  location_t tloc = grid->target_loc;
  gimple *stmt = grid_find_single_omp_among_assignments (gimple_omp_body (dist),
                                                         grid, "distribute");
  gomp_parallel *par;
  if (!stmt
      || !(par = dyn_cast <gomp_parallel *> (stmt))
      || !grid_parallel_clauses_gridifiable (par, tloc))
    return false;

  stmt = grid_find_single_omp_among_assignments (gimple_omp_body (par), grid,
                                                 "parallel");
  gomp_for *gfor;
  if (!stmt || !(gfor = dyn_cast <gomp_for *> (stmt)))
    return false;

  if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                         GRID_MISSED_MSG_PREFIX "the inner loop is not "
                         "a simple for loop\n");
      return false;
    }
  gcc_assert (gimple_omp_for_collapse (gfor) == grid->collapse);

  if (!grid_inner_loop_gridifiable_p (gfor, grid))
    return false;

  return true;
}
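
/* In source terms, the simple pattern checked above corresponds to a combined
   construct such as the following sketch (clauses omitted):

     #pragma omp distribute parallel for
     for (i = 0; i < n; i++)
       ...

   which gimplification has already split into nested distribute, parallel
   and for statements.  */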

/* Given an omp loop statement GFOR, return true if it can participate in
   tiling gridification, i.e. in one where the distribute and parallel for
   loops do not form a compound statement.  GRID describes hitherto discovered
   properties of the loop that is evaluated for possible gridification.  */

static bool
grid_gfor_follows_tiling_pattern (gomp_for *gfor, grid_prop *grid)
{
  if (gimple_omp_for_kind (gfor) != GF_OMP_FOR_KIND_FOR)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                           GRID_MISSED_MSG_PREFIX "an inner loop is not "
                           "a simple for loop\n");
          dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                           "This statement is not a simple for loop\n");
        }
      return false;
    }

  if (!grid_inner_loop_gridifiable_p (gfor, grid))
    return false;

  if (gimple_omp_for_collapse (gfor) != grid->collapse)
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                           GRID_MISSED_MSG_PREFIX "an inner loop does not "
                           "use the same collapse clause\n");
          dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                           "Loop construct uses a different collapse clause\n");
        }
      return false;
    }

  struct omp_for_data fd;
  struct omp_for_data_loop *loops
    = (struct omp_for_data_loop *) alloca (grid->collapse
                                           * sizeof (struct omp_for_data_loop));
  omp_extract_for_data (gfor, &fd, loops);
  for (unsigned i = 0; i < grid->collapse; i++)
    {
      tree itype, type = TREE_TYPE (fd.loops[i].v);
      if (POINTER_TYPE_P (type))
        itype = signed_type_for (type);
      else
        itype = type;

      tree n1 = fold_convert (itype, fd.loops[i].n1);
      tree n2 = fold_convert (itype, fd.loops[i].n2);
      tree t = build_int_cst (itype,
                              (fd.loops[i].cond_code == LT_EXPR ? -1 : 1));
      t = fold_build2 (PLUS_EXPR, itype, fd.loops[i].step, t);
      t = fold_build2 (PLUS_EXPR, itype, t, n2);
      t = fold_build2 (MINUS_EXPR, itype, t, n1);
      if (TYPE_UNSIGNED (itype) && fd.loops[i].cond_code == GT_EXPR)
        t = fold_build2 (TRUNC_DIV_EXPR, itype,
                         fold_build1 (NEGATE_EXPR, itype, t),
                         fold_build1 (NEGATE_EXPR, itype, fd.loops[i].step));
      else
        t = fold_build2 (TRUNC_DIV_EXPR, itype, t, fd.loops[i].step);

      if (!operand_equal_p (grid->group_sizes[i], t, 0))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                               GRID_MISSED_MSG_PREFIX "the distribute and "
                               "an internal loop do not agree on tile size\n");
              dump_printf_loc (MSG_NOTE, gimple_location (gfor),
                               "Loop construct does not seem to loop over "
                               "a tile size\n");
            }
          return false;
        }
    }
  return true;
}
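
/* The value T computed above is the canonical OMP iteration count of loop I,

     t = (n2 - n1 + step + (cond_code == LT_EXPR ? -1 : 1)) / step

   (with the unsigned GT_EXPR case negated first), and it must be identical
   to the distribute step recorded in grid->group_sizes[i] for the loop to be
   considered as iterating over exactly one tile.  */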

/* Facing a call to FNDECL in the body of a distribute construct, return true
   if we can handle it or false if it precludes gridification.  */

static bool
grid_call_permissible_in_distribute_p (tree fndecl)
{
  if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
    return true;

  const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
  if (strstr (name, "omp_") != name)
    return false;

  if ((strcmp (name, "omp_get_thread_num") == 0)
      || (strcmp (name, "omp_get_num_threads") == 0)
      || (strcmp (name, "omp_get_num_teams") == 0)
      || (strcmp (name, "omp_get_team_num") == 0)
      || (strcmp (name, "omp_get_level") == 0)
      || (strcmp (name, "omp_get_active_level") == 0)
      || (strcmp (name, "omp_in_parallel") == 0))
    return true;

  return false;
}

/* Facing a call satisfying grid_call_permissible_in_distribute_p in the body
   of a distribute construct that is pointed at by GSI, modify it as necessary
   for gridification.  If the statement itself got removed, return true.  */

static bool
grid_handle_call_in_distribute (gimple_stmt_iterator *gsi)
{
  gimple *stmt = gsi_stmt (*gsi);
  gcc_checking_assert (stmt);
  tree fndecl = gimple_call_fndecl (stmt);
  if (DECL_PURE_P (fndecl) || TREE_READONLY (fndecl))
    return false;

  const char *name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
  if ((strcmp (name, "omp_get_thread_num") == 0)
      || (strcmp (name, "omp_get_level") == 0)
      || (strcmp (name, "omp_get_active_level") == 0)
      || (strcmp (name, "omp_in_parallel") == 0))
    {
      tree lhs = gimple_call_lhs (stmt);
      if (lhs)
        {
          gassign *assign
            = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
          gsi_insert_before (gsi, assign, GSI_SAME_STMT);
        }
      gsi_remove (gsi, true);
      return true;
    }

  /* The rest of the omp functions can stay as they are, the HSA back end will
     handle them correctly.  */
  gcc_checking_assert ((strcmp (name, "omp_get_num_threads") == 0)
                       || (strcmp (name, "omp_get_num_teams") == 0)
                       || (strcmp (name, "omp_get_team_num") == 0));
  return false;
}
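
/* For example, a statement "n = omp_get_thread_num ();" encountered directly
   in a distribute body is removed and replaced with "n = 0;", which matches
   the value the OpenMP runtime would return outside of a parallel region.  */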

/* Given a sequence of statements within a distribute omp construct or a
   parallel construct, which in the original source does not form a compound
   construct with a looping construct, return true if it does not prevent us
   from turning it into a gridified HSA kernel.  Otherwise return false.  GRID
   describes hitherto discovered properties of the loop that is evaluated for
   possible gridification.  IN_PARALLEL must be true if SEQ is within a
   parallel construct and false if it is only within a distribute
   construct.  */

static bool
grid_dist_follows_tiling_pattern (gimple_seq seq, grid_prop *grid,
                                  bool in_parallel)
{
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (seq); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);

      if (grid_safe_assignment_p (stmt, grid)
          || gimple_code (stmt) == GIMPLE_GOTO
          || gimple_code (stmt) == GIMPLE_LABEL
          || gimple_code (stmt) == GIMPLE_COND)
        continue;
      else if (gbind *bind = dyn_cast <gbind *> (stmt))
        {
          if (!grid_dist_follows_tiling_pattern (gimple_bind_body (bind),
                                                 grid, in_parallel))
            return false;
          continue;
        }
      else if (gtry *try_stmt = dyn_cast <gtry *> (stmt))
        {
          if (gimple_try_kind (try_stmt) == GIMPLE_TRY_CATCH)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                                   GRID_MISSED_MSG_PREFIX "the distribute "
                                   "construct contains a try..catch region\n");
                  dump_printf_loc (MSG_NOTE, gimple_location (try_stmt),
                                   "This statement cannot be analyzed for "
                                   "tiled gridification\n");
                }
              return false;
            }
          if (!grid_dist_follows_tiling_pattern (gimple_try_eval (try_stmt),
                                                 grid, in_parallel))
            return false;
          if (!grid_dist_follows_tiling_pattern (gimple_try_cleanup (try_stmt),
                                                 grid, in_parallel))
            return false;
          continue;
        }
      else if (is_gimple_call (stmt))
        {
          tree fndecl = gimple_call_fndecl (stmt);
          if (fndecl && grid_call_permissible_in_distribute_p (fndecl))
            continue;

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                               GRID_MISSED_MSG_PREFIX "the distribute "
                               "construct contains a call\n");
              dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                               "This statement cannot be analyzed for "
                               "tiled gridification\n");
            }
          return false;
        }
      else if (gomp_parallel *par = dyn_cast <gomp_parallel *> (stmt))
        {
          if (in_parallel)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                                   GRID_MISSED_MSG_PREFIX "a parallel "
                                   "construct contains another parallel "
                                   "construct\n");
                  dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                                   "This parallel construct is nested in "
                                   "another one\n");
                }
              return false;
            }
          if (!grid_parallel_clauses_gridifiable (par, grid->target_loc)
              || !grid_dist_follows_tiling_pattern (gimple_omp_body (par),
                                                    grid, true))
            return false;
        }
      else if (gomp_for *gfor = dyn_cast <gomp_for *> (stmt))
        {
          if (!in_parallel)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                                   GRID_MISSED_MSG_PREFIX "a loop "
                                   "construct is not nested within a parallel "
                                   "construct\n");
                  dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                                   "This loop construct is not nested in "
                                   "a parallel construct\n");
                }
              return false;
            }
          if (!grid_gfor_follows_tiling_pattern (gfor, grid))
            return false;
        }
      else
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, grid->target_loc,
                               GRID_MISSED_MSG_PREFIX "the distribute "
                               "construct contains a complex statement\n");
              dump_printf_loc (MSG_NOTE, gimple_location (stmt),
                               "This statement cannot be analyzed for "
                               "tiled gridification\n");
            }
          return false;
        }
    }
  return true;
}

/* If TARGET follows a pattern that can be turned into a gridified HSA kernel,
   return true, otherwise return false.  In the case of success, also fill in
   GRID with information describing the kernel grid.  */
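
/* For reference, the two gridifiable shapes recognized below are, as a
   sketch with most clauses omitted:

     #pragma omp target
     #pragma omp teams
     #pragma omp distribute parallel for        <- simple (combined) pattern
     for (...)
       ...

   and

     #pragma omp target
     #pragma omp teams
     #pragma omp distribute                     <- tiling pattern
     for (...)
       {
         #pragma omp parallel
         #pragma omp for
         for (...)
           ...
       }  */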

static bool
grid_target_follows_gridifiable_pattern (gomp_target *target, grid_prop *grid)
{
  if (gimple_omp_target_kind (target) != GF_OMP_TARGET_KIND_REGION)
    return false;

  location_t tloc = gimple_location (target);
  grid->target_loc = tloc;
  gimple *stmt
    = grid_find_single_omp_among_assignments (gimple_omp_body (target),
                                              grid, "target");
  if (!stmt)
    return false;
  gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
  tree group_size = NULL;
  if (!teams)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                         GRID_MISSED_MSG_PREFIX "it does not have a sole teams "
                         "construct in it.\n");
      return false;
    }

  tree clauses = gimple_omp_teams_clauses (teams);
  while (clauses)
    {
      switch (OMP_CLAUSE_CODE (clauses))
        {
        case OMP_CLAUSE_NUM_TEAMS:
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                             GRID_MISSED_MSG_PREFIX "the teams construct "
                             "contains a num_teams clause\n");
          return false;

        case OMP_CLAUSE_REDUCTION:
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                             GRID_MISSED_MSG_PREFIX "a reduction "
                             "clause is present\n");
          return false;

        case OMP_CLAUSE_THREAD_LIMIT:
          if (!integer_zerop (OMP_CLAUSE_OPERAND (clauses, 0)))
            group_size = OMP_CLAUSE_OPERAND (clauses, 0);
          break;

        default:
          break;
        }
      clauses = OMP_CLAUSE_CHAIN (clauses);
    }

  stmt = grid_find_single_omp_among_assignments (gimple_omp_body (teams), grid,
                                                 "teams");
  if (!stmt)
    return false;
  gomp_for *dist = dyn_cast <gomp_for *> (stmt);
  if (!dist)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                         GRID_MISSED_MSG_PREFIX "the teams construct does not "
                         "have a single distribute construct in it.\n");
      return false;
    }

  gcc_assert (gimple_omp_for_kind (dist) == GF_OMP_FOR_KIND_DISTRIBUTE);

  grid->collapse = gimple_omp_for_collapse (dist);
  if (grid->collapse > 3)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                         GRID_MISSED_MSG_PREFIX "the distribute construct "
                         "contains collapse clause with parameter greater "
                         "than 3\n");
      return false;
    }

  struct omp_for_data fd;
  struct omp_for_data_loop *dist_loops
    = (struct omp_for_data_loop *) alloca (grid->collapse
                                           * sizeof (struct omp_for_data_loop));
  omp_extract_for_data (dist, &fd, dist_loops);
  if (fd.chunk_size)
    {
      if (group_size && !operand_equal_p (group_size, fd.chunk_size, 0))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                             GRID_MISSED_MSG_PREFIX "the teams "
                             "thread limit is different from distribute "
                             "schedule chunk\n");
          return false;
        }
      group_size = fd.chunk_size;
    }
  if (group_size && grid->collapse > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                         GRID_MISSED_MSG_PREFIX "group size cannot be "
                         "set using thread_limit or schedule clauses "
                         "when also using a collapse clause greater than 1\n");
      return false;
    }

  if (gimple_omp_for_combined_p (dist))
    {
      grid->tiling = false;
      grid->group_sizes[0] = group_size;
      for (unsigned i = 1; i < grid->collapse; i++)
        grid->group_sizes[i] = NULL;
      return grid_dist_follows_simple_pattern (dist, grid);
    }
  else
    {
      grid->tiling = true;
      if (group_size)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, tloc,
                             GRID_MISSED_MSG_PREFIX "group size cannot be set "
                             "using thread_limit or schedule clauses when "
                             "distribute and loop constructs do not form "
                             "one combined construct\n");
          return false;
        }
      for (unsigned i = 0; i < grid->collapse; i++)
        {
          if (fd.loops[i].cond_code == GT_EXPR)
            grid->group_sizes[i] = fold_build1 (NEGATE_EXPR,
                                                TREE_TYPE (fd.loops[i].step),
                                                fd.loops[i].step);
          else
            grid->group_sizes[i] = fd.loops[i].step;
        }
      return grid_dist_follows_tiling_pattern (gimple_omp_body (dist), grid,
                                               false);
    }
}

/* Operand walker, used to remap pre-body declarations according to a hash map
   provided in DATA.  */

static tree
grid_remap_prebody_decls (tree *tp, int *walk_subtrees, void *data)
{
  tree t = *tp;

  if (DECL_P (t) || TYPE_P (t))
    *walk_subtrees = 0;
  else
    *walk_subtrees = 1;

  if (VAR_P (t))
    {
      struct walk_stmt_info *wi = (struct walk_stmt_info *) data;
      hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
      tree *repl = declmap->get (t);
      if (repl)
        *tp = *repl;
    }
  return NULL_TREE;
}

/* Identifiers of segments into which a particular variable should be placed
   when gridifying.  */

enum grid_var_segment {GRID_SEGMENT_PRIVATE, GRID_SEGMENT_GROUP,
                       GRID_SEGMENT_GLOBAL};

/* Mark VAR so that it is eventually placed into SEGMENT.  */

static void
grid_mark_variable_segment (tree var, enum grid_var_segment segment)
{
  /* Making variables non-addressable would require that we re-gimplify all
     their uses.  Fortunately, we do not have to do this because if they are
     not addressable, it means they are not used in atomic or parallel
     statements and so relaxed GPU consistency rules mean we can just keep them
     private.  */
  if (!TREE_ADDRESSABLE (var))
    return;

  switch (segment)
    {
    case GRID_SEGMENT_GROUP:
      DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_group_segment"),
                                         NULL, DECL_ATTRIBUTES (var));
      break;
    case GRID_SEGMENT_GLOBAL:
      DECL_ATTRIBUTES (var) = tree_cons (get_identifier ("hsa_global_segment"),
                                         NULL, DECL_ATTRIBUTES (var));
      break;
    default:
      gcc_unreachable ();
    }

  if (!TREE_STATIC (var))
    {
      TREE_STATIC (var) = 1;
      varpool_node::finalize_decl (var);
    }
}
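
/* For instance, marking an addressable variable with GRID_SEGMENT_GROUP
   attaches the "hsa_group_segment" attribute to it and makes it static, the
   effect being roughly as if the user had declared it along the lines of

     static int scratch __attribute__ ((hsa_group_segment));

   so that the HSA back end places it into group memory.  */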

/* Copy leading register-type assignments to local variables in SRC to just
   before DST, creating temporaries, adjusting mapping of operands in WI and
   remapping operands as necessary.  Add any new temporaries to TGT_BIND.
   Return the first statement that does not conform to grid_safe_assignment_p
   or NULL.  If VAR_SEGMENT is not GRID_SEGMENT_PRIVATE, also mark all
   variables in traversed bind statements so that they are put into the
   appropriate segment.  */

static gimple *
grid_copy_leading_local_assignments (gimple_seq src, gimple_stmt_iterator *dst,
                                     gbind *tgt_bind,
                                     enum grid_var_segment var_segment,
                                     struct walk_stmt_info *wi)
{
  hash_map<tree, tree> *declmap = (hash_map<tree, tree> *) wi->info;
  gimple_stmt_iterator gsi;
  for (gsi = gsi_start (src); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *stmt = gsi_stmt (gsi);
      if (gbind *bind = dyn_cast <gbind *> (stmt))
        {
          gimple *r = grid_copy_leading_local_assignments
            (gimple_bind_body (bind), dst, tgt_bind, var_segment, wi);

          if (var_segment != GRID_SEGMENT_PRIVATE)
            for (tree var = gimple_bind_vars (bind);
                 var;
                 var = DECL_CHAIN (var))
              grid_mark_variable_segment (var, var_segment);
          if (r)
            return r;
          else
            continue;
        }
      if (!grid_safe_assignment_p (stmt, NULL))
        return stmt;
      tree lhs = gimple_assign_lhs (as_a <gassign *> (stmt));
      tree repl = copy_var_decl (lhs, create_tmp_var_name (NULL),
                                 TREE_TYPE (lhs));
      DECL_CONTEXT (repl) = current_function_decl;
      gimple_bind_append_vars (tgt_bind, repl);

      declmap->put (lhs, repl);
      gassign *copy = as_a <gassign *> (gimple_copy (stmt));
      walk_gimple_op (copy, grid_remap_prebody_decls, wi);
      gsi_insert_before (dst, copy, GSI_SAME_STMT);
    }
  return NULL;
}

/* Statement walker function to make adjustments to statements within the
   gridified kernel copy.  */

static tree
grid_process_grid_body (gimple_stmt_iterator *gsi, bool *handled_ops_p,
                        struct walk_stmt_info *)
{
  *handled_ops_p = false;
  gimple *stmt = gsi_stmt (*gsi);
  if (gimple_code (stmt) == GIMPLE_OMP_FOR
      && (gimple_omp_for_kind (stmt) & GF_OMP_FOR_SIMD))
    {
      gomp_for *loop = as_a <gomp_for *> (stmt);
      tree clauses = gimple_omp_for_clauses (loop);
      tree cl = omp_find_clause (clauses, OMP_CLAUSE_SAFELEN);
      if (cl)
        OMP_CLAUSE_SAFELEN_EXPR (cl) = integer_one_node;
      else
        {
          tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_SAFELEN);
          OMP_CLAUSE_SAFELEN_EXPR (c) = integer_one_node;
          OMP_CLAUSE_CHAIN (c) = clauses;
          gimple_omp_for_set_clauses (loop, c);
        }
    }
  return NULL_TREE;
}
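
/* For example, an inner "#pragma omp simd safelen(16)" loop in the kernel
   copy has its safelen expression overwritten with 1 (and a safelen (1)
   clause is added if none was present), so the loop is effectively treated
   as not vectorizable across iterations.  */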

/* Given a PARLOOP that is a normal for looping construct but also a part of
   a combined construct with a simd loop, eliminate the simd loop.  */

static void
grid_eliminate_combined_simd_part (gomp_for *parloop)
{
  struct walk_stmt_info wi;

  memset (&wi, 0, sizeof (wi));
  wi.val_only = true;
  enum gf_mask msk = GF_OMP_FOR_SIMD;
  wi.info = (void *) &msk;
  walk_gimple_seq (gimple_omp_body (parloop), omp_find_combined_for, NULL, &wi);
  gimple *stmt = (gimple *) wi.info;
  /* We expect that the SIMD is the only statement in the parallel loop.  */
  gcc_assert (stmt
              && gimple_code (stmt) == GIMPLE_OMP_FOR
              && (gimple_omp_for_kind (stmt) == GF_OMP_FOR_SIMD)
              && gimple_omp_for_combined_into_p (stmt)
              && !gimple_omp_for_combined_p (stmt));
  gomp_for *simd = as_a <gomp_for *> (stmt);

  /* Copy over the iteration properties because the body refers to the index
     in the bottom-most loop.  */
  unsigned i, collapse = gimple_omp_for_collapse (parloop);
  gcc_checking_assert (collapse == gimple_omp_for_collapse (simd));
  for (i = 0; i < collapse; i++)
    {
      gimple_omp_for_set_index (parloop, i, gimple_omp_for_index (simd, i));
      gimple_omp_for_set_initial (parloop, i, gimple_omp_for_initial (simd, i));
      gimple_omp_for_set_final (parloop, i, gimple_omp_for_final (simd, i));
      gimple_omp_for_set_incr (parloop, i, gimple_omp_for_incr (simd, i));
    }

  tree *tgt = gimple_omp_for_clauses_ptr (parloop);
  while (*tgt)
    tgt = &OMP_CLAUSE_CHAIN (*tgt);

  /* Copy over all clauses, except for linear clauses, which are turned into
     private clauses, and all other simd-specific clauses, which are
     ignored.  */
  tree *pc = gimple_omp_for_clauses_ptr (simd);
  while (*pc)
    {
      tree c = *pc;
      switch (TREE_CODE (c))
        {
        case OMP_CLAUSE_LINEAR:
          {
            tree priv = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE_PRIVATE);
            OMP_CLAUSE_DECL (priv) = OMP_CLAUSE_DECL (c);
            OMP_CLAUSE_CHAIN (priv) = NULL;
            *tgt = priv;
            tgt = &OMP_CLAUSE_CHAIN (priv);
            pc = &OMP_CLAUSE_CHAIN (c);
            break;
          }

        case OMP_CLAUSE_SAFELEN:
        case OMP_CLAUSE_SIMDLEN:
        case OMP_CLAUSE_ALIGNED:
          pc = &OMP_CLAUSE_CHAIN (c);
          break;

        default:
          *pc = OMP_CLAUSE_CHAIN (c);
          OMP_CLAUSE_CHAIN (c) = NULL;
          *tgt = c;
          tgt = &OMP_CLAUSE_CHAIN (c);
          break;
        }
    }

  /* Finally, throw away the simd and mark the parallel loop as not
     combined.  */
  gimple_omp_set_body (parloop, gimple_omp_body (simd));
  gimple_omp_for_set_combined_p (parloop, false);
}
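
/* For example, given a combined "#pragma omp parallel for simd linear(p)"
   loop, the simd part is dissolved: the parallel loop takes over the simd
   loop's indices, bounds and body, the linear(p) clause becomes private(p),
   and safelen, simdlen and aligned clauses are dropped.  */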

/* Statement walker function marking all loops as grid ones representing
   threads of a particular thread group, and arranging for lastprivate
   variables to be copied through group-segment temporaries.  */

static tree
grid_mark_tiling_loops (gimple_stmt_iterator *gsi, bool *handled_ops_p,
                        struct walk_stmt_info *wi_in)
{
  *handled_ops_p = false;
  if (gomp_for *loop = dyn_cast <gomp_for *> (gsi_stmt (*gsi)))
    {
      *handled_ops_p = true;
      gimple_omp_for_set_kind (loop, GF_OMP_FOR_KIND_GRID_LOOP);
      gimple_omp_for_set_grid_intra_group (loop, true);
      if (gimple_omp_for_combined_p (loop))
        grid_eliminate_combined_simd_part (loop);

      struct walk_stmt_info body_wi;
      memset (&body_wi, 0, sizeof (body_wi));
      walk_gimple_seq_mod (gimple_omp_body_ptr (loop),
                           grid_process_grid_body, NULL, &body_wi);

      gbind *bind = (gbind *) wi_in->info;
      tree c;
      for (c = gimple_omp_for_clauses (loop); c; c = OMP_CLAUSE_CHAIN (c))
        if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_LASTPRIVATE)
          {
            push_gimplify_context ();
            tree ov = OMP_CLAUSE_DECL (c);
            tree gv = copy_var_decl (ov, create_tmp_var_name (NULL),
                                     TREE_TYPE (ov));

            grid_mark_variable_segment (gv, GRID_SEGMENT_GROUP);
            DECL_CONTEXT (gv) = current_function_decl;
            gimple_bind_append_vars (bind, gv);
            tree x = lang_hooks.decls.omp_clause_assign_op (c, gv, ov);
            gimplify_and_add (x, &OMP_CLAUSE_LASTPRIVATE_GIMPLE_SEQ (c));
            x = lang_hooks.decls.omp_clause_copy_ctor (c, ov, gv);
            gimple_seq l = NULL;
            gimplify_and_add (x, &l);
            gsi_insert_seq_after (gsi, l, GSI_SAME_STMT);
            pop_gimplify_context (bind);
          }
    }
  return NULL_TREE;
}

/* Statement walker function marking all parallels as grid_phony and loops as
   grid ones representing threads of a particular thread group.  */

static tree
grid_mark_tiling_parallels_and_loops (gimple_stmt_iterator *gsi,
                                      bool *handled_ops_p,
                                      struct walk_stmt_info *wi_in)
{
  *handled_ops_p = false;
  wi_in->removed_stmt = false;
  gimple *stmt = gsi_stmt (*gsi);
  if (gbind *bind = dyn_cast <gbind *> (stmt))
    {
      for (tree var = gimple_bind_vars (bind); var; var = DECL_CHAIN (var))
        grid_mark_variable_segment (var, GRID_SEGMENT_GROUP);
    }
  else if (gomp_parallel *parallel = dyn_cast <gomp_parallel *> (stmt))
    {
      *handled_ops_p = true;
      gimple_omp_parallel_set_grid_phony (parallel, true);

      gbind *new_bind = gimple_build_bind (NULL, NULL, make_node (BLOCK));
      gimple_bind_set_body (new_bind, gimple_omp_body (parallel));
      gimple_seq s = NULL;
      gimple_seq_add_stmt (&s, new_bind);
      gimple_omp_set_body (parallel, s);

      struct walk_stmt_info wi_par;
      memset (&wi_par, 0, sizeof (wi_par));
      wi_par.info = new_bind;
      walk_gimple_seq_mod (gimple_bind_body_ptr (new_bind),
                           grid_mark_tiling_loops, NULL, &wi_par);
    }
  else if (is_a <gcall *> (stmt))
    wi_in->removed_stmt = grid_handle_call_in_distribute (gsi);
  return NULL_TREE;
}

/* Given freshly copied top level kernel SEQ, identify the individual OMP
   components, mark them as part of kernel, copy assignments leading to them
   just before DST, remapping them using WI and adding new temporaries to
   TGT_BIND, and return the loop that will be used for kernel dispatch.  */

static gomp_for *
grid_process_kernel_body_copy (grid_prop *grid, gimple_seq seq,
                               gimple_stmt_iterator *dst,
                               gbind *tgt_bind, struct walk_stmt_info *wi)
{
  gimple *stmt = grid_copy_leading_local_assignments (seq, dst, tgt_bind,
                                                      GRID_SEGMENT_GLOBAL, wi);
  gomp_teams *teams = dyn_cast <gomp_teams *> (stmt);
  gcc_assert (teams);
  gimple_omp_teams_set_grid_phony (teams, true);
  stmt = grid_copy_leading_local_assignments (gimple_omp_body (teams), dst,
                                              tgt_bind, GRID_SEGMENT_GLOBAL,
                                              wi);
  gcc_checking_assert (stmt);
  gomp_for *dist = dyn_cast <gomp_for *> (stmt);
  gcc_assert (dist);
  gimple_seq prebody = gimple_omp_for_pre_body (dist);
  if (prebody)
    grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
                                         GRID_SEGMENT_GROUP, wi);

  if (grid->tiling)
    {
      gimple_omp_for_set_kind (dist, GF_OMP_FOR_KIND_GRID_LOOP);
      gimple_omp_for_set_grid_group_iter (dist, true);

      struct walk_stmt_info wi_tiled;
      memset (&wi_tiled, 0, sizeof (wi_tiled));
      walk_gimple_seq_mod (gimple_omp_body_ptr (dist),
                           grid_mark_tiling_parallels_and_loops, NULL,
                           &wi_tiled);
      return dist;
    }
  else
    {
      gimple_omp_for_set_grid_phony (dist, true);
      stmt = grid_copy_leading_local_assignments (gimple_omp_body (dist), dst,
                                                  tgt_bind,
                                                  GRID_SEGMENT_PRIVATE, wi);
      gcc_checking_assert (stmt);
      gomp_parallel *parallel = as_a <gomp_parallel *> (stmt);
      gimple_omp_parallel_set_grid_phony (parallel, true);
      stmt = grid_copy_leading_local_assignments (gimple_omp_body (parallel),
                                                  dst, tgt_bind,
                                                  GRID_SEGMENT_PRIVATE, wi);
      gomp_for *inner_loop = as_a <gomp_for *> (stmt);
      gimple_omp_for_set_kind (inner_loop, GF_OMP_FOR_KIND_GRID_LOOP);
      prebody = gimple_omp_for_pre_body (inner_loop);
      if (prebody)
        grid_copy_leading_local_assignments (prebody, dst, tgt_bind,
                                             GRID_SEGMENT_PRIVATE, wi);

      if (gimple_omp_for_combined_p (inner_loop))
        grid_eliminate_combined_simd_part (inner_loop);
      struct walk_stmt_info body_wi;
      memset (&body_wi, 0, sizeof (body_wi));
      walk_gimple_seq_mod (gimple_omp_body_ptr (inner_loop),
                           grid_process_grid_body, NULL, &body_wi);

      return inner_loop;
    }
}

/* If TARGET points to a GOMP_TARGET which follows a gridifiable pattern,
   create a GPU kernel for it.  GSI must point to the same statement, and
   TGT_BIND is the bind into which temporaries inserted before TARGET
   should be added.  */
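
/* For illustration, one shape that follows the gridifiable pattern (a
   hedged example; grid_target_follows_gridifiable_pattern defines the
   precise set of accepted forms):

     #pragma omp target teams
     #pragma omp distribute parallel for
     for (int i = 0; i < n; i++)
       a[i] = b[i] + c[i];
   */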

static void
grid_attempt_target_gridification (gomp_target *target,
                                   gimple_stmt_iterator *gsi,
                                   gbind *tgt_bind)
{
  /* removed group_size */
  grid_prop grid;
  memset (&grid, 0, sizeof (grid));
  if (!target || !grid_target_follows_gridifiable_pattern (target, &grid))
    return;

  location_t loc = gimple_location (target);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc,
                     "Target construct will be turned into a gridified HSA "
                     "kernel\n");

  /* Copy the target body to a GPUKERNEL construct.  */
  gimple_seq kernel_seq = copy_gimple_seq_and_replace_locals
    (gimple_omp_body (target));

  hash_map<tree, tree> *declmap = new hash_map<tree, tree>;
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (struct walk_stmt_info));
  wi.info = declmap;

  /* Copy the assignments in between OMP statements to just before the
     target, and mark the OMP statements within the copy appropriately.  */
  gomp_for *inner_loop = grid_process_kernel_body_copy (&grid, kernel_seq, gsi,
                                                        tgt_bind, &wi);

  gbind *old_bind
    = as_a <gbind *> (gimple_seq_first (gimple_omp_body (target)));
  gbind *new_bind = as_a <gbind *> (gimple_seq_first (kernel_seq));
  tree new_block = gimple_bind_block (new_bind);
  tree enc_block = BLOCK_SUPERCONTEXT (gimple_bind_block (old_bind));
  BLOCK_CHAIN (new_block) = BLOCK_SUBBLOCKS (enc_block);
  BLOCK_SUBBLOCKS (enc_block) = new_block;
  BLOCK_SUPERCONTEXT (new_block) = enc_block;
  gimple *gpukernel = gimple_build_omp_grid_body (kernel_seq);
  gimple_seq_add_stmt
    (gimple_bind_body_ptr (as_a <gbind *> (gimple_omp_body (target))),
     gpukernel);
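
  /* For illustration (a hedged sketch, not an actual GIMPLE dump), the
     target statement now has the shape

       target
         bind
           <original body>
           gimple_omp_grid_body   <- the kernel_seq copy
     */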

  for (size_t i = 0; i < grid.collapse; i++)
    walk_tree (&grid.group_sizes[i], grid_remap_prebody_decls, &wi, NULL);
  push_gimplify_context ();
  for (size_t i = 0; i < grid.collapse; i++)
    {
      tree itype, type = TREE_TYPE (gimple_omp_for_index (inner_loop, i));
      if (POINTER_TYPE_P (type))
        itype = signed_type_for (type);
      else
        itype = type;

      enum tree_code cond_code = gimple_omp_for_cond (inner_loop, i);
      tree n1 = unshare_expr (gimple_omp_for_initial (inner_loop, i));
      walk_tree (&n1, grid_remap_prebody_decls, &wi, NULL);
      tree n2 = unshare_expr (gimple_omp_for_final (inner_loop, i));
      walk_tree (&n2, grid_remap_prebody_decls, &wi, NULL);
      omp_adjust_for_condition (loc, &cond_code, &n2);
      n1 = fold_convert (itype, n1);
      n2 = fold_convert (itype, n2);

      tree cond = fold_build2 (cond_code, boolean_type_node, n1, n2);
      tree step
        = omp_get_for_step_from_incr (loc, gimple_omp_for_incr (inner_loop, i));

      tree t = build_int_cst (itype, (cond_code == LT_EXPR ? -1 : 1));
      t = fold_build2 (PLUS_EXPR, itype, step, t);
      t = fold_build2 (PLUS_EXPR, itype, t, n2);
      t = fold_build2 (MINUS_EXPR, itype, t, n1);
      if (TYPE_UNSIGNED (itype) && cond_code == GT_EXPR)
        t = fold_build2 (TRUNC_DIV_EXPR, itype,
                         fold_build1 (NEGATE_EXPR, itype, t),
                         fold_build1 (NEGATE_EXPR, itype, step));
      else
        t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
      t = fold_build3 (COND_EXPR, itype, cond, t, build_zero_cst (itype));
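      /* A worked example, for illustration only: with n1 = 0, n2 = 10,
         step = 3 and cond_code LT_EXPR, the computation above yields
         t = (3 + -1 + 10 - 0) / 3 = 4, matching the four iterations
         i = 0, 3, 6 and 9; the COND_EXPR clamps T to zero when the loop
         would not run at all.  */
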
      if (grid.tiling)
        {
          if (cond_code == GT_EXPR)
            step = fold_build1 (NEGATE_EXPR, itype, step);
          t = fold_build2 (MULT_EXPR, itype, t, step);
        }
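
      /* Hedged note: when tiling, T counted distribute iterations, i.e.
         groups, so multiplying by STEP (the tile size) presumably rescales
         it back to the original iteration space, making the grid size below
         a count of individual work items rather than of groups.  */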

      tree gs = fold_convert (uint32_type_node, t);
      gimple_seq tmpseq = NULL;
      gimplify_expr (&gs, &tmpseq, NULL, is_gimple_val, fb_rvalue);
      if (!gimple_seq_empty_p (tmpseq))
        gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);

      tree ws;
      if (grid.group_sizes[i])
        {
          ws = fold_convert (uint32_type_node, grid.group_sizes[i]);
          tmpseq = NULL;
          gimplify_expr (&ws, &tmpseq, NULL, is_gimple_val, fb_rvalue);
          if (!gimple_seq_empty_p (tmpseq))
            gsi_insert_seq_before (gsi, tmpseq, GSI_SAME_STMT);
        }
      else
        ws = build_zero_cst (uint32_type_node);

      tree c = build_omp_clause (UNKNOWN_LOCATION, OMP_CLAUSE__GRIDDIM_);
      OMP_CLAUSE__GRIDDIM__DIMENSION (c) = i;
      OMP_CLAUSE__GRIDDIM__SIZE (c) = gs;
      OMP_CLAUSE__GRIDDIM__GROUP (c) = ws;
      OMP_CLAUSE_CHAIN (c) = gimple_omp_target_clauses (target);
      gimple_omp_target_set_clauses (target, c);
    }
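
  /* At this point TARGET carries one _GRIDDIM_ clause per collapsed
     dimension, each recording the dimension number, the computed grid size
     and the requested work-group size (zero when none was specified).  */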
  pop_gimplify_context (tgt_bind);
  delete declmap;
}

/* Walker function doing all the work for omp_grid_gridify_all_targets.  */

static tree
grid_gridify_all_targets_stmt (gimple_stmt_iterator *gsi,
                               bool *handled_ops_p,
                               struct walk_stmt_info *incoming)
{
  *handled_ops_p = false;

  gimple *stmt = gsi_stmt (*gsi);
  gomp_target *target = dyn_cast <gomp_target *> (stmt);
  if (target)
    {
      gbind *tgt_bind = (gbind *) incoming->info;
      gcc_checking_assert (tgt_bind);
      grid_attempt_target_gridification (target, gsi, tgt_bind);
      return NULL_TREE;
    }
  gbind *bind = dyn_cast <gbind *> (stmt);
  if (bind)
    {
      *handled_ops_p = true;
      struct walk_stmt_info wi;
      memset (&wi, 0, sizeof (wi));
      wi.info = bind;
      walk_gimple_seq_mod (gimple_bind_body_ptr (bind),
                           grid_gridify_all_targets_stmt, NULL, &wi);
    }
  return NULL_TREE;
}

/* Attempt to gridify all target constructs in BODY_P.  All such targets will
   have their bodies duplicated, with the new copy being put into a
   gimple_omp_grid_body statement.  All kernel-related constructs within the
   grid_body will be marked with phony flags or kernel kinds.  Moreover, some
   re-structuring is often needed, such as copying pre-bodies before the
   target construct so that kernel grid sizes can be computed.  */

void
omp_grid_gridify_all_targets (gimple_seq *body_p)
{
  struct walk_stmt_info wi;
  memset (&wi, 0, sizeof (wi));
  walk_gimple_seq_mod (body_p, grid_gridify_all_targets_stmt, NULL, &wi);
}
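
/* A hedged sketch of the expected call site, assuming the usual GCC 8
   arrangement in which omp-low.c's execute_lower_omp invokes gridification
   whenever HSA code generation is requested (illustrative, not copied
   verbatim from omp-low.c):

     if (hsa_gen_requested_p ())
       omp_grid_gridify_all_targets (&body);
   */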