xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/brig/brigfrontend/brig-basic-inst-handler.cc (revision cef8759bd76c1b621f8eab8faa6f208faabc2e15)
1 /* brig-basic-inst-handler.cc -- brig basic instruction handling
2    Copyright (C) 2016-2018 Free Software Foundation, Inc.
3    Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
4    for General Processor Tech.
5 
6    This file is part of GCC.
7 
8    GCC is free software; you can redistribute it and/or modify it under
9    the terms of the GNU General Public License as published by the Free
10    Software Foundation; either version 3, or (at your option) any later
11    version.
12 
13    GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14    WARRANTY; without even the implied warranty of MERCHANTABILITY or
15    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16    for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with GCC; see the file COPYING3.  If not see
20    <http://www.gnu.org/licenses/>.  */
21 
22 #include <sstream>
23 
24 #include "brig-code-entry-handler.h"
25 #include "brig-util.h"
26 
27 #include "errors.h"
28 #include "gimple-expr.h"
29 #include "convert.h"
30 #include "print-tree.h"
31 #include "tree-pretty-print.h"
32 #include "langhooks.h"
33 #include "stor-layout.h"
34 #include "diagnostic-core.h"
35 #include "brig-builtins.h"
36 #include "fold-const.h"
37 
/* Constructs a handler for basic (arithmetic/logic) BRIG instructions.
   PARENT is the BRIG-to-GENERIC converter that owns the function
   currently being built; all state is kept in the base class.  */

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}
42 
/* Element-wise visitor that implements packed saturating arithmetics
   by calling a scalar saturated-arithmetic builtin on each pair of
   vector elements.  */

class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  /* Selects the saturating builtin matching the opcode and the base
     element type of BRIG_INST.  The lookup is generated by expanding
     the DEF_HSAIL_SAT_BUILTIN entries of brig-builtins.def into an
     if/else chain; the other DEF_* macros are #undef'd first so that
     (presumably — the .def file provides its own defaults) only the
     saturating entries expand to code here.  If no entry matches,
     the trailing else hits gcc_unreachable.  */
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN

#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  /* Emits a call to the selected builtin for one element pair.  */
  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetics with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  /* The instruction being scalarized.  */
  const BrigInstBase &m_brig_inst;
  /* The scalar saturating builtin selected in the constructor.  */
  tree m_builtin;
};
79 
80 /* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
81    OPERANDS[0] is the first vector, OPERAND[1] the second vector and
82    OPERANDS[2] the shuffle mask in HSAIL format.  The output is a VEC_PERM_EXPR
83    that implements the shuffle as a GENERIC expression.  */
84 
85 tree
86 brig_basic_inst_handler::build_shuffle (tree arith_type,
87 					tree_stl_vec &operands)
88 {
89   tree element_type
90     = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));
91 
92   /* Offsets to add to the mask values to convert from the
93      HSAIL mask to VEC_PERM_EXPR masks.  VEC_PERM_EXPR mask
94      assumes an index spanning from 0 to 2 times the vec
95      width while HSAIL refers separately to two different
96      input vectors, thus is not a "full shuffle" where all
97      output elements can originate from any input element.  */
98   vec<constructor_elt, va_gc> *mask_offset_vals = NULL;
99 
100   unsigned int element_count = gccbrig_type_vector_subparts (arith_type);
101 
102   vec<constructor_elt, va_gc> *input_mask_vals = NULL;
103   size_t input_mask_element_size = exact_log2 (element_count);
104 
105   /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
106      from which to construct the mask vector as understood by
107      VEC_PERM_EXPR.  */
108   tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
109 
110   tree mask_element_type
111     = build_nonstandard_integer_type (input_mask_element_size, true);
112 
113   for (size_t i = 0; i < element_count; ++i)
114     {
115       tree mask_element
116 	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
117 		  bitsize_int (input_mask_element_size),
118 		  bitsize_int (i * input_mask_element_size));
119 
120       mask_element = convert (element_type, mask_element);
121 
122       tree offset;
123       if (i < element_count / 2)
124 	offset = build_int_cst (element_type, 0);
125       else
126 	offset = build_int_cst (element_type, element_count);
127 
128       CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
129       CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
130     }
131   tree mask_vec_type = build_vector_type (element_type, element_count);
132 
133   tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
134   tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);
135 
136   tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);
137 
138   tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
139 		      operands[1], mask);
140   return perm;
141 }
142 
143 /* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
144    from the vector expression in OPERANDS[0].  */
145 
146 tree
147 brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
148 {
149   /* Implement the unpack with a shuffle that stores the unpacked
150      element to the lowest bit positions in the dest.  After that
151      a bitwise AND is used to clear the uppermost bits.  */
152   tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));
153 
154   /* Perform the operations with a raw (unsigned int type) type.  */
155   tree element_type = get_unsigned_int_type (src_element_type);
156 
157   vec<constructor_elt, va_gc> *input_mask_vals = NULL;
158   vec<constructor_elt, va_gc> *and_mask_vals = NULL;
159 
160   size_t element_count
161     = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
162   tree vec_type = build_vector_type (element_type, element_count);
163 
164   for (size_t i = 0; i < element_count; ++i)
165     {
166       tree mask_element;
167       if (i == 0)
168 	mask_element = convert (element_type, operands[1]);
169       else
170 	mask_element = build_int_cst (element_type, 0);
171 
172       CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
173 
174       tree and_mask_element;
175       if (i == 0)
176 	and_mask_element = build_int_cst (element_type, -1);
177       else
178 	and_mask_element = build_int_cst (element_type, 0);
179       CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
180     }
181 
182   tree mask_vec = build_constructor (vec_type, input_mask_vals);
183 
184   tree and_mask_vec = build_constructor (vec_type, and_mask_vals);
185 
186   tree perm = build3 (VEC_PERM_EXPR, vec_type,
187 		      build_resize_convert_view (vec_type, operands[0]),
188 		      build_resize_convert_view (vec_type, operands[0]),
189 		      mask_vec);
190 
191   tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);
192 
193   size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
194   tree raw_type = build_nonstandard_integer_type (s, true);
195 
196   tree as_int = build_resize_convert_view (raw_type, cleared);
197 
198   if (int_size_in_bytes (src_element_type) < 4)
199     {
200       if (INTEGRAL_TYPE_P (src_element_type))
201 	return extend_int (as_int, uint32_type_node, src_element_type);
202     }
203   return as_int;
204 }
205 
206 /* Packs (inserts) a scalar element in OPERANDS[1]
207    to the vector in OPERANDS[0] at element position defined by
208    OPERANDS[2].  */
209 
210 tree
211 brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
212 {
213   /* Implement using a bit level insertion.
214      TODO: Reuse this for implementing 'bitinsert'
215      without a builtin call.  */
216 
217   size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
218   size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
219   tree wide_type = build_nonstandard_integer_type (vecsize, 1);
220 
221   tree src_vect = build_resize_convert_view (wide_type, operands[0]);
222   src_vect = add_temp_var ("src_vect", src_vect);
223 
224   tree scalar = operands[1];
225   scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
226 
227   tree pos = operands[2];
228 
229   /* The upper bits of the position can contain garbage.
230      Zero them for well-defined semantics.  */
231   tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
232 		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
233   pos = add_temp_var ("pos", convert (wide_type, t));
234 
235   tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
236   size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
237   tree ewidth = build_int_cstu (wide_type, element_width);
238 
239   tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
240   bitoffset = add_temp_var ("offset", bitoffset);
241 
242   uint64_t mask_int
243     = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
244 
245   tree mask = build_int_cstu (wide_type, mask_int);
246 
247   mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
248 
249   tree clearing_mask
250     = build1 (BIT_NOT_EXPR, wide_type,
251 	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));
252 
253   tree zeroed_element
254     = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);
255 
256   /* TODO: Is the AND necessary: does HSA define what
257      happens if the upper bits in the inserted element are not
258      zero? */
259   tree element_in_position
260     = build2 (LSHIFT_EXPR, wide_type,
261 	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);
262 
263   tree inserted
264     = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
265   return inserted;
266 }
267 
268 /* Implement the unpack{lo,hi}.  BRIG_OPCODE should tell which one and
269    ARITH_TYPE describe the type of the vector arithmetics.
270    OPERANDS[0] and OPERANDS[1] are the input vectors.  */
271 
272 tree
273 brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
274 						tree arith_type,
275 						tree_stl_vec &operands)
276 {
277   tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
278   tree mask_vec_type
279     = build_vector_type (element_type,
280 			 gccbrig_type_vector_subparts (arith_type));
281 
282   size_t element_count = gccbrig_type_vector_subparts (arith_type);
283   vec<constructor_elt, va_gc> *input_mask_vals = NULL;
284 
285   size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
286 
287   for (size_t i = 0; i < element_count / 2; ++i)
288     {
289       CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
290 			      build_int_cst (element_type, offset + i));
291       CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
292 			      build_int_cst (element_type,
293 					     offset + i + element_count));
294     }
295 
296   tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
297 
298   tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
299 		      operands[1], mask_vec);
300   return perm;
301 }
302 
303 /* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
304    is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
305    desired tree type for the instruction, and OPERANDS the instruction's
306    input operands already converted to tree nodes.  */
307 
308 tree
309 brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
310 					  BrigType16_t brig_type,
311 					  tree arith_type,
312 					  tree_stl_vec &operands)
313 {
314   tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
315 
316   BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
317 
318   tree instr_inner_type
319     = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;
320 
321   if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
322     {
323       /* HSA defines modulo/clipping behavior for shift amounts larger
324 	 than the bit width, while tree.def leaves it undefined.
325 	 We need to mask the upper bits to ensure the defined behavior.  */
326       tree scalar_mask
327 	= build_int_cst (instr_inner_type,
328 			 gccbrig_hsa_type_bit_size (inner_type) - 1);
329 
330       tree mask = VECTOR_TYPE_P (arith_type)
331 		    ? build_vector_from_val (arith_type, scalar_mask)
332 		    : scalar_mask;
333 
334       /* The shift amount is a scalar, broadcast it to produce
335 	 a vector shift.  */
336       if (VECTOR_TYPE_P (arith_type))
337 	operands[1] = build_vector_from_val (arith_type, operands[1]);
338       operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
339     }
340 
341   size_t input_count = operands.size ();
342   size_t output_count = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ?
343     1 : 0;
344 
345   if (opcode == TREE_LIST)
346     {
347       /* There was no direct GENERIC opcode for the instruction;
348 	 try to emulate it with a chain of GENERIC nodes.  */
349       if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
350 	{
351 	  /* There doesn't seem to be a "standard" MAD built-in in gcc so let's
352 	     use a chain of multiply + add for now (double rounding method).
353 	     It should be easier for optimizers than a custom built-in call
354 	     WIDEN_MULT_EXPR is close, but requires a double size result
355 	     type.  */
356 	  tree mult_res
357 	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
358 	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
359 	}
360       else if (brig_opcode == BRIG_OPCODE_MAD24HI)
361 	{
362 	  tree mult_res
363 	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
364 	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
365 	}
366       else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
367 	{
368 	  return build_shuffle (arith_type, operands);
369 	}
370       else if (brig_opcode == BRIG_OPCODE_UNPACKLO
371 	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
372 	{
373 	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
374 	}
375       else if (brig_opcode == BRIG_OPCODE_UNPACK)
376 	{
377 	  return build_unpack (operands);
378 	}
379       else if (brig_opcode == BRIG_OPCODE_PACK)
380 	{
381 	  return build_pack (operands);
382 	}
383       else if (brig_opcode == BRIG_OPCODE_NRSQRT)
384 	{
385 	  /* Implement as 1.0/sqrt (x) and assume gcc instruction selects to
386 	     native ISA other than a division, if available.
387 	     TODO: this will happen only with unsafe math optimizations
388 	     on which cannot be used in general to remain HSAIL compliant.
389 	     Perhaps a builtin call would be better option here.  */
390 	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
391 			 expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
392 						 arith_type, operands));
393 	}
394       else if (brig_opcode == BRIG_OPCODE_NRCP)
395 	{
396 	  /* Implement as 1.0/x and assume gcc instruction selects to
397 	     native ISA other than a division, if available.  */
398 	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
399 			 operands[0]);
400 	}
401       else if (brig_opcode == BRIG_OPCODE_LANEID
402 	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
403 	       || brig_opcode == BRIG_OPCODE_WAVEID)
404 	{
405 	  /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
406 	     MAXWAVEID always return 0.  */
407 	  return build_zero_cst (arith_type);
408 	}
409       else
410 	gcc_unreachable ();
411     }
412   else if (opcode == CALL_EXPR)
413     return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
414 				   operands);
415   else if (output_count == 1)
416     {
417       if (input_count == 1)
418 	{
419 	  if (opcode == MODIFY_EXPR)
420 	    return operands[0];
421 	  else
422 	    return build1 (opcode, arith_type, operands[0]);
423 	}
424       else if (input_count == 2)
425 	return build2 (opcode, arith_type, operands[0], operands[1]);
426       else if (input_count == 3)
427 	return build3 (opcode, arith_type, operands[0], operands[1],
428 		       operands[2]);
429       else
430 	gcc_unreachable ();
431     }
432   else
433     gcc_unreachable ();
434 
435   return NULL_TREE;
436 }
437 
/* Handles the basic instructions, including packed instructions. Deals
   with the different packing modes by unpacking/packing the wanted
   elements.  Delegates most of the instruction cases to build_inst_expr().
   BASE points to the BRIG instruction entry; returns its byte count so
   the caller can advance to the next entry.  */

size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;

  tree_stl_vec operands = build_operands (*brig_inst);

  /* Operand 0 is the output operand, when the opcode has one.  */
  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

  /* Skip past the output operand (if any) to collect the inputs.  */
  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;
  else if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
	   || brig_inst->opcode == BRIG_OPCODE_LASTBIT
	   || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  /* For packed types, derive the element width and count from the
     total bit width; scalars are single-element.  */
  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetics type that should be performed with the
     operation.  This is not always the same as the original BRIG
     opcode's type due to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
		      ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
		      : get_tree_expr_type_for_hsa_type (brig_inst_type);

  tree instr_expr = NULL_TREE;

  /* The packing mode lives in kind-specific modifier structs.  */
  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

  /* For the mixed packed/scalar modes (_ps_/_sp_), broadcast the
     lowest element of the scalar-side operand so both inputs can be
     processed as vectors.  */
  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      /* Saturating packed modes are scalarized to element-wise
	 builtin calls by the visitor.  */
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      /* In kernels 'ret' jumps to the exit label instead of returning
	 directly; plain functions append a return statement.  */
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR &&
	   is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes and
	 even the scalars do not seem to work at least for char elements.

	 Let's fall back to scalarization and promotion of the vector elements
	 to larger types with the MULHI computed as a regular MUL.
	 MULHI for 2x64b seems to work with the Intel CPUs I've tested so
	 that is passed on for vector processing so there is no need for
	 128b scalar arithmetics.

	 This is not modular as these type of things do not belong to the
	 frontend, there should be a legalization phase before the backend
	 that figures out the best way to compute the MULHI for any
	 integer vector datatype.

	 TODO: promote to larger vector types instead.  For example
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to at least
	 with my x86-64.
      */
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      /* Promote each element to the next wider integer type so the
	 high part can be taken from a plain MUL.  */
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
      size_t element_count = gccbrig_type_vector_subparts (arith_type);
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  /* The upper half of the double-width product is the MULHI
	     result.  */
	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = pack (result_elements);
    }
  else
    {
      /* 'class' is always of b1 type, let's consider it by its
	 float type when building the instruction to find the
	 correct builtin.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				     arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In case of _s_ or _ss_, select only the lowest element
	 from the new input to the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep data in vector registers as much as possible
	 to avoid copies between scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	&& !gccbrig_is_bit_operation (brig_inst->opcode);

      /* The old output value supplies every element except the lowest;
	 f16 arithmetic works on the h2f-converted (f32) view, hence
	 the 32b element size below.  */
      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_resize_convert_view (half_storage_type, operands[0]));
      else
	old_value
	  = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where other elements than the lowest one
	 is picked from the old_value.  */
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  /* Index element_count == lowest element of the second
	     VEC_PERM_EXPR input (the new value); indices 1..N-1 keep
	     the old value's elements.  */
	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      /* NOTE(review): the result is stored to a temporary first,
	 presumably so the instruction expression is evaluated exactly
	 once before the permute — confirm against add_temp_var usage
	 elsewhere.  */
      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  /* Either assign the result to the output operand or, for
     output-less opcodes, append the expression as a statement.  */
  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}
700 
701 /* Create an expression that broadcasts the lowest element of the
702    vector in VEC_OPERAND to all elements of the returned vector.  */
703 
704 tree
705 brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
706 {
707   /* Build the broadcast using shuffle because there's no
708      direct broadcast in GENERIC and this way there's no need for
709      a separate extract of the lowest element.  */
710   tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
711   size_t esize = 8 * int_size_in_bytes (element_type);
712 
713   size_t element_count
714     = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
715   tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
716   vec<constructor_elt, va_gc> *constructor_vals = NULL;
717 
718   /* Construct the mask.  */
719   for (size_t i = 0; i < element_count; ++i)
720     {
721       tree cst = build_int_cstu (mask_inner_type, element_count);
722       CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
723     }
724   tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
725   tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
726 
727   return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
728 		 vec_operand, mask);
729 }
730 
731 /* Returns the tree code that should be used to implement the given
732    HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
733    (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
734    returns TREE_LIST (if it can be emulated with a simple chain of tree
735    nodes) or CALL_EXPR if the opcode should be implemented using a builtin
736    call.  */
737 
738 tree_code
739 brig_basic_inst_handler::get_tree_code_for_hsa_opcode
740   (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
741 {
742   BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
743   switch (brig_opcode)
744     {
745     case BRIG_OPCODE_NOP:
746       return NOP_EXPR;
747     case BRIG_OPCODE_ADD:
748       return PLUS_EXPR;
749     case BRIG_OPCODE_CMOV:
750       if (brig_inner_type == brig_type)
751 	return COND_EXPR;
752       else
753 	return VEC_COND_EXPR;
754     case BRIG_OPCODE_SUB:
755       return MINUS_EXPR;
756     case BRIG_OPCODE_MUL:
757     case BRIG_OPCODE_MUL24:
758       return MULT_EXPR;
759     case BRIG_OPCODE_MULHI:
760     case BRIG_OPCODE_MUL24HI:
761       return MULT_HIGHPART_EXPR;
762     case BRIG_OPCODE_DIV:
763       if (gccbrig_is_float_type (brig_inner_type))
764 	return RDIV_EXPR;
765       else
766 	return TRUNC_DIV_EXPR;
767     case BRIG_OPCODE_NEG:
768       return NEGATE_EXPR;
769     case BRIG_OPCODE_MIN:
770       if (gccbrig_is_float_type (brig_inner_type))
771 	return CALL_EXPR;
772       else
773 	return MIN_EXPR;
774     case BRIG_OPCODE_MAX:
775       if (gccbrig_is_float_type (brig_inner_type))
776 	return CALL_EXPR;
777       else
778 	return MAX_EXPR;
779     case BRIG_OPCODE_FMA:
780       return FMA_EXPR;
781     case BRIG_OPCODE_ABS:
782       return ABS_EXPR;
783     case BRIG_OPCODE_SHL:
784       return LSHIFT_EXPR;
785     case BRIG_OPCODE_SHR:
786       return RSHIFT_EXPR;
787     case BRIG_OPCODE_OR:
788       return BIT_IOR_EXPR;
789     case BRIG_OPCODE_XOR:
790       return BIT_XOR_EXPR;
791     case BRIG_OPCODE_AND:
792       return BIT_AND_EXPR;
793     case BRIG_OPCODE_NOT:
794       return BIT_NOT_EXPR;
795     case BRIG_OPCODE_RET:
796       return RETURN_EXPR;
797     case BRIG_OPCODE_MOV:
798     case BRIG_OPCODE_LDF:
799       return MODIFY_EXPR;
800     case BRIG_OPCODE_LD:
801     case BRIG_OPCODE_ST:
802       return MEM_REF;
803     case BRIG_OPCODE_BR:
804       return GOTO_EXPR;
805     case BRIG_OPCODE_REM:
806       if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
807 	return TRUNC_MOD_EXPR;
808       else
809 	return CALL_EXPR;
810     case BRIG_OPCODE_NRCP:
811     case BRIG_OPCODE_NRSQRT:
812       /* Implement as 1/f (x).  gcc should pattern detect that and
813 	 use a native instruction, if available, for it.  */
814       return TREE_LIST;
815     case BRIG_OPCODE_FLOOR:
816     case BRIG_OPCODE_CEIL:
817     case BRIG_OPCODE_SQRT:
818     case BRIG_OPCODE_NSQRT:
819     case BRIG_OPCODE_RINT:
820     case BRIG_OPCODE_TRUNC:
821     case BRIG_OPCODE_POPCOUNT:
822     case BRIG_OPCODE_COPYSIGN:
823     case BRIG_OPCODE_NCOS:
824     case BRIG_OPCODE_NSIN:
825     case BRIG_OPCODE_NLOG2:
826     case BRIG_OPCODE_NEXP2:
827     case BRIG_OPCODE_NFMA:
828       /* Class has type B1 regardless of the float type, thus
829 	 the below builtin map search cannot find it.  */
830     case BRIG_OPCODE_CLASS:
831     case BRIG_OPCODE_WORKITEMABSID:
832       return CALL_EXPR;
833     default:
834 
835       /* Some BRIG opcodes can use the same builtins for unsigned and
836 	 signed types.  Force these cases to unsigned types.
837       */
838 
839       if (brig_opcode == BRIG_OPCODE_BORROW
840 	  || brig_opcode == BRIG_OPCODE_CARRY
841 	  || brig_opcode == BRIG_OPCODE_LASTBIT
842 	  || brig_opcode == BRIG_OPCODE_BITINSERT)
843 	{
844 	  if (brig_type == BRIG_TYPE_S32)
845 	    brig_type = BRIG_TYPE_U32;
846 	  else if (brig_type == BRIG_TYPE_S64)
847 	    brig_type = BRIG_TYPE_U64;
848 	}
849 
850 
851       builtin_map::const_iterator i
852 	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
853       if (i != s_custom_builtins.end ())
854 	return CALL_EXPR;
855       else if (s_custom_builtins.find
856 	       (std::make_pair (brig_opcode, brig_inner_type))
857 	       != s_custom_builtins.end ())
858 	return CALL_EXPR;
859       if (brig_inner_type == BRIG_TYPE_F16
860 	  && s_custom_builtins.find
861 	  (std::make_pair (brig_opcode, BRIG_TYPE_F32))
862 	  != s_custom_builtins.end ())
863 	return CALL_EXPR;
864       break;
865     }
866   return TREE_LIST; /* Emulate using a chain of nodes.  */
867 }
868