xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/brig/brigfrontend/brig-basic-inst-handler.cc (revision 4c3eb207d36f67d31994830c0a694161fc1ca39b)
/* brig-basic-inst-handler.cc -- brig basic instruction handling
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
   for General Processor Tech.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3, or (at your option) any later
   version.

   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#include <sstream>

#include "brig-code-entry-handler.h"
#include "brig-util.h"

#include "errors.h"
#include "gimple-expr.h"
#include "convert.h"
#include "print-tree.h"
#include "tree-pretty-print.h"
#include "langhooks.h"
#include "stor-layout.h"
#include "diagnostic-core.h"
#include "brig-builtins.h"
#include "fold-const.h"

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}

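/* Expands saturating packed arithmetic element by element, calling for
   each element pair the scalar saturating arithmetic builtin that matches
   the instruction's opcode and element type.  */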
class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN

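/* Including brig-builtins.def below expands each DEF_HSAIL_SAT_BUILTIN
   entry into an "if (opcode and element type match) m_builtin = ...; else"
   arm, forming a selection chain that falls through to gcc_unreachable ()
   when no saturating arithmetic builtin matches.  */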
#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetic with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  const BrigInstBase &m_brig_inst;
  tree m_builtin;
};

/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
   OPERANDS[0] is the first vector, OPERANDS[1] the second vector and
   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a
   VEC_PERM_EXPR that implements the shuffle as a GENERIC expression.  */
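/* E.g. for four-element vectors, an HSAIL mask whose packed two-bit fields
   are {1, 0, 3, 2} becomes the VEC_PERM_EXPR selector {1, 0, 7, 6}: the low
   half of the output elements is taken from the first input vector and the
   high half from the second.  */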

tree
brig_basic_inst_handler::build_shuffle (tree arith_type,
					tree_stl_vec &operands)
{
  tree element_type
    = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));

  /* Offsets to add to the mask values to convert the HSAIL mask to a
     VEC_PERM_EXPR mask.  A VEC_PERM_EXPR mask indexes from 0 to two times
     the vector width, whereas HSAIL refers separately to the two input
     vectors and thus is not a "full shuffle" where all output elements
     could originate from any input element.  */
  vec<constructor_elt, va_gc> *mask_offset_vals = NULL;

  unsigned int element_count = gccbrig_type_vector_subparts (arith_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  size_t input_mask_element_size = exact_log2 (element_count);

  /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
     from which to construct the mask vector as understood by
     VEC_PERM_EXPR.  */
  tree mask_operand
    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);

  tree mask_element_type
    = build_nonstandard_integer_type (input_mask_element_size, true);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element
	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
		  bitsize_int (input_mask_element_size),
		  bitsize_int (i * input_mask_element_size));

      mask_element = convert (element_type, mask_element);

      tree offset;
      if (i < element_count / 2)
	offset = build_int_cst (element_type, 0);
      else
	offset = build_int_cst (element_type, element_count);

      CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
    }
  tree mask_vec_type = build_vector_type (element_type, element_count);

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
  tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);

  tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask);
  return perm;
}

/* Unpacks (extracts) a scalar element at the index given in OPERANDS[1]
   from the vector expression in OPERANDS[0].  */

tree
brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
{
  /* Implement the unpack with a shuffle that moves the unpacked element
     to the lowest bit positions in the destination.  After that a bitwise
     AND is used to clear the uppermost bits.  */
  tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));

  /* Perform the operations with a raw (unsigned integer) type.  */
  tree element_type = get_unsigned_int_type (src_element_type);

  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
  vec<constructor_elt, va_gc> *and_mask_vals = NULL;

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  tree vec_type = build_vector_type (element_type, element_count);

  for (size_t i = 0; i < element_count; ++i)
    {
      tree mask_element;
      if (i == 0)
	mask_element = convert (element_type, operands[1]);
      else
	mask_element = build_int_cst (element_type, 0);

      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);

      tree and_mask_element;
      if (i == 0)
	and_mask_element = build_int_cst (element_type, -1);
      else
	and_mask_element = build_int_cst (element_type, 0);
      CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
    }

  tree mask_vec = build_constructor (vec_type, input_mask_vals);

  tree and_mask_vec = build_constructor (vec_type, and_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, vec_type,
		      build_resize_convert_view (vec_type, operands[0]),
		      build_resize_convert_view (vec_type, operands[0]),
		      mask_vec);

  tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);

  size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
  tree raw_type = build_nonstandard_integer_type (s, true);

  tree as_int = build_resize_convert_view (raw_type, cleared);

  if (int_size_in_bytes (src_element_type) < 4)
    {
      if (INTEGRAL_TYPE_P (src_element_type))
	return extend_int (as_int, uint32_type_node, src_element_type);
    }
  return as_int;
}

/* Packs (inserts) the scalar element in OPERANDS[1] into the vector in
   OPERANDS[0] at the element position defined by OPERANDS[2].  */

tree
brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
{
  /* Implement using a bit level insertion.
     TODO: Reuse this for implementing 'bitinsert'
     without a builtin call.  */
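  /* In essence, with element width W, insertion position P and a mask with
     the low W bits set, all computed in a vector-wide integer type:

       result = (src & ~(mask << (W * P))) | ((scalar & mask) << (W * P))  */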

  size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
  size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
  tree wide_type = build_nonstandard_integer_type (vecsize, 1);

  tree src_vect = build_resize_convert_view (wide_type, operands[0]);
  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);

  tree scalar = operands[1];
  scalar = m_parent.m_cf->add_temp_var ("scalar",
					convert_to_integer (wide_type, scalar));

  tree pos = operands[2];

  /* The upper bits of the position can contain garbage.
     Zero them for well-defined semantics.  */
  tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));

  tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
  size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
  tree ewidth = build_int_cstu (wide_type, element_width);

  tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);

  uint64_t mask_int
    = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;

  tree mask = build_int_cstu (wide_type, mask_int);

  mask = m_parent.m_cf->add_temp_var ("mask",
				      convert_to_integer (wide_type, mask));

  tree clearing_mask
    = build1 (BIT_NOT_EXPR, wide_type,
	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));

  tree zeroed_element
    = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);

  /* TODO: Is the AND necessary: does HSA define what
     happens if the upper bits in the inserted element are not
     zero? */
  tree element_in_position
    = build2 (LSHIFT_EXPR, wide_type,
	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);

  tree inserted
    = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
  return inserted;
}

/* Implements the unpack{lo,hi} instructions.  BRIG_OPCODE tells which one
   and ARITH_TYPE describes the type of the vector arithmetic.
   OPERANDS[0] and OPERANDS[1] are the input vectors.  */
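/* E.g. for four-element vectors UNPACKLO produces the interleaving
   permutation {0, 4, 1, 5} and UNPACKHI the permutation {2, 6, 3, 7}.  */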

tree
brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
						tree arith_type,
						tree_stl_vec &operands)
{
  tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
  tree mask_vec_type
    = build_vector_type (element_type,
			 gccbrig_type_vector_subparts (arith_type));

  size_t element_count = gccbrig_type_vector_subparts (arith_type);
  vec<constructor_elt, va_gc> *input_mask_vals = NULL;

  size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;

  for (size_t i = 0; i < element_count / 2; ++i)
    {
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type, offset + i));
      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
			      build_int_cst (element_type,
					     offset + i + element_count));
    }

  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);

  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
		      operands[1], mask_vec);
  return perm;
}

/* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
   is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
   desired tree type for the instruction, and OPERANDS the instruction's
   input operands already converted to tree nodes.  */

tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
					  BrigType16_t brig_type,
					  tree arith_type,
					  tree_stl_vec &operands)
{
  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);

  BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  tree instr_inner_type
    = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;

  if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
    {
      /* HSA defines modulo/clipping behavior for shift amounts larger
	 than the bit width, while tree.def leaves it undefined.
	 We need to mask the upper bits to ensure the defined behavior.  */
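      /* E.g. for a 32-bit element type the mask is 31, so an HSAIL shift
	 amount of 33 behaves like a shift by 1.  */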
      tree scalar_mask
	= build_int_cst (instr_inner_type,
			 gccbrig_hsa_type_bit_size (inner_type) - 1);

      tree mask = VECTOR_TYPE_P (arith_type)
		    ? build_vector_from_val (arith_type, scalar_mask)
		    : scalar_mask;

      /* The shift amount is a scalar; broadcast it to produce
	 a vector shift.  */
      if (VECTOR_TYPE_P (arith_type))
	operands[1] = build_vector_from_val (arith_type, operands[1]);
      operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
    }

  size_t input_count = operands.size ();
  size_t output_count = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ?
    1 : 0;

  if (opcode == TREE_LIST)
    {
      /* There was no direct GENERIC opcode for the instruction;
	 try to emulate it with a chain of GENERIC nodes.  */
      if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
	{
	  /* There doesn't seem to be a "standard" MAD built-in in gcc, so
	     use a chain of multiply + add for now (double rounding method).
	     It should be easier for optimizers than a custom built-in call.
	     WIDEN_MULT_EXPR is close, but requires a double size result
	     type.  */
	  tree mult_res
	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_MAD24HI)
	{
	  tree mult_res
	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
	{
	  return build_shuffle (arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACKLO
	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
	{
	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACK)
	{
	  return build_unpack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_PACK)
	{
	  return build_pack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_NRSQRT)
	{
	  /* Implement as 1.0/sqrt (x) and assume gcc instruction-selects
	     this to a native reciprocal square root instruction instead of
	     a division, if one is available.
	     TODO: that happens only with unsafe math optimizations on,
	     which cannot be used in general while remaining HSAIL
	     compliant.  Perhaps a builtin call would be a better option
	     here.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 m_parent.m_cf->expand_or_call_builtin
			 (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
	}
      else if (brig_opcode == BRIG_OPCODE_NRCP)
	{
	  /* Implement as 1.0/x and assume gcc instruction-selects this to
	     a native reciprocal instruction instead of a division, if one
	     is available.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 operands[0]);
	}
      else if (brig_opcode == BRIG_OPCODE_LANEID
	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
	       || brig_opcode == BRIG_OPCODE_WAVEID)
	{
	  /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
	     MAXWAVEID always return 0.  */
	  return build_zero_cst (arith_type);
	}
      else
	gcc_unreachable ();
    }
  else if (opcode == CALL_EXPR)
    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
						  arith_type, operands);
  else if (output_count == 1)
    {
      if (input_count == 1)
	{
	  if (opcode == MODIFY_EXPR)
	    return operands[0];
	  else
	    return build1 (opcode, arith_type, operands[0]);
	}
      else if (input_count == 2)
	return build2 (opcode, arith_type, operands[0], operands[1]);
      else if (input_count == 3)
	return build3 (opcode, arith_type, operands[0], operands[1],
		       operands[2]);
      else
	gcc_unreachable ();
    }
  else
    gcc_unreachable ();

  return NULL_TREE;
}

/* Handles the basic instructions, including packed instructions.  Deals
   with the different packing modes by unpacking/packing the wanted
   elements.  Delegates most of the instruction cases to build_inst_expr ().  */

size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;
  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;

  tree_stl_vec operands = build_operands (*brig_inst);

  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

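  /* When the instruction has an output, it is operand 0 in the BRIG
     operand list; the remaining operands are the inputs.  */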
  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
      || brig_inst->opcode == BRIG_OPCODE_LASTBIT
      || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetic type the operation should be performed in.
     This is not always the same as the original BRIG opcode's type due
     to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
		      ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
		      : get_tree_expr_type_for_hsa_type (brig_inst_type);

  tree instr_expr = NULL_TREE;

  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

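  /* In the _ps_/_sp_ packing modes one of the sources is used only via its
     lowest element; broadcast that element across the whole vector so the
     operation itself can stay element-wise.  */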
  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
						   brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR
	   && is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target-dependent vector sizes,
	 and even the scalar versions do not seem to work, at least for
	 char elements.

	 Fall back to scalarization and promotion of the vector elements
	 to larger types, with the MULHI computed as a regular MUL.
	 MULHI for 2x64b seems to work with the Intel CPUs tested so far,
	 so that case is passed on to vector processing and there is no
	 need for 128b scalar arithmetic.

	 This is not modular, as these kinds of things do not belong in
	 the frontend; there should be a legalization phase before the
	 backend that figures out the best way to compute the MULHI for
	 any integer vector datatype.

	 TODO: promote to larger vector types instead.  For example
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to,
	 at least on x86-64.  */
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	m_parent.m_cf->unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	m_parent.m_cf->unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

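      /* Per element, the high part is computed in the promoted (double
	 width) type as (a * b) >> element_bits.  */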
      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
      size_t element_count = gccbrig_type_vector_subparts (arith_type);
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = m_parent.m_cf->pack (result_elements);
    }
  else
    {
      /* 'class' always has the b1 type; consider it by its float source
	 type when building the instruction so the correct builtin is
	 found.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				    arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In case of _s_ or _ss_, select only the lowest element
	 from the new result into the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep data in vector registers as much as possible
	 to avoid copies between the scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	&& !gccbrig_is_bit_operation (brig_inst->opcode);

      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_resize_convert_view (half_storage_type, operands[0]));
      else
	old_value
	  = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where all elements other than the
	 lowest one are picked from the old value.  */
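      /* E.g. for a four-element result the selector below is {4, 1, 2, 3}:
	 element 0 is taken from the new value (the second VEC_PERM_EXPR
	 input) and the rest from the old destination value.  */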
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}

/* Creates an expression that broadcasts the lowest element of the
   vector in VEC_OPERAND to all elements of the returned vector.  */

tree
brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
{
  /* Build the broadcast using a shuffle, because there's no direct
     broadcast in GENERIC and this way there's no need for a separate
     extract of the lowest element.  */
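  /* With VEC_OPERAND used as both VEC_PERM_EXPR inputs, a selector in which
     every index equals the element count picks element 0 of the second
     input for each output element, i.e. the lowest element gets
     broadcast.  */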
  tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
  size_t esize = 8 * int_size_in_bytes (element_type);

  size_t element_count
    = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
  tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
  vec<constructor_elt, va_gc> *constructor_vals = NULL;

  /* Construct the mask.  */
  for (size_t i = 0; i < element_count; ++i)
    {
      tree cst = build_int_cstu (mask_inner_type, element_count);
      CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
    }
  tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
  tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

  return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
		 vec_operand, mask);
}