1 /* brig-basic-inst-handler.cc -- brig basic instruction handling 2 Copyright (C) 2016-2018 Free Software Foundation, Inc. 3 Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com> 4 for General Processor Tech. 5 6 This file is part of GCC. 7 8 GCC is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free 10 Software Foundation; either version 3, or (at your option) any later 11 version. 12 13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or 15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 16 for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with GCC; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include <sstream> 23 24 #include "brig-code-entry-handler.h" 25 #include "brig-util.h" 26 27 #include "errors.h" 28 #include "gimple-expr.h" 29 #include "convert.h" 30 #include "print-tree.h" 31 #include "tree-pretty-print.h" 32 #include "langhooks.h" 33 #include "stor-layout.h" 34 #include "diagnostic-core.h" 35 #include "brig-builtins.h" 36 #include "fold-const.h" 37 38 brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent) 39 : brig_code_entry_handler (parent) 40 { 41 } 42 43 class scalarized_sat_arithmetics : public tree_element_binary_visitor 44 { 45 public: 46 scalarized_sat_arithmetics (const BrigInstBase &brig_inst) 47 : m_brig_inst (brig_inst) 48 { 49 BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK; 50 51 #undef DEF_HSAIL_SAT_BUILTIN 52 #undef DEF_HSAIL_BUILTIN 53 #undef DEF_HSAIL_ATOMIC_BUILTIN 54 #undef DEF_HSAIL_INTR_BUILTIN 55 #undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN 56 57 #define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE, \ 58 NAME, TYPE, ATTRS) \ 59 if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE) \ 60 m_builtin = 
builtin_decl_explicit (ENUM); \ 61 else 62 #include "brig-builtins.def" 63 gcc_unreachable (); 64 } 65 66 virtual tree 67 visit_element (brig_code_entry_handler &, tree operand0, tree operand1) 68 { 69 /* Implement saturating arithmetics with scalar built-ins for now. 70 TODO: emit GENERIC nodes for the simplest cases or at least 71 emit vector built-ins. */ 72 return call_builtin (m_builtin, 2, TREE_TYPE (operand0), 73 TREE_TYPE (operand0), operand0, 74 TREE_TYPE (operand1), operand1); 75 } 76 const BrigInstBase &m_brig_inst; 77 tree m_builtin; 78 }; 79 80 /* Implements a vector shuffle. ARITH_TYPE is the type of the vector, 81 OPERANDS[0] is the first vector, OPERAND[1] the second vector and 82 OPERANDS[2] the shuffle mask in HSAIL format. The output is a VEC_PERM_EXPR 83 that implements the shuffle as a GENERIC expression. */ 84 85 tree 86 brig_basic_inst_handler::build_shuffle (tree arith_type, 87 tree_stl_vec &operands) 88 { 89 tree element_type 90 = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0]))); 91 92 /* Offsets to add to the mask values to convert from the 93 HSAIL mask to VEC_PERM_EXPR masks. VEC_PERM_EXPR mask 94 assumes an index spanning from 0 to 2 times the vec 95 width while HSAIL refers separately to two different 96 input vectors, thus is not a "full shuffle" where all 97 output elements can originate from any input element. */ 98 vec<constructor_elt, va_gc> *mask_offset_vals = NULL; 99 100 unsigned int element_count = gccbrig_type_vector_subparts (arith_type); 101 102 vec<constructor_elt, va_gc> *input_mask_vals = NULL; 103 size_t input_mask_element_size = exact_log2 (element_count); 104 105 /* Unpack the tightly packed mask elements to BIT_FIELD_REFs 106 from which to construct the mask vector as understood by 107 VEC_PERM_EXPR. 
*/ 108 tree mask_operand = add_temp_var ("shuffle_mask", operands[2]); 109 110 tree mask_element_type 111 = build_nonstandard_integer_type (input_mask_element_size, true); 112 113 for (size_t i = 0; i < element_count; ++i) 114 { 115 tree mask_element 116 = build3 (BIT_FIELD_REF, mask_element_type, mask_operand, 117 bitsize_int (input_mask_element_size), 118 bitsize_int (i * input_mask_element_size)); 119 120 mask_element = convert (element_type, mask_element); 121 122 tree offset; 123 if (i < element_count / 2) 124 offset = build_int_cst (element_type, 0); 125 else 126 offset = build_int_cst (element_type, element_count); 127 128 CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset); 129 CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element); 130 } 131 tree mask_vec_type = build_vector_type (element_type, element_count); 132 133 tree mask_vec = build_constructor (mask_vec_type, input_mask_vals); 134 tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals); 135 136 tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec); 137 138 tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0], 139 operands[1], mask); 140 return perm; 141 } 142 143 /* Unpacks (extracts) a scalar element with an index in OPERANDS[1] 144 from the vector expression in OPERANDS[0]. */ 145 146 tree 147 brig_basic_inst_handler::build_unpack (tree_stl_vec &operands) 148 { 149 /* Implement the unpack with a shuffle that stores the unpacked 150 element to the lowest bit positions in the dest. After that 151 a bitwise AND is used to clear the uppermost bits. */ 152 tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0])); 153 154 /* Perform the operations with a raw (unsigned int type) type. 
*/ 155 tree element_type = get_unsigned_int_type (src_element_type); 156 157 vec<constructor_elt, va_gc> *input_mask_vals = NULL; 158 vec<constructor_elt, va_gc> *and_mask_vals = NULL; 159 160 size_t element_count 161 = gccbrig_type_vector_subparts (TREE_TYPE (operands[0])); 162 tree vec_type = build_vector_type (element_type, element_count); 163 164 for (size_t i = 0; i < element_count; ++i) 165 { 166 tree mask_element; 167 if (i == 0) 168 mask_element = convert (element_type, operands[1]); 169 else 170 mask_element = build_int_cst (element_type, 0); 171 172 CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element); 173 174 tree and_mask_element; 175 if (i == 0) 176 and_mask_element = build_int_cst (element_type, -1); 177 else 178 and_mask_element = build_int_cst (element_type, 0); 179 CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element); 180 } 181 182 tree mask_vec = build_constructor (vec_type, input_mask_vals); 183 184 tree and_mask_vec = build_constructor (vec_type, and_mask_vals); 185 186 tree perm = build3 (VEC_PERM_EXPR, vec_type, 187 build_resize_convert_view (vec_type, operands[0]), 188 build_resize_convert_view (vec_type, operands[0]), 189 mask_vec); 190 191 tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec); 192 193 size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT; 194 tree raw_type = build_nonstandard_integer_type (s, true); 195 196 tree as_int = build_resize_convert_view (raw_type, cleared); 197 198 if (int_size_in_bytes (src_element_type) < 4) 199 { 200 if (INTEGRAL_TYPE_P (src_element_type)) 201 return extend_int (as_int, uint32_type_node, src_element_type); 202 } 203 return as_int; 204 } 205 206 /* Packs (inserts) a scalar element in OPERANDS[1] 207 to the vector in OPERANDS[0] at element position defined by 208 OPERANDS[2]. */ 209 210 tree 211 brig_basic_inst_handler::build_pack (tree_stl_vec &operands) 212 { 213 /* Implement using a bit level insertion. 
214 TODO: Reuse this for implementing 'bitinsert' 215 without a builtin call. */ 216 217 size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0])); 218 size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT; 219 tree wide_type = build_nonstandard_integer_type (vecsize, 1); 220 221 tree src_vect = build_resize_convert_view (wide_type, operands[0]); 222 src_vect = add_temp_var ("src_vect", src_vect); 223 224 tree scalar = operands[1]; 225 scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar)); 226 227 tree pos = operands[2]; 228 229 /* The upper bits of the position can contain garbage. 230 Zero them for well-defined semantics. */ 231 tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2], 232 build_int_cstu (TREE_TYPE (pos), ecount - 1)); 233 pos = add_temp_var ("pos", convert (wide_type, t)); 234 235 tree element_type = TREE_TYPE (TREE_TYPE (operands[0])); 236 size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT; 237 tree ewidth = build_int_cstu (wide_type, element_width); 238 239 tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos); 240 bitoffset = add_temp_var ("offset", bitoffset); 241 242 uint64_t mask_int 243 = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1; 244 245 tree mask = build_int_cstu (wide_type, mask_int); 246 247 mask = add_temp_var ("mask", convert_to_integer (wide_type, mask)); 248 249 tree clearing_mask 250 = build1 (BIT_NOT_EXPR, wide_type, 251 build2 (LSHIFT_EXPR, wide_type, mask, bitoffset)); 252 253 tree zeroed_element 254 = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask); 255 256 /* TODO: Is the AND necessary: does HSA define what 257 happens if the upper bits in the inserted element are not 258 zero? 
*/ 259 tree element_in_position 260 = build2 (LSHIFT_EXPR, wide_type, 261 build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset); 262 263 tree inserted 264 = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position); 265 return inserted; 266 } 267 268 /* Implement the unpack{lo,hi}. BRIG_OPCODE should tell which one and 269 ARITH_TYPE describe the type of the vector arithmetics. 270 OPERANDS[0] and OPERANDS[1] are the input vectors. */ 271 272 tree 273 brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, 274 tree arith_type, 275 tree_stl_vec &operands) 276 { 277 tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type)); 278 tree mask_vec_type 279 = build_vector_type (element_type, 280 gccbrig_type_vector_subparts (arith_type)); 281 282 size_t element_count = gccbrig_type_vector_subparts (arith_type); 283 vec<constructor_elt, va_gc> *input_mask_vals = NULL; 284 285 size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2; 286 287 for (size_t i = 0; i < element_count / 2; ++i) 288 { 289 CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, 290 build_int_cst (element_type, offset + i)); 291 CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, 292 build_int_cst (element_type, 293 offset + i + element_count)); 294 } 295 296 tree mask_vec = build_constructor (mask_vec_type, input_mask_vals); 297 298 tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0], 299 operands[1], mask_vec); 300 return perm; 301 } 302 303 /* Builds a basic instruction expression from a BRIG instruction. BRIG_OPCODE 304 is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the 305 desired tree type for the instruction, and OPERANDS the instruction's 306 input operands already converted to tree nodes. 
*/ 307 308 tree 309 brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode, 310 BrigType16_t brig_type, 311 tree arith_type, 312 tree_stl_vec &operands) 313 { 314 tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type); 315 316 BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK; 317 318 tree instr_inner_type 319 = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type; 320 321 if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR) 322 { 323 /* HSA defines modulo/clipping behavior for shift amounts larger 324 than the bit width, while tree.def leaves it undefined. 325 We need to mask the upper bits to ensure the defined behavior. */ 326 tree scalar_mask 327 = build_int_cst (instr_inner_type, 328 gccbrig_hsa_type_bit_size (inner_type) - 1); 329 330 tree mask = VECTOR_TYPE_P (arith_type) 331 ? build_vector_from_val (arith_type, scalar_mask) 332 : scalar_mask; 333 334 /* The shift amount is a scalar, broadcast it to produce 335 a vector shift. */ 336 if (VECTOR_TYPE_P (arith_type)) 337 operands[1] = build_vector_from_val (arith_type, operands[1]); 338 operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask); 339 } 340 341 size_t input_count = operands.size (); 342 size_t output_count = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ? 343 1 : 0; 344 345 if (opcode == TREE_LIST) 346 { 347 /* There was no direct GENERIC opcode for the instruction; 348 try to emulate it with a chain of GENERIC nodes. */ 349 if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24) 350 { 351 /* There doesn't seem to be a "standard" MAD built-in in gcc so let's 352 use a chain of multiply + add for now (double rounding method). 353 It should be easier for optimizers than a custom built-in call 354 WIDEN_MULT_EXPR is close, but requires a double size result 355 type. 
*/ 356 tree mult_res 357 = build2 (MULT_EXPR, arith_type, operands[0], operands[1]); 358 return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]); 359 } 360 else if (brig_opcode == BRIG_OPCODE_MAD24HI) 361 { 362 tree mult_res 363 = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]); 364 return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]); 365 } 366 else if (brig_opcode == BRIG_OPCODE_SHUFFLE) 367 { 368 return build_shuffle (arith_type, operands); 369 } 370 else if (brig_opcode == BRIG_OPCODE_UNPACKLO 371 || brig_opcode == BRIG_OPCODE_UNPACKHI) 372 { 373 return build_unpack_lo_or_hi (brig_opcode, arith_type, operands); 374 } 375 else if (brig_opcode == BRIG_OPCODE_UNPACK) 376 { 377 return build_unpack (operands); 378 } 379 else if (brig_opcode == BRIG_OPCODE_PACK) 380 { 381 return build_pack (operands); 382 } 383 else if (brig_opcode == BRIG_OPCODE_NRSQRT) 384 { 385 /* Implement as 1.0/sqrt (x) and assume gcc instruction selects to 386 native ISA other than a division, if available. 387 TODO: this will happen only with unsafe math optimizations 388 on which cannot be used in general to remain HSAIL compliant. 389 Perhaps a builtin call would be better option here. */ 390 return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type), 391 expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type, 392 arith_type, operands)); 393 } 394 else if (brig_opcode == BRIG_OPCODE_NRCP) 395 { 396 /* Implement as 1.0/x and assume gcc instruction selects to 397 native ISA other than a division, if available. */ 398 return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type), 399 operands[0]); 400 } 401 else if (brig_opcode == BRIG_OPCODE_LANEID 402 || brig_opcode == BRIG_OPCODE_MAXWAVEID 403 || brig_opcode == BRIG_OPCODE_WAVEID) 404 { 405 /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and 406 MAXWAVEID always return 0. 
*/ 407 return build_zero_cst (arith_type); 408 } 409 else 410 gcc_unreachable (); 411 } 412 else if (opcode == CALL_EXPR) 413 return expand_or_call_builtin (brig_opcode, brig_type, arith_type, 414 operands); 415 else if (output_count == 1) 416 { 417 if (input_count == 1) 418 { 419 if (opcode == MODIFY_EXPR) 420 return operands[0]; 421 else 422 return build1 (opcode, arith_type, operands[0]); 423 } 424 else if (input_count == 2) 425 return build2 (opcode, arith_type, operands[0], operands[1]); 426 else if (input_count == 3) 427 return build3 (opcode, arith_type, operands[0], operands[1], 428 operands[2]); 429 else 430 gcc_unreachable (); 431 } 432 else 433 gcc_unreachable (); 434 435 return NULL_TREE; 436 } 437 438 /* Handles the basic instructions, including packed instructions. Deals 439 with the different packing modes by unpacking/packing the wanted 440 elements. Delegates most of the instruction cases to build_inst_expr(). */ 441 442 size_t 443 brig_basic_inst_handler::operator () (const BrigBase *base) 444 { 445 const BrigInstBase *brig_inst = (const BrigInstBase *) base; 446 447 tree_stl_vec operands = build_operands (*brig_inst); 448 449 size_t output_count 450 = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0; 451 size_t input_count 452 = operands.size () == 0 ? 
0 : (operands.size () - output_count); 453 454 gcc_assert (output_count == 0 || output_count == 1); 455 456 tree_stl_vec::iterator first_input_i = operands.begin (); 457 if (output_count > 0 && operands.size () > 0) 458 ++first_input_i; 459 460 tree_stl_vec in_operands; 461 in_operands.assign (first_input_i, operands.end ()); 462 463 BrigType16_t brig_inst_type = brig_inst->type; 464 465 if (brig_inst->opcode == BRIG_OPCODE_NOP) 466 return base->byteCount; 467 else if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT 468 || brig_inst->opcode == BRIG_OPCODE_LASTBIT 469 || brig_inst->opcode == BRIG_OPCODE_SAD) 470 /* These instructions are reported to be always 32b in HSAIL, but we want 471 to treat them according to their input argument's type to select the 472 correct instruction/builtin. */ 473 brig_inst_type 474 = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0])); 475 476 tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type); 477 478 if (!instr_type) 479 { 480 gcc_unreachable (); 481 return base->byteCount; 482 } 483 484 bool is_vec_instr = hsa_type_packed_p (brig_inst_type); 485 486 size_t element_size_bits; 487 size_t element_count; 488 489 if (is_vec_instr) 490 { 491 BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK; 492 element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type); 493 element_count = gccbrig_hsa_type_bit_size (brig_inst_type) 494 / gccbrig_hsa_type_bit_size (brig_element_type); 495 } 496 else 497 { 498 element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type); 499 element_count = 1; 500 } 501 502 /* The actual arithmetics type that should be performed with the 503 operation. This is not always the same as the original BRIG 504 opcode's type due to implicit conversions of storage-only f16. */ 505 tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode) 506 ? 
gccbrig_tree_type_for_hsa_type (brig_inst_type) 507 : get_tree_expr_type_for_hsa_type (brig_inst_type); 508 509 tree instr_expr = NULL_TREE; 510 511 BrigPack8_t p = BRIG_PACK_NONE; 512 if (brig_inst->base.kind == BRIG_KIND_INST_MOD) 513 p = ((const BrigInstMod *) brig_inst)->pack; 514 else if (brig_inst->base.kind == BRIG_KIND_INST_CMP) 515 p = ((const BrigInstCmp *) brig_inst)->pack; 516 517 if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT) 518 in_operands[1] = build_lower_element_broadcast (in_operands[1]); 519 else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT) 520 in_operands[0] = build_lower_element_broadcast (in_operands[0]); 521 522 tree_code opcode 523 = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type); 524 525 if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT) 526 { 527 scalarized_sat_arithmetics sat_arith (*brig_inst); 528 gcc_assert (input_count == 2); 529 instr_expr = sat_arith (*this, in_operands[0], in_operands[1]); 530 } 531 else if (opcode == RETURN_EXPR) 532 { 533 if (m_parent.m_cf->m_is_kernel) 534 { 535 tree goto_stmt 536 = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label); 537 m_parent.m_cf->append_statement (goto_stmt); 538 return base->byteCount; 539 } 540 else 541 { 542 m_parent.m_cf->append_return_stmt (); 543 return base->byteCount; 544 } 545 } 546 else if (opcode == MULT_HIGHPART_EXPR && 547 is_vec_instr && element_size_bits < 64) 548 { 549 /* MULT_HIGHPART_EXPR works only on target dependent vector sizes and 550 even the scalars do not seem to work at least for char elements. 551 552 Let's fall back to scalarization and promotion of the vector elements 553 to larger types with the MULHI computed as a regular MUL. 554 MULHI for 2x64b seems to work with the Intel CPUs I've tested so 555 that is passed on for vector processing so there is no need for 556 128b scalar arithmetics. 
557 558 This is not modular as these type of things do not belong to the 559 frontend, there should be a legalization phase before the backend 560 that figures out the best way to compute the MULHI for any 561 integer vector datatype. 562 563 TODO: promote to larger vector types instead. For example 564 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to at least 565 with my x86-64. 566 */ 567 tree_stl_vec operand0_elements; 568 if (input_count > 0) 569 unpack (in_operands[0], operand0_elements); 570 571 tree_stl_vec operand1_elements; 572 if (input_count > 1) 573 unpack (in_operands[1], operand1_elements); 574 575 tree_stl_vec result_elements; 576 577 tree scalar_type = TREE_TYPE (arith_type); 578 BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK; 579 tree promoted_type = short_integer_type_node; 580 switch (element_type) 581 { 582 case BRIG_TYPE_S8: 583 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16); 584 break; 585 case BRIG_TYPE_U8: 586 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16); 587 break; 588 case BRIG_TYPE_S16: 589 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32); 590 break; 591 case BRIG_TYPE_U16: 592 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32); 593 break; 594 case BRIG_TYPE_S32: 595 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64); 596 break; 597 case BRIG_TYPE_U32: 598 promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64); 599 break; 600 default: 601 gcc_unreachable (); 602 } 603 604 size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8; 605 size_t element_count = gccbrig_type_vector_subparts (arith_type); 606 for (size_t i = 0; i < element_count; ++i) 607 { 608 tree operand0 = convert (promoted_type, operand0_elements.at (i)); 609 tree operand1 = convert (promoted_type, operand1_elements.at (i)); 610 611 tree scalar_expr 612 = build2 (MULT_EXPR, promoted_type, operand0, operand1); 613 614 scalar_expr 615 = build2 (RSHIFT_EXPR, 
promoted_type, scalar_expr, 616 build_int_cstu (promoted_type, promoted_type_size / 2)); 617 618 result_elements.push_back (convert (scalar_type, scalar_expr)); 619 } 620 instr_expr = pack (result_elements); 621 } 622 else 623 { 624 /* 'class' is always of b1 type, let's consider it by its 625 float type when building the instruction to find the 626 correct builtin. */ 627 if (brig_inst->opcode == BRIG_OPCODE_CLASS) 628 brig_inst_type = ((const BrigInstSourceType *) base)->sourceType; 629 instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type, 630 arith_type, in_operands); 631 } 632 633 if (instr_expr == NULL_TREE) 634 { 635 gcc_unreachable (); 636 return base->byteCount; 637 } 638 639 if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT 640 || p == BRIG_PACK_SSAT) 641 { 642 /* In case of _s_ or _ss_, select only the lowest element 643 from the new input to the output. We could extract 644 the element and use a scalar operation, but try 645 to keep data in vector registers as much as possible 646 to avoid copies between scalar and vector datapaths. */ 647 tree old_value; 648 tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type); 649 bool is_fp16_operation 650 = (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16 651 && !gccbrig_is_bit_operation (brig_inst->opcode); 652 653 if (is_fp16_operation) 654 old_value = build_h2f_conversion 655 (build_resize_convert_view (half_storage_type, operands[0])); 656 else 657 old_value 658 = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]); 659 660 size_t esize = is_fp16_operation ? 32 : element_size_bits; 661 662 /* Construct a permutation mask where other elements than the lowest one 663 is picked from the old_value. 
*/ 664 tree mask_inner_type = build_nonstandard_integer_type (esize, 1); 665 vec<constructor_elt, va_gc> *constructor_vals = NULL; 666 for (size_t i = 0; i < element_count; ++i) 667 { 668 tree cst; 669 670 if (i == 0) 671 cst = build_int_cstu (mask_inner_type, element_count); 672 else 673 cst = build_int_cstu (mask_inner_type, i); 674 CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst); 675 } 676 tree mask_vec_type = build_vector_type (mask_inner_type, element_count); 677 tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals); 678 679 tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output"); 680 tree assign 681 = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr); 682 m_parent.m_cf->append_statement (assign); 683 684 instr_expr 685 = build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask); 686 687 tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output"); 688 tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), 689 lower_output, instr_expr); 690 m_parent.m_cf->append_statement (assign_lower); 691 instr_expr = lower_output; 692 } 693 694 if (output_count == 1) 695 build_output_assignment (*brig_inst, operands[0], instr_expr); 696 else 697 m_parent.m_cf->append_statement (instr_expr); 698 return base->byteCount; 699 } 700 701 /* Create an expression that broadcasts the lowest element of the 702 vector in VEC_OPERAND to all elements of the returned vector. */ 703 704 tree 705 brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand) 706 { 707 /* Build the broadcast using shuffle because there's no 708 direct broadcast in GENERIC and this way there's no need for 709 a separate extract of the lowest element. 
*/ 710 tree element_type = TREE_TYPE (TREE_TYPE (vec_operand)); 711 size_t esize = 8 * int_size_in_bytes (element_type); 712 713 size_t element_count 714 = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand)); 715 tree mask_inner_type = build_nonstandard_integer_type (esize, 1); 716 vec<constructor_elt, va_gc> *constructor_vals = NULL; 717 718 /* Construct the mask. */ 719 for (size_t i = 0; i < element_count; ++i) 720 { 721 tree cst = build_int_cstu (mask_inner_type, element_count); 722 CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst); 723 } 724 tree mask_vec_type = build_vector_type (mask_inner_type, element_count); 725 tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals); 726 727 return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand, 728 vec_operand, mask); 729 } 730 731 /* Returns the tree code that should be used to implement the given 732 HSA instruction opcode (BRIG_OPCODE) for the given type of instruction 733 (BRIG_TYPE). In case the opcode cannot be mapped to a TREE node directly, 734 returns TREE_LIST (if it can be emulated with a simple chain of tree 735 nodes) or CALL_EXPR if the opcode should be implemented using a builtin 736 call. 
*/

tree_code
brig_basic_inst_handler::get_tree_code_for_hsa_opcode
  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
{
  /* The element type, taken before BRIG_TYPE is possibly remapped
     below.  */
  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  switch (brig_opcode)
    {
    /* Opcodes that map directly to a tree code.  */
    case BRIG_OPCODE_NOP:
      return NOP_EXPR;
    case BRIG_OPCODE_ADD:
      return PLUS_EXPR;
    case BRIG_OPCODE_SUB:
      return MINUS_EXPR;
    case BRIG_OPCODE_MUL:
    case BRIG_OPCODE_MUL24:
      return MULT_EXPR;
    case BRIG_OPCODE_MULHI:
    case BRIG_OPCODE_MUL24HI:
      return MULT_HIGHPART_EXPR;
    case BRIG_OPCODE_NEG:
      return NEGATE_EXPR;
    case BRIG_OPCODE_FMA:
      return FMA_EXPR;
    case BRIG_OPCODE_ABS:
      return ABS_EXPR;
    case BRIG_OPCODE_SHL:
      return LSHIFT_EXPR;
    case BRIG_OPCODE_SHR:
      return RSHIFT_EXPR;
    case BRIG_OPCODE_OR:
      return BIT_IOR_EXPR;
    case BRIG_OPCODE_XOR:
      return BIT_XOR_EXPR;
    case BRIG_OPCODE_AND:
      return BIT_AND_EXPR;
    case BRIG_OPCODE_NOT:
      return BIT_NOT_EXPR;
    case BRIG_OPCODE_RET:
      return RETURN_EXPR;
    case BRIG_OPCODE_MOV:
    case BRIG_OPCODE_LDF:
      return MODIFY_EXPR;
    case BRIG_OPCODE_LD:
    case BRIG_OPCODE_ST:
      return MEM_REF;
    case BRIG_OPCODE_BR:
      return GOTO_EXPR;

    /* Opcodes whose mapping depends on the instruction type.  */
    case BRIG_OPCODE_CMOV:
      /* The type equals its base type only when it is not a packed
         one.  */
      return brig_inner_type == brig_type ? COND_EXPR : VEC_COND_EXPR;
    case BRIG_OPCODE_DIV:
      return gccbrig_is_float_type (brig_inner_type)
             ? RDIV_EXPR : TRUNC_DIV_EXPR;
    case BRIG_OPCODE_MIN:
      return gccbrig_is_float_type (brig_inner_type) ? CALL_EXPR : MIN_EXPR;
    case BRIG_OPCODE_MAX:
      return gccbrig_is_float_type (brig_inner_type) ? CALL_EXPR : MAX_EXPR;
    case BRIG_OPCODE_REM:
      return (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
             ? TRUNC_MOD_EXPR : CALL_EXPR;

    case BRIG_OPCODE_NRCP:
    case BRIG_OPCODE_NRSQRT:
      /* Implement as 1/f (x).  gcc should pattern detect that and use a
         native instruction, if available, for it.  */
      return TREE_LIST;

    /* Opcodes that are always implemented with a builtin call.  */
    case BRIG_OPCODE_FLOOR:
    case BRIG_OPCODE_CEIL:
    case BRIG_OPCODE_SQRT:
    case BRIG_OPCODE_NSQRT:
    case BRIG_OPCODE_RINT:
    case BRIG_OPCODE_TRUNC:
    case BRIG_OPCODE_POPCOUNT:
    case BRIG_OPCODE_COPYSIGN:
    case BRIG_OPCODE_NCOS:
    case BRIG_OPCODE_NSIN:
    case BRIG_OPCODE_NLOG2:
    case BRIG_OPCODE_NEXP2:
    case BRIG_OPCODE_NFMA:
    /* Class has type B1 regardless of the float type, thus the builtin
       map search after the switch cannot find it.  */
    case BRIG_OPCODE_CLASS:
    case BRIG_OPCODE_WORKITEMABSID:
      return CALL_EXPR;

    case BRIG_OPCODE_BORROW:
    case BRIG_OPCODE_CARRY:
    case BRIG_OPCODE_LASTBIT:
    case BRIG_OPCODE_BITINSERT:
      /* Some BRIG opcodes can use the same builtins for unsigned and
         signed types.  Force these cases to unsigned types before the
         builtin map search.  */
      switch (brig_type)
        {
        case BRIG_TYPE_S32:
          brig_type = BRIG_TYPE_U32;
          break;
        case BRIG_TYPE_S64:
          brig_type = BRIG_TYPE_U64;
          break;
        default:
          break;
        }
      break;

    default:
      break;
    }

  /* Everything else is a builtin call if the custom builtin table has an
     entry for the opcode with the full type or with the element type.  */
  if (s_custom_builtins.find (std::make_pair (brig_opcode, brig_type))
      != s_custom_builtins.end ())
    return CALL_EXPR;

  if (s_custom_builtins.find (std::make_pair (brig_opcode, brig_inner_type))
      != s_custom_builtins.end ())
    return CALL_EXPR;

  /* An f16 operation also matches an entry registered for f32.  */
  if (brig_inner_type == BRIG_TYPE_F16
      && s_custom_builtins.find (std::make_pair (brig_opcode, BRIG_TYPE_F32))
         != s_custom_builtins.end ())
    return CALL_EXPR;

  return TREE_LIST; /* Emulate using a chain of nodes.  */
}