1/* brig-basic-inst-handler.cc -- brig basic instruction handling
2   Copyright (C) 2016-2020 Free Software Foundation, Inc.
3   Contributed by Pekka Jaaskelainen <pekka.jaaskelainen@parmance.com>
4   for General Processor Tech.
5
6   This file is part of GCC.
7
8   GCC is free software; you can redistribute it and/or modify it under
9   the terms of the GNU General Public License as published by the Free
10   Software Foundation; either version 3, or (at your option) any later
11   version.
12
13   GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14   WARRANTY; without even the implied warranty of MERCHANTABILITY or
15   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16   for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with GCC; see the file COPYING3.  If not see
20   <http://www.gnu.org/licenses/>.  */
21
22#include <sstream>
23
24#include "brig-code-entry-handler.h"
25#include "brig-util.h"
26
27#include "errors.h"
28#include "gimple-expr.h"
29#include "convert.h"
30#include "print-tree.h"
31#include "tree-pretty-print.h"
32#include "langhooks.h"
33#include "stor-layout.h"
34#include "diagnostic-core.h"
35#include "brig-builtins.h"
36#include "fold-const.h"
37
/* Constructs the handler for basic HSAIL instructions.  All shared
   state lives in the PARENT brig_to_generic instance; this handler
   itself is stateless.  */

brig_basic_inst_handler::brig_basic_inst_handler (brig_to_generic &parent)
  : brig_code_entry_handler (parent)
{
}
42
/* Visitor that implements a packed saturating-arithmetic instruction
   by scalarizing it: the constructor resolves the scalar built-in
   matching the instruction's opcode and element type, and
   visit_element emits one call to it per element pair.  */

class scalarized_sat_arithmetics : public tree_element_binary_visitor
{
public:
  /* Selects the saturating-arithmetic built-in for BRIG_INST by
     matching its opcode and base element type against the
     DEF_HSAIL_SAT_BUILTIN entries in brig-builtins.def.  Aborts if
     no entry matches.  */
  scalarized_sat_arithmetics (const BrigInstBase &brig_inst)
    : m_brig_inst (brig_inst)
  {
    BrigType16_t element_type = brig_inst.type & BRIG_TYPE_BASE_MASK;

    /* Undefine all brig-builtins.def iteration macros so that only
       the saturating built-ins expand below; the include then
       produces an if/else chain that assigns m_builtin, terminated
       by the gcc_unreachable for unmatched combinations.  */
#undef DEF_HSAIL_SAT_BUILTIN
#undef DEF_HSAIL_BUILTIN
#undef DEF_HSAIL_ATOMIC_BUILTIN
#undef DEF_HSAIL_INTR_BUILTIN
#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN

#define DEF_HSAIL_SAT_BUILTIN(ENUM, BRIG_OPCODE, HSAIL_TYPE,		\
			      NAME, TYPE, ATTRS)			\
    if (brig_inst.opcode == BRIG_OPCODE && element_type == HSAIL_TYPE)	\
      m_builtin = builtin_decl_explicit (ENUM);				\
    else
#include "brig-builtins.def"
      gcc_unreachable ();
  }

  /* Emits a call to the selected scalar built-in for one pair of
     vector elements OPERAND0 and OPERAND1, returning the call
     expression.  */
  virtual tree
  visit_element (brig_code_entry_handler &, tree operand0, tree operand1)
  {
    /* Implement saturating arithmetics with scalar built-ins for now.
       TODO: emit GENERIC nodes for the simplest cases or at least
       emit vector built-ins.  */
    return call_builtin (m_builtin, 2, TREE_TYPE (operand0),
			 TREE_TYPE (operand0), operand0,
			 TREE_TYPE (operand1), operand1);
  }
  /* The instruction being scalarized.  */
  const BrigInstBase &m_brig_inst;
  /* The scalar built-in resolved for the opcode/type combination.  */
  tree m_builtin;
};
79
80/* Implements a vector shuffle.  ARITH_TYPE is the type of the vector,
81   OPERANDS[0] is the first vector, OPERAND[1] the second vector and
82   OPERANDS[2] the shuffle mask in HSAIL format.  The output is a VEC_PERM_EXPR
83   that implements the shuffle as a GENERIC expression.  */
84
85tree
86brig_basic_inst_handler::build_shuffle (tree arith_type,
87					tree_stl_vec &operands)
88{
89  tree element_type
90    = get_unsigned_int_type (TREE_TYPE (TREE_TYPE (operands[0])));
91
92  /* Offsets to add to the mask values to convert from the
93     HSAIL mask to VEC_PERM_EXPR masks.  VEC_PERM_EXPR mask
94     assumes an index spanning from 0 to 2 times the vec
95     width while HSAIL refers separately to two different
96     input vectors, thus is not a "full shuffle" where all
97     output elements can originate from any input element.  */
98  vec<constructor_elt, va_gc> *mask_offset_vals = NULL;
99
100  unsigned int element_count = gccbrig_type_vector_subparts (arith_type);
101
102  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
103  size_t input_mask_element_size = exact_log2 (element_count);
104
105  /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
106     from which to construct the mask vector as understood by
107     VEC_PERM_EXPR.  */
108  tree mask_operand
109    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
110
111  tree mask_element_type
112    = build_nonstandard_integer_type (input_mask_element_size, true);
113
114  for (size_t i = 0; i < element_count; ++i)
115    {
116      tree mask_element
117	= build3 (BIT_FIELD_REF, mask_element_type, mask_operand,
118		  bitsize_int (input_mask_element_size),
119		  bitsize_int (i * input_mask_element_size));
120
121      mask_element = convert (element_type, mask_element);
122
123      tree offset;
124      if (i < element_count / 2)
125	offset = build_int_cst (element_type, 0);
126      else
127	offset = build_int_cst (element_type, element_count);
128
129      CONSTRUCTOR_APPEND_ELT (mask_offset_vals, NULL_TREE, offset);
130      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
131    }
132  tree mask_vec_type = build_vector_type (element_type, element_count);
133
134  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
135  tree offset_vec = build_constructor (mask_vec_type, mask_offset_vals);
136
137  tree mask = build2 (PLUS_EXPR, mask_vec_type, mask_vec, offset_vec);
138
139  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
140		      operands[1], mask);
141  return perm;
142}
143
144/* Unpacks (extracts) a scalar element with an index in OPERANDS[1]
145   from the vector expression in OPERANDS[0].  */
146
147tree
148brig_basic_inst_handler::build_unpack (tree_stl_vec &operands)
149{
150  /* Implement the unpack with a shuffle that stores the unpacked
151     element to the lowest bit positions in the dest.  After that
152     a bitwise AND is used to clear the uppermost bits.  */
153  tree src_element_type = TREE_TYPE (TREE_TYPE (operands[0]));
154
155  /* Perform the operations with a raw (unsigned int type) type.  */
156  tree element_type = get_unsigned_int_type (src_element_type);
157
158  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
159  vec<constructor_elt, va_gc> *and_mask_vals = NULL;
160
161  size_t element_count
162    = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
163  tree vec_type = build_vector_type (element_type, element_count);
164
165  for (size_t i = 0; i < element_count; ++i)
166    {
167      tree mask_element;
168      if (i == 0)
169	mask_element = convert (element_type, operands[1]);
170      else
171	mask_element = build_int_cst (element_type, 0);
172
173      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE, mask_element);
174
175      tree and_mask_element;
176      if (i == 0)
177	and_mask_element = build_int_cst (element_type, -1);
178      else
179	and_mask_element = build_int_cst (element_type, 0);
180      CONSTRUCTOR_APPEND_ELT (and_mask_vals, NULL_TREE, and_mask_element);
181    }
182
183  tree mask_vec = build_constructor (vec_type, input_mask_vals);
184
185  tree and_mask_vec = build_constructor (vec_type, and_mask_vals);
186
187  tree perm = build3 (VEC_PERM_EXPR, vec_type,
188		      build_resize_convert_view (vec_type, operands[0]),
189		      build_resize_convert_view (vec_type, operands[0]),
190		      mask_vec);
191
192  tree cleared = build2 (BIT_AND_EXPR, vec_type, perm, and_mask_vec);
193
194  size_t s = int_size_in_bytes (TREE_TYPE (cleared)) * BITS_PER_UNIT;
195  tree raw_type = build_nonstandard_integer_type (s, true);
196
197  tree as_int = build_resize_convert_view (raw_type, cleared);
198
199  if (int_size_in_bytes (src_element_type) < 4)
200    {
201      if (INTEGRAL_TYPE_P (src_element_type))
202	return extend_int (as_int, uint32_type_node, src_element_type);
203    }
204  return as_int;
205}
206
207/* Packs (inserts) a scalar element in OPERANDS[1]
208   to the vector in OPERANDS[0] at element position defined by
209   OPERANDS[2].  */
210
211tree
212brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
213{
214  /* Implement using a bit level insertion.
215     TODO: Reuse this for implementing 'bitinsert'
216     without a builtin call.  */
217
218  size_t ecount = gccbrig_type_vector_subparts (TREE_TYPE (operands[0]));
219  size_t vecsize = int_size_in_bytes (TREE_TYPE (operands[0])) * BITS_PER_UNIT;
220  tree wide_type = build_nonstandard_integer_type (vecsize, 1);
221
222  tree src_vect = build_resize_convert_view (wide_type, operands[0]);
223  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
224
225  tree scalar = operands[1];
226  scalar = m_parent.m_cf->add_temp_var ("scalar",
227					convert_to_integer (wide_type, scalar));
228
229  tree pos = operands[2];
230
231  /* The upper bits of the position can contain garbage.
232     Zero them for well-defined semantics.  */
233  tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
234		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
235  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
236
237  tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
238  size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
239  tree ewidth = build_int_cstu (wide_type, element_width);
240
241  tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
242  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
243
244  uint64_t mask_int
245    = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
246
247  tree mask = build_int_cstu (wide_type, mask_int);
248
249  mask = m_parent.m_cf->add_temp_var ("mask",
250				      convert_to_integer (wide_type, mask));
251
252  tree clearing_mask
253    = build1 (BIT_NOT_EXPR, wide_type,
254	      build2 (LSHIFT_EXPR, wide_type, mask, bitoffset));
255
256  tree zeroed_element
257    = build2 (BIT_AND_EXPR, wide_type, src_vect, clearing_mask);
258
259  /* TODO: Is the AND necessary: does HSA define what
260     happens if the upper bits in the inserted element are not
261     zero? */
262  tree element_in_position
263    = build2 (LSHIFT_EXPR, wide_type,
264	      build2 (BIT_AND_EXPR, wide_type, scalar, mask), bitoffset);
265
266  tree inserted
267    = build2 (BIT_IOR_EXPR, wide_type, zeroed_element, element_in_position);
268  return inserted;
269}
270
271/* Implement the unpack{lo,hi}.  BRIG_OPCODE should tell which one and
272   ARITH_TYPE describe the type of the vector arithmetics.
273   OPERANDS[0] and OPERANDS[1] are the input vectors.  */
274
275tree
276brig_basic_inst_handler::build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode,
277						tree arith_type,
278						tree_stl_vec &operands)
279{
280  tree element_type = get_unsigned_int_type (TREE_TYPE (arith_type));
281  tree mask_vec_type
282    = build_vector_type (element_type,
283			 gccbrig_type_vector_subparts (arith_type));
284
285  size_t element_count = gccbrig_type_vector_subparts (arith_type);
286  vec<constructor_elt, va_gc> *input_mask_vals = NULL;
287
288  size_t offset = (brig_opcode == BRIG_OPCODE_UNPACKLO) ? 0 : element_count / 2;
289
290  for (size_t i = 0; i < element_count / 2; ++i)
291    {
292      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
293			      build_int_cst (element_type, offset + i));
294      CONSTRUCTOR_APPEND_ELT (input_mask_vals, NULL_TREE,
295			      build_int_cst (element_type,
296					     offset + i + element_count));
297    }
298
299  tree mask_vec = build_constructor (mask_vec_type, input_mask_vals);
300
301  tree perm = build3 (VEC_PERM_EXPR, TREE_TYPE (operands[0]), operands[0],
302		      operands[1], mask_vec);
303  return perm;
304}
305
306/* Builds a basic instruction expression from a BRIG instruction.  BRIG_OPCODE
307   is the opcode, BRIG_TYPE the brig type of the instruction, ARITH_TYPE the
308   desired tree type for the instruction, and OPERANDS the instruction's
309   input operands already converted to tree nodes.  */
310
tree
brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
					  BrigType16_t brig_type,
					  tree arith_type,
					  tree_stl_vec &operands)
{
  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);

  BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;

  /* For packed instructions the per-element type, otherwise the
     arithmetic type itself.  */
  tree instr_inner_type
    = VECTOR_TYPE_P (arith_type) ? TREE_TYPE (arith_type) : arith_type;

  if (opcode == RSHIFT_EXPR || opcode == LSHIFT_EXPR)
    {
      /* HSA defines modulo/clipping behavior for shift amounts larger
	 than the bit width, while tree.def leaves it undefined.
	 We need to mask the upper bits to ensure the defined behavior.  */
      tree scalar_mask
	= build_int_cst (instr_inner_type,
			 gccbrig_hsa_type_bit_size (inner_type) - 1);

      tree mask = VECTOR_TYPE_P (arith_type)
		    ? build_vector_from_val (arith_type, scalar_mask)
		    : scalar_mask;

      /* The shift amount is a scalar, broadcast it to produce
	 a vector shift.  */
      if (VECTOR_TYPE_P (arith_type))
	operands[1] = build_vector_from_val (arith_type, operands[1]);
      operands[1] = build2 (BIT_AND_EXPR, arith_type, operands[1], mask);
    }

  size_t input_count = operands.size ();
  size_t output_count = gccbrig_hsa_opcode_op_output_p (brig_opcode, 0) ?
    1 : 0;

  /* TREE_LIST is the sentinel returned when there is no direct
     GENERIC opcode for the instruction; such cases are emulated
     below with a chain of GENERIC nodes or helper builders.  */
  if (opcode == TREE_LIST)
    {
      /* There was no direct GENERIC opcode for the instruction;
	 try to emulate it with a chain of GENERIC nodes.  */
      if (brig_opcode == BRIG_OPCODE_MAD || brig_opcode == BRIG_OPCODE_MAD24)
	{
	  /* There doesn't seem to be a "standard" MAD built-in in gcc so let's
	     use a chain of multiply + add for now (double rounding method).
	     It should be easier for optimizers than a custom built-in call
	     WIDEN_MULT_EXPR is close, but requires a double size result
	     type.  */
	  tree mult_res
	    = build2 (MULT_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_MAD24HI)
	{
	  /* mad24hi: multiply high-part followed by an add.  */
	  tree mult_res
	    = build2 (MULT_HIGHPART_EXPR, arith_type, operands[0], operands[1]);
	  return build2 (PLUS_EXPR, arith_type, mult_res, operands[2]);
	}
      else if (brig_opcode == BRIG_OPCODE_SHUFFLE)
	{
	  return build_shuffle (arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACKLO
	       || brig_opcode == BRIG_OPCODE_UNPACKHI)
	{
	  return build_unpack_lo_or_hi (brig_opcode, arith_type, operands);
	}
      else if (brig_opcode == BRIG_OPCODE_UNPACK)
	{
	  return build_unpack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_PACK)
	{
	  return build_pack (operands);
	}
      else if (brig_opcode == BRIG_OPCODE_NRSQRT)
	{
	  /* Implement as 1.0/sqrt (x) and assume gcc instruction selects to
	     native ISA other than a division, if available.
	     TODO: this will happen only with unsafe math optimizations
	     on which cannot be used in general to remain HSAIL compliant.
	     Perhaps a builtin call would be better option here.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 m_parent.m_cf->expand_or_call_builtin
			 (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
	}
      else if (brig_opcode == BRIG_OPCODE_NRCP)
	{
	  /* Implement as 1.0/x and assume gcc instruction selects to
	     native ISA other than a division, if available.  */
	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
			 operands[0]);
	}
      else if (brig_opcode == BRIG_OPCODE_LANEID
	       || brig_opcode == BRIG_OPCODE_MAXWAVEID
	       || brig_opcode == BRIG_OPCODE_WAVEID)
	{
	  /* Assuming WAVESIZE 1 (for now), therefore LANEID, WAVEID and
	     MAXWAVEID always return 0.  */
	  return build_zero_cst (arith_type);
	}
      else
	gcc_unreachable ();
    }
  else if (opcode == CALL_EXPR)
    /* The instruction maps to a builtin (or its expansion).  */
    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
						  arith_type, operands);
  else if (output_count == 1)
    {
      /* A direct GENERIC opcode exists; build the node with the
	 arity matching the input operand count.  */
      if (input_count == 1)
	{
	  if (opcode == MODIFY_EXPR)
	    /* A plain move: the input is used as the value as-is.  */
	    return operands[0];
	  else
	    return build1 (opcode, arith_type, operands[0]);
	}
      else if (input_count == 2)
	return build2 (opcode, arith_type, operands[0], operands[1]);
      else if (input_count == 3)
	return build3 (opcode, arith_type, operands[0], operands[1],
		       operands[2]);
      else
	gcc_unreachable ();
    }
  else
    gcc_unreachable ();

  return NULL_TREE;
}
441
442/* Handles the basic instructions, including packed instructions. Deals
443   with the different packing modes by unpacking/packing the wanted
444   elements.  Delegates most of the instruction cases to build_inst_expr(). */
445
size_t
brig_basic_inst_handler::operator () (const BrigBase *base)
{
  const BrigInstBase *brig_inst = (const BrigInstBase *) base;
  if (brig_inst->opcode == BRIG_OPCODE_NOP)
    return base->byteCount;

  tree_stl_vec operands = build_operands (*brig_inst);

  /* Operand 0 is the output for opcodes that produce one; the rest
     are inputs.  */
  size_t output_count
    = gccbrig_hsa_opcode_op_output_p (brig_inst->opcode, 0) ? 1 : 0;
  size_t input_count
    = operands.size () == 0 ? 0 : (operands.size () - output_count);

  gcc_assert (output_count == 0 || output_count == 1);

  /* Collect the input operands, skipping the output (if any).  */
  tree_stl_vec::iterator first_input_i = operands.begin ();
  if (output_count > 0 && operands.size () > 0)
    ++first_input_i;

  tree_stl_vec in_operands;
  in_operands.assign (first_input_i, operands.end ());

  BrigType16_t brig_inst_type = brig_inst->type;

  if (brig_inst->opcode == BRIG_OPCODE_FIRSTBIT
      || brig_inst->opcode == BRIG_OPCODE_LASTBIT
      || brig_inst->opcode == BRIG_OPCODE_SAD)
    /* These instructions are reported to be always 32b in HSAIL, but we want
       to treat them according to their input argument's type to select the
       correct instruction/builtin.  */
    brig_inst_type
      = gccbrig_tree_type_to_hsa_type (TREE_TYPE (in_operands[0]));

  tree instr_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);

  if (!instr_type)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  bool is_vec_instr = hsa_type_packed_p (brig_inst_type);

  size_t element_size_bits;
  size_t element_count;

  if (is_vec_instr)
    {
      BrigType16_t brig_element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      element_size_bits = gccbrig_hsa_type_bit_size (brig_element_type);
      element_count = gccbrig_hsa_type_bit_size (brig_inst_type)
	/ gccbrig_hsa_type_bit_size (brig_element_type);
    }
  else
    {
      element_size_bits = gccbrig_hsa_type_bit_size (brig_inst_type);
      element_count = 1;
    }

  /* The actual arithmetics type that should be performed with the
     operation.  This is not always the same as the original BRIG
     opcode's type due to implicit conversions of storage-only f16.  */
  tree arith_type = gccbrig_is_bit_operation (brig_inst->opcode)
		      ? gccbrig_tree_type_for_hsa_type (brig_inst_type)
		      : get_tree_expr_type_for_hsa_type (brig_inst_type);

  tree instr_expr = NULL_TREE;

  /* The packing mode is carried only by InstMod/InstCmp records.  */
  BrigPack8_t p = BRIG_PACK_NONE;
  if (brig_inst->base.kind == BRIG_KIND_INST_MOD)
    p = ((const BrigInstMod *) brig_inst)->pack;
  else if (brig_inst->base.kind == BRIG_KIND_INST_CMP)
    p = ((const BrigInstCmp *) brig_inst)->pack;

  /* For the mixed packed/scalar modes, broadcast the scalar side's
     lowest element so both inputs become full vectors.  */
  if (p == BRIG_PACK_PS || p == BRIG_PACK_PSSAT)
    in_operands[1] = build_lower_element_broadcast (in_operands[1]);
  else if (p == BRIG_PACK_SP || p == BRIG_PACK_SPSAT)
    in_operands[0] = build_lower_element_broadcast (in_operands[0]);

  tree_code opcode
    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
						   brig_inst_type);

  if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
    {
      /* Saturating packed modes are scalarized to per-element
	 built-in calls.  */
      scalarized_sat_arithmetics sat_arith (*brig_inst);
      gcc_assert (input_count == 2);
      instr_expr = sat_arith (*this, in_operands[0], in_operands[1]);
    }
  else if (opcode == RETURN_EXPR)
    {
      /* In kernels a 'ret' jumps to the common exit label instead of
	 emitting a function return.  */
      if (m_parent.m_cf->m_is_kernel)
	{
	  tree goto_stmt
	    = build1 (GOTO_EXPR, void_type_node, m_parent.m_cf->m_exit_label);
	  m_parent.m_cf->append_statement (goto_stmt);
	  return base->byteCount;
	}
      else
	{
	  m_parent.m_cf->append_return_stmt ();
	  return base->byteCount;
	}
    }
  else if (opcode == MULT_HIGHPART_EXPR &&
	   is_vec_instr && element_size_bits < 64)
    {
      /* MULT_HIGHPART_EXPR works only on target dependent vector sizes and
	 even the scalars do not seem to work at least for char elements.

	 Let's fall back to scalarization and promotion of the vector elements
	 to larger types with the MULHI computed as a regular MUL.
	 MULHI for 2x64b seems to work with the Intel CPUs I've tested so
	 that is passed on for vector processing so there is no need for
	 128b scalar arithmetics.

	 This is not modular as these type of things do not belong to the
	 frontend, there should be a legalization phase before the backend
	 that figures out the best way to compute the MULHI for any
	 integer vector datatype.

	 TODO: promote to larger vector types instead.  For example
	 MULT_HIGHPART_EXPR with s8x8 doesn't work, but s16x8 seems to at least
	 with my x86-64.
      */
      tree_stl_vec operand0_elements;
      if (input_count > 0)
	m_parent.m_cf->unpack (in_operands[0], operand0_elements);

      tree_stl_vec operand1_elements;
      if (input_count > 1)
	m_parent.m_cf->unpack (in_operands[1], operand1_elements);

      tree_stl_vec result_elements;

      tree scalar_type = TREE_TYPE (arith_type);
      BrigType16_t element_type = brig_inst_type & BRIG_TYPE_BASE_MASK;
      /* Pick the next wider integer type of matching signedness.  */
      tree promoted_type = short_integer_type_node;
      switch (element_type)
	{
	case BRIG_TYPE_S8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S16);
	  break;
	case BRIG_TYPE_U8:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U16);
	  break;
	case BRIG_TYPE_S16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S32);
	  break;
	case BRIG_TYPE_U16:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U32);
	  break;
	case BRIG_TYPE_S32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_S64);
	  break;
	case BRIG_TYPE_U32:
	  promoted_type = gccbrig_tree_type_for_hsa_type (BRIG_TYPE_U64);
	  break;
	default:
	  gcc_unreachable ();
	}

      size_t promoted_type_size = int_size_in_bytes (promoted_type) * 8;
      size_t element_count = gccbrig_type_vector_subparts (arith_type);
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree operand0 = convert (promoted_type, operand0_elements.at (i));
	  tree operand1 = convert (promoted_type, operand1_elements.at (i));

	  tree scalar_expr
	    = build2 (MULT_EXPR, promoted_type, operand0, operand1);

	  /* The high part of the original-width product is the
	     upper half of the promoted-width product.  */
	  scalar_expr
	    = build2 (RSHIFT_EXPR, promoted_type, scalar_expr,
		      build_int_cstu (promoted_type, promoted_type_size / 2));

	  result_elements.push_back (convert (scalar_type, scalar_expr));
	}
      instr_expr = m_parent.m_cf->pack (result_elements);
    }
  else
    {
      /* 'class' is always of b1 type, let's consider it by its
	 float type when building the instruction to find the
	 correct builtin.  */
      if (brig_inst->opcode == BRIG_OPCODE_CLASS)
	brig_inst_type = ((const BrigInstSourceType *) base)->sourceType;
      instr_expr = build_inst_expr (brig_inst->opcode, brig_inst_type,
				     arith_type, in_operands);
    }

  if (instr_expr == NULL_TREE)
    {
      gcc_unreachable ();
      return base->byteCount;
    }

  if (p == BRIG_PACK_SS || p == BRIG_PACK_S || p == BRIG_PACK_SSSAT
      || p == BRIG_PACK_SSAT)
    {
      /* In case of _s_ or _ss_, select only the lowest element
	 from the new input to the output.  We could extract
	 the element and use a scalar operation, but try
	 to keep data in vector registers as much as possible
	 to avoid copies between scalar and vector datapaths.  */
      tree old_value;
      tree half_storage_type = gccbrig_tree_type_for_hsa_type (brig_inst_type);
      bool is_fp16_operation
	= (brig_inst_type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
	&& !gccbrig_is_bit_operation (brig_inst->opcode);

      /* f16 arithmetic is done in f32, so the old destination value
	 has to be widened to match before the blend.  */
      if (is_fp16_operation)
	old_value = build_h2f_conversion
	  (build_resize_convert_view (half_storage_type, operands[0]));
      else
	old_value
	  = build_resize_convert_view (TREE_TYPE (instr_expr), operands[0]);

      size_t esize = is_fp16_operation ? 32 : element_size_bits;

      /* Construct a permutation mask where other elements than the lowest one
	 is picked from the old_value.  */
      tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
      vec<constructor_elt, va_gc> *constructor_vals = NULL;
      for (size_t i = 0; i < element_count; ++i)
	{
	  tree cst;

	  /* Index element_count refers to the lowest element of the
	     second VEC_PERM_EXPR input (the new value).  */
	  if (i == 0)
	    cst = build_int_cstu (mask_inner_type, element_count);
	  else
	    cst = build_int_cstu (mask_inner_type, i);
	  CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
	}
      tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
      tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);

      tree new_value = create_tmp_var (TREE_TYPE (instr_expr), "new_output");
      tree assign
	= build2 (MODIFY_EXPR, TREE_TYPE (instr_expr), new_value, instr_expr);
      m_parent.m_cf->append_statement (assign);

      instr_expr
	= build3 (VEC_PERM_EXPR, arith_type, old_value, new_value, mask);

      tree lower_output = create_tmp_var (TREE_TYPE (instr_expr), "s_output");
      tree assign_lower = build2 (MODIFY_EXPR, TREE_TYPE (instr_expr),
				  lower_output, instr_expr);
      m_parent.m_cf->append_statement (assign_lower);
      instr_expr = lower_output;
    }

  if (output_count == 1)
    build_output_assignment (*brig_inst, operands[0], instr_expr);
  else
    m_parent.m_cf->append_statement (instr_expr);
  return base->byteCount;
}
705
706/* Create an expression that broadcasts the lowest element of the
707   vector in VEC_OPERAND to all elements of the returned vector.  */
708
709tree
710brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
711{
712  /* Build the broadcast using shuffle because there's no
713     direct broadcast in GENERIC and this way there's no need for
714     a separate extract of the lowest element.  */
715  tree element_type = TREE_TYPE (TREE_TYPE (vec_operand));
716  size_t esize = 8 * int_size_in_bytes (element_type);
717
718  size_t element_count
719    = gccbrig_type_vector_subparts (TREE_TYPE (vec_operand));
720  tree mask_inner_type = build_nonstandard_integer_type (esize, 1);
721  vec<constructor_elt, va_gc> *constructor_vals = NULL;
722
723  /* Construct the mask.  */
724  for (size_t i = 0; i < element_count; ++i)
725    {
726      tree cst = build_int_cstu (mask_inner_type, element_count);
727      CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, cst);
728    }
729  tree mask_vec_type = build_vector_type (mask_inner_type, element_count);
730  tree mask = build_vector_from_ctor (mask_vec_type, constructor_vals);
731
732  return build3 (VEC_PERM_EXPR, TREE_TYPE (vec_operand), vec_operand,
733		 vec_operand, mask);
734}
735
736