/* Lower vector operations to scalar operations.
   Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 2, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING.  If not, write to the Free
Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301, USA.  */
20
21#include "config.h"
22#include "system.h"
23#include "coretypes.h"
24#include "tree.h"
25#include "tm.h"
26#include "rtl.h"
27#include "expr.h"
28#include "insn-codes.h"
29#include "diagnostic.h"
30#include "optabs.h"
31#include "machmode.h"
32#include "langhooks.h"
33#include "tree-flow.h"
34#include "tree-gimple.h"
35#include "tree-iterator.h"
36#include "tree-pass.h"
37#include "flags.h"
38#include "ggc.h"
39
40
/* Build a constant of type TYPE, made of VALUE's bits replicated
   every TYPE_SIZE (INNER_TYPE) bits to fit TYPE's precision.  */
static tree
build_replicated_const (tree type, tree inner_type, HOST_WIDE_INT value)
{
  /* WIDTH is the bit size of one replicated item; N is how many such
     items fit in one HOST_WIDE_INT.  */
  int width = tree_low_cst (TYPE_SIZE (inner_type), 1);
  int n = HOST_BITS_PER_WIDE_INT / width;
  unsigned HOST_WIDE_INT low, high, mask;
  tree ret;

  /* INNER_TYPE must be no wider than a HOST_WIDE_INT.  */
  gcc_assert (n);

  if (width == HOST_BITS_PER_WIDE_INT)
    low = value;
  else
    {
      /* ~0 / mask is a constant of the form 0x...010101 (one 1 bit per
	 WIDTH-bit group); multiplying by the masked value replicates it
	 into every group of LOW.  */
      mask = ((HOST_WIDE_INT)1 << width) - 1;
      low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask);
    }

  /* TYPE may occupy at most two HOST_WIDE_INTs; anything else is a
     caller error.  */
  if (TYPE_PRECISION (type) < HOST_BITS_PER_WIDE_INT)
    low &= ((HOST_WIDE_INT)1 << TYPE_PRECISION (type)) - 1, high = 0;
  else if (TYPE_PRECISION (type) == HOST_BITS_PER_WIDE_INT)
    high = 0;
  else if (TYPE_PRECISION (type) == 2 * HOST_BITS_PER_WIDE_INT)
    high = low;
  else
    gcc_unreachable ();

  ret = build_int_cst_wide (type, low, high);
  return ret;
}
73
/* One-entry cache for the word-mode vector types built below, so that
   repeated requests with the same unit count reuse the previous type.
   GTY(()) marks them as garbage-collector roots.  */
static GTY(()) tree vector_inner_type;
static GTY(()) tree vector_last_type;
static GTY(()) int vector_last_nunits;

/* Return a vector type made of NUNITS units each of mode
   "word_mode" (the global variable).  */
static tree
build_word_mode_vector_type (int nunits)
{
  if (!vector_inner_type)
    vector_inner_type = lang_hooks.types.type_for_mode (word_mode, 1);
  else if (vector_last_nunits == nunits)
    {
      /* Cache hit: the most recently built type had the same NUNITS.  */
      gcc_assert (TREE_CODE (vector_last_type) == VECTOR_TYPE);
      return vector_last_type;
    }

  /* We build a new type, but we canonicalize it nevertheless,
     because it still saves some memory.  */
  vector_last_nunits = nunits;
  vector_last_type = type_hash_canon (nunits,
				      build_vector_type (vector_inner_type,
							 nunits));
  return vector_last_type;
}
99
/* Type of the per-piece expander callbacks: (iterator, piece type,
   operand A, operand B, bit position, bit size, operation code).
   B is ignored by unary expanders.  */
typedef tree (*elem_op_func) (block_stmt_iterator *,
			      tree, tree, tree, tree, tree, enum tree_code);

/* Extract from T the piece of type TYPE located at BITPOS with width
   BITSIZE.  A NULL BITPOS means "all of T", which is reinterpreted with
   a VIEW_CONVERT_EXPR instead of a BIT_FIELD_REF.  */
static inline tree
tree_vec_extract (block_stmt_iterator *bsi, tree type,
		  tree t, tree bitsize, tree bitpos)
{
  if (bitpos)
    return gimplify_build3 (bsi, BIT_FIELD_REF, type, t, bitsize, bitpos);
  else
    return gimplify_build1 (bsi, VIEW_CONVERT_EXPR, type, t);
}
112
113static tree
114do_unop (block_stmt_iterator *bsi, tree inner_type, tree a,
115	 tree b ATTRIBUTE_UNUSED, tree bitpos, tree bitsize,
116	 enum tree_code code)
117{
118  a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
119  return gimplify_build1 (bsi, code, inner_type, a);
120}
121
122static tree
123do_binop (block_stmt_iterator *bsi, tree inner_type, tree a, tree b,
124	  tree bitpos, tree bitsize, enum tree_code code)
125{
126  a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
127  b = tree_vec_extract (bsi, inner_type, b, bitsize, bitpos);
128  return gimplify_build2 (bsi, code, inner_type, a, b);
129}
130
131/* Expand vector addition to scalars.  This does bit twiddling
132   in order to increase parallelism:
133
134   a + b = (((int) a & 0x7f7f7f7f) + ((int) b & 0x7f7f7f7f)) ^
135           (a ^ b) & 0x80808080
136
137   a - b =  (((int) a | 0x80808080) - ((int) b & 0x7f7f7f7f)) ^
138            (a ^ ~b) & 0x80808080
139
140   -b = (0x80808080 - ((int) b & 0x7f7f7f7f)) ^ (~b & 0x80808080)
141
142   This optimization should be done only if 4 vector items or more
143   fit into a word.  */
144static tree
145do_plus_minus (block_stmt_iterator *bsi, tree word_type, tree a, tree b,
146	       tree bitpos ATTRIBUTE_UNUSED, tree bitsize ATTRIBUTE_UNUSED,
147	       enum tree_code code)
148{
149  tree inner_type = TREE_TYPE (TREE_TYPE (a));
150  unsigned HOST_WIDE_INT max;
151  tree low_bits, high_bits, a_low, b_low, result_low, signs;
152
153  max = GET_MODE_MASK (TYPE_MODE (inner_type));
154  low_bits = build_replicated_const (word_type, inner_type, max >> 1);
155  high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
156
157  a = tree_vec_extract (bsi, word_type, a, bitsize, bitpos);
158  b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
159
160  signs = gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, a, b);
161  b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
162  if (code == PLUS_EXPR)
163    a_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, a, low_bits);
164  else
165    {
166      a_low = gimplify_build2 (bsi, BIT_IOR_EXPR, word_type, a, high_bits);
167      signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, signs);
168    }
169
170  signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
171  result_low = gimplify_build2 (bsi, code, word_type, a_low, b_low);
172  return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
173}
174
175static tree
176do_negate (block_stmt_iterator *bsi, tree word_type, tree b,
177	   tree unused ATTRIBUTE_UNUSED, tree bitpos ATTRIBUTE_UNUSED,
178	   tree bitsize ATTRIBUTE_UNUSED,
179	   enum tree_code code ATTRIBUTE_UNUSED)
180{
181  tree inner_type = TREE_TYPE (TREE_TYPE (b));
182  HOST_WIDE_INT max;
183  tree low_bits, high_bits, b_low, result_low, signs;
184
185  max = GET_MODE_MASK (TYPE_MODE (inner_type));
186  low_bits = build_replicated_const (word_type, inner_type, max >> 1);
187  high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
188
189  b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
190
191  b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
192  signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, b);
193  signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
194  result_low = gimplify_build2 (bsi, MINUS_EXPR, word_type, high_bits, b_low);
195  return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
196}
197
/* Expand a vector operation to scalars, by using many operations
   whose type is the vector type's inner type.  Applies F to each
   INNER_TYPE-sized piece of A (and B, for binary CODEs) and gathers the
   results into a CONSTRUCTOR of TYPE.  */
static tree
expand_vector_piecewise (block_stmt_iterator *bsi, elem_op_func f,
			 tree type, tree inner_type,
			 tree a, tree b, enum tree_code code)
{
  VEC(constructor_elt,gc) *v;
  tree part_width = TYPE_SIZE (inner_type);
  tree index = bitsize_int (0);
  int nunits = TYPE_VECTOR_SUBPARTS (type);
  /* How many elements of TYPE one INNER_TYPE piece covers; INNER_TYPE
     may be wider than TYPE's element (e.g. word-at-a-time expansion).  */
  int delta = tree_low_cst (part_width, 1)
	      / tree_low_cst (TYPE_SIZE (TREE_TYPE (type)), 1);
  int i;

  v = VEC_alloc(constructor_elt, gc, (nunits + delta - 1) / delta);
  /* INDEX advances by PART_WIDTH bits per iteration, tracking the bit
     position of the current piece.  */
  for (i = 0; i < nunits;
       i += delta, index = int_const_binop (PLUS_EXPR, index, part_width, 0))
    {
      tree result = f (bsi, inner_type, a, b, index, part_width, code);
      constructor_elt *ce = VEC_quick_push (constructor_elt, v, NULL);
      ce->index = NULL_TREE;
      ce->value = result;
    }

  return build_constructor (type, v);
}
225
/* Expand a vector operation to scalars with the freedom to use
   a scalar integer type, or to use a different size for the items
   in the vector type.  */
static tree
expand_vector_parallel (block_stmt_iterator *bsi, elem_op_func f, tree type,
			tree a, tree b,
			enum tree_code code)
{
  tree result, compute_type;
  enum machine_mode mode;
  int n_words = tree_low_cst (TYPE_SIZE_UNIT (type), 1) / UNITS_PER_WORD;

  /* We have three strategies.  If the type is already correct, just do
     the operation an element at a time.  Else, if the vector is wider than
     one word, do it a word at a time; finally, if the vector is smaller
     than one word, do it as a scalar.  */
  if (TYPE_MODE (TREE_TYPE (type)) == word_mode)
     return expand_vector_piecewise (bsi, f,
				     type, TREE_TYPE (type),
				     a, b, code);
  else if (n_words > 1)
    {
      /* Reinterpret the vector as N_WORDS word-mode units and process
	 one word at a time.  */
      tree word_type = build_word_mode_vector_type (n_words);
      result = expand_vector_piecewise (bsi, f,
				        word_type, TREE_TYPE (word_type),
					a, b, code);
      result = gimplify_val (bsi, word_type, result);
    }
  else
    {
      /* Use a single scalar operation with a mode no wider than word_mode.  */
      mode = mode_for_size (tree_low_cst (TYPE_SIZE (type), 1), MODE_INT, 0);
      compute_type = lang_hooks.types.type_for_mode (mode, 1);
      result = f (bsi, compute_type, a, b, NULL_TREE, NULL_TREE, code);
    }

  return result;
}
264
265/* Expand a vector operation to scalars; for integer types we can use
266   special bit twiddling tricks to do the sums a word at a time, using
267   function F_PARALLEL instead of F.  These tricks are done only if
268   they can process at least four items, that is, only if the vector
269   holds at least four items and if a word can hold four items.  */
270static tree
271expand_vector_addition (block_stmt_iterator *bsi,
272			elem_op_func f, elem_op_func f_parallel,
273			tree type, tree a, tree b, enum tree_code code)
274{
275  int parts_per_word = UNITS_PER_WORD
276	  	       / tree_low_cst (TYPE_SIZE_UNIT (TREE_TYPE (type)), 1);
277
278  if (INTEGRAL_TYPE_P (TREE_TYPE (type))
279      && parts_per_word >= 4
280      && TYPE_VECTOR_SUBPARTS (type) >= 4)
281    return expand_vector_parallel (bsi, f_parallel,
282				   type, a, b, code);
283  else
284    return expand_vector_piecewise (bsi, f,
285				    type, TREE_TYPE (type),
286				    a, b, code);
287}
288
/* Expand the vector expression RHS, whose operation is CODE and whose
   result has vector type TYPE, into operations on COMPUTE_TYPE.  Uses
   the word-parallel bit-twiddling expanders where profitable, otherwise
   falls back to a piecewise element-by-element expansion.  */
static tree
expand_vector_operation (block_stmt_iterator *bsi, tree type, tree compute_type,
			 tree rhs, enum tree_code code)
{
  enum machine_mode compute_mode = TYPE_MODE (compute_type);

  /* If the compute mode is not a vector mode (hence we are not decomposing
     a BLKmode vector to smaller, hardware-supported vectors), we may want
     to expand the operations in parallel.  */
  if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT)
    switch (code)
      {
      case PLUS_EXPR:
      case MINUS_EXPR:
	/* The bit-twiddling tricks assume wraparound; skip them if
	   overflow traps.  */
        if (!TYPE_OVERFLOW_TRAPS (type))
          return expand_vector_addition (bsi, do_binop, do_plus_minus, type,
		      		         TREE_OPERAND (rhs, 0),
					 TREE_OPERAND (rhs, 1), code);
	break;

      case NEGATE_EXPR:
        if (!TYPE_OVERFLOW_TRAPS (type))
          return expand_vector_addition (bsi, do_unop, do_negate, type,
		      		         TREE_OPERAND (rhs, 0),
					 NULL_TREE, code);
	break;

      case BIT_AND_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
	/* Bitwise operations never carry between items, so they can
	   always be done a whole word at a time.  */
        return expand_vector_parallel (bsi, do_binop, type,
		      		       TREE_OPERAND (rhs, 0),
				       TREE_OPERAND (rhs, 1), code);

      case BIT_NOT_EXPR:
        return expand_vector_parallel (bsi, do_unop, type,
		      		       TREE_OPERAND (rhs, 0),
				       NULL_TREE, code);

      default:
	break;
      }

  /* Fallback: expand one COMPUTE_TYPE-sized piece at a time.  */
  if (TREE_CODE_CLASS (code) == tcc_unary)
    return expand_vector_piecewise (bsi, do_unop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    NULL_TREE, code);
  else
    return expand_vector_piecewise (bsi, do_binop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    TREE_OPERAND (rhs, 1), code);
}
342
343/* Return a type for the widest vector mode whose components are of mode
344   INNER_MODE, or NULL_TREE if none is found.  */
345static tree
346type_for_widest_vector_mode (enum machine_mode inner_mode, optab op)
347{
348  enum machine_mode best_mode = VOIDmode, mode;
349  int best_nunits = 0;
350
351  if (SCALAR_FLOAT_MODE_P (inner_mode))
352    mode = MIN_MODE_VECTOR_FLOAT;
353  else
354    mode = MIN_MODE_VECTOR_INT;
355
356  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
357    if (GET_MODE_INNER (mode) == inner_mode
358        && GET_MODE_NUNITS (mode) > best_nunits
359	&& op->handlers[mode].insn_code != CODE_FOR_nothing)
360      best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);
361
362  if (best_mode == VOIDmode)
363    return NULL_TREE;
364  else
365    return lang_hooks.types.type_for_mode (best_mode, 1);
366}
367
/* Process one statement.  If we identify a vector operation, expand it.
   Rewrites the RHS of a MODIFY_EXPR (possibly inside a RETURN_EXPR) in
   place and marks the statement modified.  */

static void
expand_vector_operations_1 (block_stmt_iterator *bsi)
{
  tree stmt = bsi_stmt (*bsi);
  tree *p_lhs, *p_rhs, lhs, rhs, type, compute_type;
  enum tree_code code;
  enum machine_mode compute_mode;
  optab op;

  switch (TREE_CODE (stmt))
    {
    case RETURN_EXPR:
      /* Look through "return (x = expr)"; anything else returned is not
	 a vector computation we can expand here.  */
      stmt = TREE_OPERAND (stmt, 0);
      if (!stmt || TREE_CODE (stmt) != MODIFY_EXPR)
	return;

      /* FALLTHRU */

    case MODIFY_EXPR:
      p_lhs = &TREE_OPERAND (stmt, 0);
      p_rhs = &TREE_OPERAND (stmt, 1);
      lhs = *p_lhs;
      rhs = *p_rhs;
      break;

    default:
      return;
    }

  /* Only unary/binary operations on vector types are candidates.  */
  type = TREE_TYPE (rhs);
  if (TREE_CODE (type) != VECTOR_TYPE)
    return;

  code = TREE_CODE (rhs);
  if (TREE_CODE_CLASS (code) != tcc_unary
      && TREE_CODE_CLASS (code) != tcc_binary)
    return;

  if (code == NOP_EXPR || code == VIEW_CONVERT_EXPR)
    return;

  gcc_assert (code != CONVERT_EXPR);
  op = optab_for_tree_code (code, type);

  /* For widening vector operations, the relevant type is of the arguments,
     not the widened result.  */
  if (code == WIDEN_SUM_EXPR)
    type = TREE_TYPE (TREE_OPERAND (rhs, 0));

  /* Optabs will try converting a negation into a subtraction, so
     look for it as well.  TODO: negation of floating-point vectors
     might be turned into an exclusive OR toggling the sign bit.  */
  if (op == NULL
      && code == NEGATE_EXPR
      && INTEGRAL_TYPE_P (TREE_TYPE (type)))
    op = optab_for_tree_code (MINUS_EXPR, type);

  /* For very wide vectors, try using a smaller vector mode.  */
  compute_type = type;
  if (TYPE_MODE (type) == BLKmode && op)
    {
      tree vector_compute_type
        = type_for_widest_vector_mode (TYPE_MODE (TREE_TYPE (type)), op);
      if (vector_compute_type != NULL_TREE)
        compute_type = vector_compute_type;
    }

  /* If we are breaking a BLKmode vector into smaller pieces,
     type_for_widest_vector_mode has already looked into the optab,
     so skip these checks.  */
  if (compute_type == type)
    {
      compute_mode = TYPE_MODE (compute_type);
      if ((GET_MODE_CLASS (compute_mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_FLOAT)
          && op != NULL
	  && op->handlers[compute_mode].insn_code != CODE_FOR_nothing)
	return;
      else
	/* There is no operation in hardware, so fall back to scalars.  */
	compute_type = TREE_TYPE (type);
    }

  gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
  rhs = expand_vector_operation (bsi, type, compute_type, rhs, code);
  /* If the expansion's type differs from the LHS type, reinterpret it
     rather than storing an incompatibly-typed value.  */
  if (lang_hooks.types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
    *p_rhs = rhs;
  else
    *p_rhs = gimplify_build1 (bsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), rhs);

  mark_stmt_modified (bsi_stmt (*bsi));
}
462
/* Use this to lower vector operations introduced by the vectorizer,
   if it may need the bit-twiddling tricks implemented in this file.
   Gate for pass_lower_vector_ssa: run only when the tree vectorizer
   is enabled.  */

static bool
gate_expand_vector_operations (void)
{
  return flag_tree_vectorize != 0;
}
471
472static unsigned int
473expand_vector_operations (void)
474{
475  block_stmt_iterator bsi;
476  basic_block bb;
477
478  FOR_EACH_BB (bb)
479    {
480      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
481	{
482	  expand_vector_operations_1 (&bsi);
483	  update_stmt_if_modified (bsi_stmt (bsi));
484	}
485    }
486  return 0;
487}
488
/* Pass descriptor for the unconditional vector-lowering pass
   ("veclower"); it has no gate, so it runs whenever scheduled.  */
struct tree_opt_pass pass_lower_vector =
{
  "veclower",				/* name */
  0,					/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_ggc_collect
    | TODO_verify_stmts,		/* todo_flags_finish */
  0					/* letter */
};
506
/* Pass descriptor for the SSA-form vector-lowering pass ("veclower2"),
   gated on the vectorizer flag and updating SSA form afterwards.  */
struct tree_opt_pass pass_lower_vector_ssa =
{
  "veclower2",				/* name */
  gate_expand_vector_operations,	/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_update_ssa	/* todo_flags_finish */
    | TODO_verify_ssa
    | TODO_verify_stmts | TODO_verify_flow,
  0					/* letter */
};
525
526#include "gt-tree-vect-generic.h"
527