1/* Lower vector operations to scalar operations. 2 Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc. 3 4This file is part of GCC. 5 6GCC is free software; you can redistribute it and/or modify it 7under the terms of the GNU General Public License as published by the 8Free Software Foundation; either version 2, or (at your option) any 9later version. 10 11GCC is distributed in the hope that it will be useful, but WITHOUT 12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14for more details. 15 16You should have received a copy of the GNU General Public License 17along with GCC; see the file COPYING. If not, write to the Free 18Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 1902110-1301, USA. */ 20 21#include "config.h" 22#include "system.h" 23#include "coretypes.h" 24#include "tree.h" 25#include "tm.h" 26#include "rtl.h" 27#include "expr.h" 28#include "insn-codes.h" 29#include "diagnostic.h" 30#include "optabs.h" 31#include "machmode.h" 32#include "langhooks.h" 33#include "tree-flow.h" 34#include "tree-gimple.h" 35#include "tree-iterator.h" 36#include "tree-pass.h" 37#include "flags.h" 38#include "ggc.h" 39 40 41/* Build a constant of type TYPE, made of VALUE's bits replicated 42 every TYPE_SIZE (INNER_TYPE) bits to fit TYPE's precision. 
*/
static tree
build_replicated_const (tree type, tree inner_type, HOST_WIDE_INT value)
{
  /* Bit width of one vector element, and how many such elements fit
     in one HOST_WIDE_INT.  */
  int width = tree_low_cst (TYPE_SIZE (inner_type), 1);
  int n = HOST_BITS_PER_WIDE_INT / width;
  unsigned HOST_WIDE_INT low, high, mask;
  tree ret;

  /* The element must not be wider than a HOST_WIDE_INT.  */
  gcc_assert (n);

  if (width == HOST_BITS_PER_WIDE_INT)
    low = value;
  else
    {
      /* Replicate the low WIDTH bits of VALUE across the word:
	 ~0 / MASK is the 0x..010101 pattern, so multiplying it by the
	 masked value stamps VALUE into every WIDTH-bit slot.  */
      mask = ((HOST_WIDE_INT)1 << width) - 1;
      low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask);
    }

  /* Truncate or widen the replicated word to TYPE's precision.  */
  if (TYPE_PRECISION (type) < HOST_BITS_PER_WIDE_INT)
    low &= ((HOST_WIDE_INT)1 << TYPE_PRECISION (type)) - 1, high = 0;
  else if (TYPE_PRECISION (type) == HOST_BITS_PER_WIDE_INT)
    high = 0;
  else if (TYPE_PRECISION (type) == 2 * HOST_BITS_PER_WIDE_INT)
    high = low;
  else
    gcc_unreachable ();

  ret = build_int_cst_wide (type, low, high);
  return ret;
}

/* Cache for build_word_mode_vector_type: the shared word-mode inner
   type, plus the most recently built vector type and its element
   count.  GTY so the GC does not collect them between passes.  */
static GTY(()) tree vector_inner_type;
static GTY(()) tree vector_last_type;
static GTY(()) int vector_last_nunits;

/* Return a suitable vector types made of SUBPARTS units each of mode
   "word_mode" (the global variable).  */
static tree
build_word_mode_vector_type (int nunits)
{
  if (!vector_inner_type)
    vector_inner_type = lang_hooks.types.type_for_mode (word_mode, 1);
  else if (vector_last_nunits == nunits)
    {
      /* Cache hit: reuse the last type built for this element count.  */
      gcc_assert (TREE_CODE (vector_last_type) == VECTOR_TYPE);
      return vector_last_type;
    }

  /* We build a new type, but we canonicalize it nevertheless,
     because it still saves some memory.  */
  vector_last_nunits = nunits;
  vector_last_type = type_hash_canon (nunits,
				      build_vector_type (vector_inner_type,
							 nunits));
  return vector_last_type;
}

/* Callback type used to lower one piece of a vector operation: build
   the scalar operation (last argument, a tree code) on the pieces of
   the two operand trees located at the given bit position and bit
   size, inserting any new statements before the iterator.  */
typedef tree (*elem_op_func) (block_stmt_iterator *,
			      tree, tree, tree, tree, tree, enum tree_code);

/* Extract the piece of T of type TYPE at BITPOS with width BITSIZE.
   A NULL BITPOS means T is already piece-sized and only needs to be
   reinterpreted via a VIEW_CONVERT_EXPR.  */
static inline tree
tree_vec_extract (block_stmt_iterator *bsi, tree type,
		  tree t, tree bitsize, tree bitpos)
{
  if (bitpos)
    return gimplify_build3 (bsi, BIT_FIELD_REF, type, t, bitsize, bitpos);
  else
    return gimplify_build1 (bsi, VIEW_CONVERT_EXPR, type, t);
}

/* Lower one piece of a unary vector operation: apply CODE to the
   piece of A at BITPOS/BITSIZE.  B is ignored.  */
static tree
do_unop (block_stmt_iterator *bsi, tree inner_type, tree a,
	 tree b ATTRIBUTE_UNUSED, tree bitpos, tree bitsize,
	 enum tree_code code)
{
  a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
  return gimplify_build1 (bsi, code, inner_type, a);
}

/* Lower one piece of a binary vector operation: apply CODE to the
   pieces of A and B at BITPOS/BITSIZE.  */
static tree
do_binop (block_stmt_iterator *bsi, tree inner_type, tree a, tree b,
	  tree bitpos, tree bitsize, enum tree_code code)
{
  a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
  b = tree_vec_extract (bsi, inner_type, b, bitsize, bitpos);
  return gimplify_build2 (bsi, code, inner_type, a, b);
}

/* Expand vector addition to scalars.  This does bit twiddling
   in order to increase parallelism:

   a + b = (((int) a & 0x7f7f7f7f) + ((int) b & 0x7f7f7f7f)) ^
           (a ^ b) & 0x80808080

   a - b = (((int) a | 0x80808080) - ((int) b & 0x7f7f7f7f)) ^
           (a ^ ~b) & 0x80808080

   -b = (0x80808080 - ((int) b & 0x7f7f7f7f)) ^ (~b & 0x80808080)

   This optimization should be done only if 4 vector items or more
   fit into a word.
*/ 144static tree 145do_plus_minus (block_stmt_iterator *bsi, tree word_type, tree a, tree b, 146 tree bitpos ATTRIBUTE_UNUSED, tree bitsize ATTRIBUTE_UNUSED, 147 enum tree_code code) 148{ 149 tree inner_type = TREE_TYPE (TREE_TYPE (a)); 150 unsigned HOST_WIDE_INT max; 151 tree low_bits, high_bits, a_low, b_low, result_low, signs; 152 153 max = GET_MODE_MASK (TYPE_MODE (inner_type)); 154 low_bits = build_replicated_const (word_type, inner_type, max >> 1); 155 high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1)); 156 157 a = tree_vec_extract (bsi, word_type, a, bitsize, bitpos); 158 b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos); 159 160 signs = gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, a, b); 161 b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits); 162 if (code == PLUS_EXPR) 163 a_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, a, low_bits); 164 else 165 { 166 a_low = gimplify_build2 (bsi, BIT_IOR_EXPR, word_type, a, high_bits); 167 signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, signs); 168 } 169 170 signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits); 171 result_low = gimplify_build2 (bsi, code, word_type, a_low, b_low); 172 return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs); 173} 174 175static tree 176do_negate (block_stmt_iterator *bsi, tree word_type, tree b, 177 tree unused ATTRIBUTE_UNUSED, tree bitpos ATTRIBUTE_UNUSED, 178 tree bitsize ATTRIBUTE_UNUSED, 179 enum tree_code code ATTRIBUTE_UNUSED) 180{ 181 tree inner_type = TREE_TYPE (TREE_TYPE (b)); 182 HOST_WIDE_INT max; 183 tree low_bits, high_bits, b_low, result_low, signs; 184 185 max = GET_MODE_MASK (TYPE_MODE (inner_type)); 186 low_bits = build_replicated_const (word_type, inner_type, max >> 1); 187 high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1)); 188 189 b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos); 190 191 b_low = gimplify_build2 (bsi, BIT_AND_EXPR, 
word_type, b, low_bits); 192 signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, b); 193 signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits); 194 result_low = gimplify_build2 (bsi, MINUS_EXPR, word_type, high_bits, b_low); 195 return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs); 196} 197 198/* Expand a vector operation to scalars, by using many operations 199 whose type is the vector type's inner type. */ 200static tree 201expand_vector_piecewise (block_stmt_iterator *bsi, elem_op_func f, 202 tree type, tree inner_type, 203 tree a, tree b, enum tree_code code) 204{ 205 VEC(constructor_elt,gc) *v; 206 tree part_width = TYPE_SIZE (inner_type); 207 tree index = bitsize_int (0); 208 int nunits = TYPE_VECTOR_SUBPARTS (type); 209 int delta = tree_low_cst (part_width, 1) 210 / tree_low_cst (TYPE_SIZE (TREE_TYPE (type)), 1); 211 int i; 212 213 v = VEC_alloc(constructor_elt, gc, (nunits + delta - 1) / delta); 214 for (i = 0; i < nunits; 215 i += delta, index = int_const_binop (PLUS_EXPR, index, part_width, 0)) 216 { 217 tree result = f (bsi, inner_type, a, b, index, part_width, code); 218 constructor_elt *ce = VEC_quick_push (constructor_elt, v, NULL); 219 ce->index = NULL_TREE; 220 ce->value = result; 221 } 222 223 return build_constructor (type, v); 224} 225 226/* Expand a vector operation to scalars with the freedom to use 227 a scalar integer type, or to use a different size for the items 228 in the vector type. */ 229static tree 230expand_vector_parallel (block_stmt_iterator *bsi, elem_op_func f, tree type, 231 tree a, tree b, 232 enum tree_code code) 233{ 234 tree result, compute_type; 235 enum machine_mode mode; 236 int n_words = tree_low_cst (TYPE_SIZE_UNIT (type), 1) / UNITS_PER_WORD; 237 238 /* We have three strategies. If the type is already correct, just do 239 the operation an element at a time. 
Else, if the vector is wider than 240 one word, do it a word at a time; finally, if the vector is smaller 241 than one word, do it as a scalar. */ 242 if (TYPE_MODE (TREE_TYPE (type)) == word_mode) 243 return expand_vector_piecewise (bsi, f, 244 type, TREE_TYPE (type), 245 a, b, code); 246 else if (n_words > 1) 247 { 248 tree word_type = build_word_mode_vector_type (n_words); 249 result = expand_vector_piecewise (bsi, f, 250 word_type, TREE_TYPE (word_type), 251 a, b, code); 252 result = gimplify_val (bsi, word_type, result); 253 } 254 else 255 { 256 /* Use a single scalar operation with a mode no wider than word_mode. */ 257 mode = mode_for_size (tree_low_cst (TYPE_SIZE (type), 1), MODE_INT, 0); 258 compute_type = lang_hooks.types.type_for_mode (mode, 1); 259 result = f (bsi, compute_type, a, b, NULL_TREE, NULL_TREE, code); 260 } 261 262 return result; 263} 264 265/* Expand a vector operation to scalars; for integer types we can use 266 special bit twiddling tricks to do the sums a word at a time, using 267 function F_PARALLEL instead of F. These tricks are done only if 268 they can process at least four items, that is, only if the vector 269 holds at least four items and if a word can hold four items. 
*/
static tree
expand_vector_addition (block_stmt_iterator *bsi,
			elem_op_func f, elem_op_func f_parallel,
			tree type, tree a, tree b, enum tree_code code)
{
  /* How many vector elements fit in one machine word.  */
  int parts_per_word = UNITS_PER_WORD
		       / tree_low_cst (TYPE_SIZE_UNIT (TREE_TYPE (type)), 1);

  /* The word-at-a-time tricks only pay off with >= 4 items per word
     and >= 4 items in the vector; otherwise go element by element.  */
  if (INTEGRAL_TYPE_P (TREE_TYPE (type))
      && parts_per_word >= 4
      && TYPE_VECTOR_SUBPARTS (type) >= 4)
    return expand_vector_parallel (bsi, f_parallel,
				   type, a, b, code);
  else
    return expand_vector_piecewise (bsi, f,
				    type, TREE_TYPE (type),
				    a, b, code);
}

/* Expand the vector operation RHS (with code CODE) of result type TYPE
   into operations on COMPUTE_TYPE, choosing bit-twiddling or piecewise
   expansion as appropriate.  Returns the replacement expression.  */
static tree
expand_vector_operation (block_stmt_iterator *bsi, tree type, tree compute_type,
			 tree rhs, enum tree_code code)
{
  enum machine_mode compute_mode = TYPE_MODE (compute_type);

  /* If the compute mode is not a vector mode (hence we are not decomposing
     a BLKmode vector to smaller, hardware-supported vectors), we may want
     to expand the operations in parallel.  */
  if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
      && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT)
    switch (code)
      {
      case PLUS_EXPR:
      case MINUS_EXPR:
	/* The carry-isolating tricks are invalid when overflow traps.  */
	if (!TYPE_OVERFLOW_TRAPS (type))
	  return expand_vector_addition (bsi, do_binop, do_plus_minus, type,
					 TREE_OPERAND (rhs, 0),
					 TREE_OPERAND (rhs, 1), code);
	break;

      case NEGATE_EXPR:
	if (!TYPE_OVERFLOW_TRAPS (type))
	  return expand_vector_addition (bsi, do_unop, do_negate, type,
					 TREE_OPERAND (rhs, 0),
					 NULL_TREE, code);
	break;

      case BIT_AND_EXPR:
      case BIT_IOR_EXPR:
      case BIT_XOR_EXPR:
	/* Bitwise operations never cross element boundaries, so they
	   can always be done a word at a time.  */
	return expand_vector_parallel (bsi, do_binop, type,
				       TREE_OPERAND (rhs, 0),
				       TREE_OPERAND (rhs, 1), code);

      case BIT_NOT_EXPR:
	return expand_vector_parallel (bsi, do_unop, type,
				       TREE_OPERAND (rhs, 0),
				       NULL_TREE, code);

      default:
	break;
      }

  /* Fallback: expand piecewise in units of COMPUTE_TYPE.  */
  if (TREE_CODE_CLASS (code) == tcc_unary)
    return expand_vector_piecewise (bsi, do_unop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    NULL_TREE, code);
  else
    return expand_vector_piecewise (bsi, do_binop, type, compute_type,
				    TREE_OPERAND (rhs, 0),
				    TREE_OPERAND (rhs, 1), code);
}

/* Return a type for the widest vector mode whose components are of mode
   INNER_MODE, or NULL_TREE if none is found.  */
static tree
type_for_widest_vector_mode (enum machine_mode inner_mode, optab op)
{
  enum machine_mode best_mode = VOIDmode, mode;
  int best_nunits = 0;

  if (SCALAR_FLOAT_MODE_P (inner_mode))
    mode = MIN_MODE_VECTOR_FLOAT;
  else
    mode = MIN_MODE_VECTOR_INT;

  /* Walk all vector modes of the right class, keeping the one with the
     most subparts for which OP has an insn.  */
  for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
    if (GET_MODE_INNER (mode) == inner_mode
	&& GET_MODE_NUNITS (mode) > best_nunits
	&& op->handlers[mode].insn_code != CODE_FOR_nothing)
      best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);

  if (best_mode == VOIDmode)
    return NULL_TREE;
  else
    return lang_hooks.types.type_for_mode (best_mode, 1);
}

/* Process one statement.  If we identify a vector operation, expand it.
*/

static void
expand_vector_operations_1 (block_stmt_iterator *bsi)
{
  tree stmt = bsi_stmt (*bsi);
  tree *p_lhs, *p_rhs, lhs, rhs, type, compute_type;
  enum tree_code code;
  enum machine_mode compute_mode;
  optab op;

  /* Find the assignment, if any, embedded in this statement; keep
     pointers to the operands so the RHS can be replaced in place.  */
  switch (TREE_CODE (stmt))
    {
    case RETURN_EXPR:
      stmt = TREE_OPERAND (stmt, 0);
      if (!stmt || TREE_CODE (stmt) != MODIFY_EXPR)
	return;

      /* FALLTHRU */

    case MODIFY_EXPR:
      p_lhs = &TREE_OPERAND (stmt, 0);
      p_rhs = &TREE_OPERAND (stmt, 1);
      lhs = *p_lhs;
      rhs = *p_rhs;
      break;

    default:
      return;
    }

  /* Only unary/binary operations on vector types are of interest.  */
  type = TREE_TYPE (rhs);
  if (TREE_CODE (type) != VECTOR_TYPE)
    return;

  code = TREE_CODE (rhs);
  if (TREE_CODE_CLASS (code) != tcc_unary
      && TREE_CODE_CLASS (code) != tcc_binary)
    return;

  if (code == NOP_EXPR || code == VIEW_CONVERT_EXPR)
    return;

  gcc_assert (code != CONVERT_EXPR);
  op = optab_for_tree_code (code, type);

  /* For widening vector operations, the relevant type is of the arguments,
     not the widened result.  */
  if (code == WIDEN_SUM_EXPR)
    type = TREE_TYPE (TREE_OPERAND (rhs, 0));

  /* Optabs will try converting a negation into a subtraction, so
     look for it as well.  TODO: negation of floating-point vectors
     might be turned into an exclusive OR toggling the sign bit.  */
  if (op == NULL
      && code == NEGATE_EXPR
      && INTEGRAL_TYPE_P (TREE_TYPE (type)))
    op = optab_for_tree_code (MINUS_EXPR, type);

  /* For very wide vectors, try using a smaller vector mode.  */
  compute_type = type;
  if (TYPE_MODE (type) == BLKmode && op)
    {
      tree vector_compute_type
	= type_for_widest_vector_mode (TYPE_MODE (TREE_TYPE (type)), op);
      if (vector_compute_type != NULL_TREE)
	compute_type = vector_compute_type;
    }

  /* If we are breaking a BLKmode vector into smaller pieces,
     type_for_widest_vector_mode has already looked into the optab,
     so skip these checks.  */
  if (compute_type == type)
    {
      compute_mode = TYPE_MODE (compute_type);
      if ((GET_MODE_CLASS (compute_mode) == MODE_VECTOR_INT
	   || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_FLOAT)
	  && op != NULL
	  && op->handlers[compute_mode].insn_code != CODE_FOR_nothing)
	return;
      else
	/* There is no operation in hardware, so fall back to scalars.  */
	compute_type = TREE_TYPE (type);
    }

  gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
  rhs = expand_vector_operation (bsi, type, compute_type, rhs, code);
  /* If the expansion's type differs from the LHS type, reinterpret it.  */
  if (lang_hooks.types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
    *p_rhs = rhs;
  else
    *p_rhs = gimplify_build1 (bsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), rhs);

  mark_stmt_modified (bsi_stmt (*bsi));
}

/* Use this to lower vector operations introduced by the vectorizer,
   if it may need the bit-twiddling tricks implemented in this file.
*/

static bool
gate_expand_vector_operations (void)
{
  return flag_tree_vectorize != 0;
}

/* Walk every statement of every basic block, lowering any vector
   operation that the target cannot handle directly.  */
static unsigned int
expand_vector_operations (void)
{
  block_stmt_iterator bsi;
  basic_block bb;

  FOR_EACH_BB (bb)
    {
      for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
	{
	  expand_vector_operations_1 (&bsi);
	  update_stmt_if_modified (bsi_stmt (bsi));
	}
    }
  return 0;
}

/* Unconditional lowering pass, run before going into SSA form.  */
struct tree_opt_pass pass_lower_vector =
{
  "veclower",				/* name */
  0,					/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_ggc_collect
    | TODO_verify_stmts,		/* todo_flags_finish */
  0					/* letter */
};

/* Lowering pass run after the vectorizer, gated on -ftree-vectorize.  */
struct tree_opt_pass pass_lower_vector_ssa =
{
  "veclower2",				/* name */
  gate_expand_vector_operations,	/* gate */
  expand_vector_operations,		/* execute */
  NULL,					/* sub */
  NULL,					/* next */
  0,					/* static_pass_number */
  0,					/* tv_id */
  PROP_cfg,				/* properties_required */
  0,					/* properties_provided */
  0,					/* properties_destroyed */
  0,					/* todo_flags_start */
  TODO_dump_func | TODO_update_ssa	/* todo_flags_finish */
    | TODO_verify_ssa
    | TODO_verify_stmts | TODO_verify_flow,
  0					/* letter */
};

#include "gt-tree-vect-generic.h"