1/* Copyright (C) 2006-2015 Free Software Foundation, Inc.
2
3   This file is free software; you can redistribute it and/or modify it under
4   the terms of the GNU General Public License as published by the Free
5   Software Foundation; either version 3 of the License, or (at your option)
6   any later version.
7
8   This file is distributed in the hope that it will be useful, but WITHOUT
9   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11   for more details.
12
13   You should have received a copy of the GNU General Public License
14   along with GCC; see the file COPYING3.  If not see
15   <http://www.gnu.org/licenses/>.  */
16
17#include "config.h"
18#include "system.h"
19#include "coretypes.h"
20#include "tm.h"
21#include "rtl.h"
22#include "regs.h"
23#include "hard-reg-set.h"
24#include "insn-config.h"
25#include "conditions.h"
26#include "insn-attr.h"
27#include "flags.h"
28#include "recog.h"
29#include "obstack.h"
30#include "hash-set.h"
31#include "machmode.h"
32#include "vec.h"
33#include "double-int.h"
34#include "input.h"
35#include "alias.h"
36#include "symtab.h"
37#include "wide-int.h"
38#include "inchash.h"
39#include "tree.h"
40#include "fold-const.h"
41#include "stringpool.h"
42#include "stor-layout.h"
43#include "calls.h"
44#include "varasm.h"
45#include "hashtab.h"
46#include "function.h"
47#include "statistics.h"
48#include "real.h"
49#include "fixed-value.h"
50#include "expmed.h"
51#include "dojump.h"
52#include "explow.h"
53#include "emit-rtl.h"
54#include "stmt.h"
55#include "expr.h"
56#include "insn-codes.h"
57#include "optabs.h"
58#include "except.h"
59#include "output.h"
60#include "predict.h"
61#include "dominance.h"
62#include "cfg.h"
63#include "cfgrtl.h"
64#include "cfganal.h"
65#include "lcm.h"
66#include "cfgbuild.h"
67#include "cfgcleanup.h"
68#include "basic-block.h"
69#include "diagnostic-core.h"
70#include "ggc.h"
71#include "tm_p.h"
72#include "target.h"
73#include "target-def.h"
74#include "langhooks.h"
75#include "reload.h"
76#include "sched-int.h"
77#include "params.h"
78#include "hash-table.h"
79#include "tree-ssa-alias.h"
80#include "internal-fn.h"
81#include "gimple-fold.h"
82#include "tree-eh.h"
83#include "gimple-expr.h"
84#include "is-a.h"
85#include "gimple.h"
86#include "gimplify.h"
87#include "tm-constrs.h"
88#include "sbitmap.h"
89#include "df.h"
90#include "ddg.h"
91#include "timevar.h"
92#include "dumpfile.h"
93#include "cfgloop.h"
94#include "builtins.h"
95#include "rtl-iter.h"
96
97/* Builtin types, data and prototypes. */
98
99enum spu_builtin_type_index
100{
101  SPU_BTI_END_OF_PARAMS,
102
103  /* We create new type nodes for these. */
104  SPU_BTI_V16QI,
105  SPU_BTI_V8HI,
106  SPU_BTI_V4SI,
107  SPU_BTI_V2DI,
108  SPU_BTI_V4SF,
109  SPU_BTI_V2DF,
110  SPU_BTI_UV16QI,
111  SPU_BTI_UV8HI,
112  SPU_BTI_UV4SI,
113  SPU_BTI_UV2DI,
114
115  /* A 16-byte type. (Implemented with V16QI_type_node) */
116  SPU_BTI_QUADWORD,
117
118  /* These all correspond to intSI_type_node */
119  SPU_BTI_7,
120  SPU_BTI_S7,
121  SPU_BTI_U7,
122  SPU_BTI_S10,
123  SPU_BTI_S10_4,
124  SPU_BTI_U14,
125  SPU_BTI_16,
126  SPU_BTI_S16,
127  SPU_BTI_S16_2,
128  SPU_BTI_U16,
129  SPU_BTI_U16_2,
130  SPU_BTI_U18,
131
132  /* These correspond to the standard types */
133  SPU_BTI_INTQI,
134  SPU_BTI_INTHI,
135  SPU_BTI_INTSI,
136  SPU_BTI_INTDI,
137
138  SPU_BTI_UINTQI,
139  SPU_BTI_UINTHI,
140  SPU_BTI_UINTSI,
141  SPU_BTI_UINTDI,
142
143  SPU_BTI_FLOAT,
144  SPU_BTI_DOUBLE,
145
146  SPU_BTI_VOID,
147  SPU_BTI_PTR,
148
149  SPU_BTI_MAX
150};
151
152#define V16QI_type_node               (spu_builtin_types[SPU_BTI_V16QI])
153#define V8HI_type_node                (spu_builtin_types[SPU_BTI_V8HI])
154#define V4SI_type_node                (spu_builtin_types[SPU_BTI_V4SI])
155#define V2DI_type_node                (spu_builtin_types[SPU_BTI_V2DI])
156#define V4SF_type_node                (spu_builtin_types[SPU_BTI_V4SF])
157#define V2DF_type_node                (spu_builtin_types[SPU_BTI_V2DF])
158#define unsigned_V16QI_type_node      (spu_builtin_types[SPU_BTI_UV16QI])
159#define unsigned_V8HI_type_node       (spu_builtin_types[SPU_BTI_UV8HI])
160#define unsigned_V4SI_type_node       (spu_builtin_types[SPU_BTI_UV4SI])
161#define unsigned_V2DI_type_node       (spu_builtin_types[SPU_BTI_UV2DI])
162
163static GTY(()) tree spu_builtin_types[SPU_BTI_MAX];
164
165struct spu_builtin_range
166{
167  int low, high;
168};
169
170static struct spu_builtin_range spu_builtin_range[] = {
171  {-0x40ll, 0x7fll},		/* SPU_BTI_7     */
172  {-0x40ll, 0x3fll},		/* SPU_BTI_S7    */
173  {0ll, 0x7fll},		/* SPU_BTI_U7    */
174  {-0x200ll, 0x1ffll},		/* SPU_BTI_S10   */
175  {-0x2000ll, 0x1fffll},	/* SPU_BTI_S10_4 */
176  {0ll, 0x3fffll},		/* SPU_BTI_U14   */
177  {-0x8000ll, 0xffffll},	/* SPU_BTI_16    */
178  {-0x8000ll, 0x7fffll},	/* SPU_BTI_S16   */
179  {-0x20000ll, 0x1ffffll},	/* SPU_BTI_S16_2 */
180  {0ll, 0xffffll},		/* SPU_BTI_U16   */
181  {0ll, 0x3ffffll},		/* SPU_BTI_U16_2 */
182  {0ll, 0x3ffffll},		/* SPU_BTI_U18   */
183};
184
185
186/*  Target specific attribute specifications.  */
187char regs_ever_allocated[FIRST_PSEUDO_REGISTER];
188
189/*  Prototypes and external defs.  */
190static int get_pipe (rtx_insn *insn);
191static int spu_naked_function_p (tree func);
192static int mem_is_padded_component_ref (rtx x);
193static void fix_range (const char *);
194static rtx spu_expand_load (rtx, rtx, rtx, int);
195
196/* Which instruction set architecture to use.  */
197int spu_arch;
198/* Which cpu are we tuning for.  */
199int spu_tune;
200
/* The hardware requires 8 insns between a hint and the branch it
   affects.  This variable describes how many rtl instructions the
   compiler needs to see before inserting a hint, and then the compiler
   will insert enough nops to make it at least 8 insns.  The default is
   for the compiler to allow up to 2 nops to be emitted.  The nops are
   inserted in pairs, so we round down. */
int spu_hint_dist = (8*4) - (2*4);
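/* A worked example of the arithmetic above (illustrative only): insns
   are 4 bytes, so the required 8-insn separation is 8*4 = 32 bytes.
   Allowing the default of 2 nops (2*4 = 8 bytes) leaves 32 - 8 = 24
   bytes of real instructions that must be seen before a hint is
   inserted.  */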
208
209enum spu_immediate {
210  SPU_NONE,
211  SPU_IL,
212  SPU_ILA,
213  SPU_ILH,
214  SPU_ILHU,
215  SPU_ORI,
216  SPU_ORHI,
217  SPU_ORBI,
218  SPU_IOHL
219};
220enum immediate_class
221{
222  IC_POOL,			/* constant pool */
223  IC_IL1,			/* one il* instruction */
224  IC_IL2,			/* both ilhu and iohl instructions */
225  IC_IL1s,			/* one il* instruction */
226  IC_IL2s,			/* both ilhu and iohl instructions */
227  IC_FSMBI,			/* the fsmbi instruction */
228  IC_CPAT,			/* one of the c*d instructions */
229  IC_FSMBI2			/* fsmbi plus 1 other instruction */
230};
231
232static enum spu_immediate which_immediate_load (HOST_WIDE_INT val);
233static enum spu_immediate which_logical_immediate (HOST_WIDE_INT val);
234static int cpat_info(unsigned char *arr, int size, int *prun, int *pstart);
235static enum immediate_class classify_immediate (rtx op,
236						machine_mode mode);
237
238/* Pointer mode for __ea references.  */
239#define EAmode (spu_ea_model != 32 ? DImode : SImode)
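/* For example, compiling with -mea64 sets spu_ea_model to 64, so __ea
   pointers are DImode; with -mea32 they are SImode.  */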
240
241
242/* Define the structure for the machine field in struct function.  */
243struct GTY(()) machine_function
244{
245  /* Register to use for PIC accesses.  */
246  rtx pic_reg;
247};
248
249/* How to allocate a 'struct machine_function'.  */
250static struct machine_function *
251spu_init_machine_status (void)
252{
253  return ggc_cleared_alloc<machine_function> ();
254}
255
256/* Implement TARGET_OPTION_OVERRIDE.  */
257static void
258spu_option_override (void)
259{
260  /* Set up function hooks.  */
261  init_machine_status = spu_init_machine_status;
262
  /* Small loops will be completely unrolled (peeled) at -O3.  For SPU
     it is more important to keep code small by default.  */
265  if (!flag_unroll_loops && !flag_peel_loops)
266    maybe_set_param_value (PARAM_MAX_COMPLETELY_PEEL_TIMES, 4,
267			   global_options.x_param_values,
268			   global_options_set.x_param_values);
269
270  flag_omit_frame_pointer = 1;
271
  /* Functions must be 8-byte aligned so that we correctly handle dual
     issue.  */
273  if (align_functions < 8)
274    align_functions = 8;
275
276  spu_hint_dist = 8*4 - spu_max_nops*4;
277  if (spu_hint_dist < 0)
278    spu_hint_dist = 0;
279
280  if (spu_fixed_range_string)
281    fix_range (spu_fixed_range_string);
282
283  /* Determine processor architectural level.  */
284  if (spu_arch_string)
285    {
286      if (strcmp (&spu_arch_string[0], "cell") == 0)
287        spu_arch = PROCESSOR_CELL;
288      else if (strcmp (&spu_arch_string[0], "celledp") == 0)
289        spu_arch = PROCESSOR_CELLEDP;
290      else
291        error ("bad value (%s) for -march= switch", spu_arch_string);
292    }
293
294  /* Determine processor to tune for.  */
295  if (spu_tune_string)
296    {
297      if (strcmp (&spu_tune_string[0], "cell") == 0)
298        spu_tune = PROCESSOR_CELL;
299      else if (strcmp (&spu_tune_string[0], "celledp") == 0)
300        spu_tune = PROCESSOR_CELLEDP;
301      else
302        error ("bad value (%s) for -mtune= switch", spu_tune_string);
303    }
304
305  /* Change defaults according to the processor architecture.  */
306  if (spu_arch == PROCESSOR_CELLEDP)
307    {
308      /* If no command line option has been otherwise specified, change
309	 the default to -mno-safe-hints on celledp -- only the original
310	 Cell/B.E. processors require this workaround.  */
311      if (!(target_flags_explicit & MASK_SAFE_HINTS))
312	target_flags &= ~MASK_SAFE_HINTS;
313    }
314
315  REAL_MODE_FORMAT (SFmode) = &spu_single_format;
316}
317
318/* Handle an attribute requiring a FUNCTION_DECL; arguments as in
319   struct attribute_spec.handler.  */
320
321/* True if MODE is valid for the target.  By "valid", we mean able to
322   be manipulated in non-trivial ways.  In particular, this means all
323   the arithmetic is supported.  */
324static bool
325spu_scalar_mode_supported_p (machine_mode mode)
326{
327  switch (mode)
328    {
329    case QImode:
330    case HImode:
331    case SImode:
332    case SFmode:
333    case DImode:
334    case TImode:
335    case DFmode:
336      return true;
337
338    default:
339      return false;
340    }
341}
342
/* Similarly for vector modes.  "Supported" here is less strict.  At
   least some operations are supported; you need to check the optabs or
   builtins for further details.  */
346static bool
347spu_vector_mode_supported_p (machine_mode mode)
348{
349  switch (mode)
350    {
351    case V16QImode:
352    case V8HImode:
353    case V4SImode:
354    case V2DImode:
355    case V4SFmode:
356    case V2DFmode:
357      return true;
358
359    default:
360      return false;
361    }
362}
363
/* GCC assumes that in a paradoxical SUBREG the inner mode occupies the
   least significant bytes of the outer mode.  This function returns
   TRUE for the SUBREGs where this is correct.  */
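/* For example, (subreg:SI (reg:QI) 0) is valid because both modes fit
   in a 4-byte slot, and (subreg:TI (reg:V4SI) 0) is valid because both
   modes are 16 bytes wide; but (subreg:DI (reg:SI) 0) is rejected,
   since the SImode value does not occupy the least significant bytes
   of the DImode register.  */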
367int
368valid_subreg (rtx op)
369{
370  machine_mode om = GET_MODE (op);
371  machine_mode im = GET_MODE (SUBREG_REG (op));
372  return om != VOIDmode && im != VOIDmode
373    && (GET_MODE_SIZE (im) == GET_MODE_SIZE (om)
374	|| (GET_MODE_SIZE (im) <= 4 && GET_MODE_SIZE (om) <= 4)
375	|| (GET_MODE_SIZE (im) >= 16 && GET_MODE_SIZE (om) >= 16));
376}
377
/* When insv and ext[sz]v are passed a TI SUBREG, we want to strip it off
   and adjust the start offset.  */
380static rtx
381adjust_operand (rtx op, HOST_WIDE_INT * start)
382{
383  machine_mode mode;
384  int op_size;
385  /* Strip any paradoxical SUBREG.  */
386  if (GET_CODE (op) == SUBREG
387      && (GET_MODE_BITSIZE (GET_MODE (op))
388	  > GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)))))
389    {
390      if (start)
391	*start -=
392	  GET_MODE_BITSIZE (GET_MODE (op)) -
393	  GET_MODE_BITSIZE (GET_MODE (SUBREG_REG (op)));
394      op = SUBREG_REG (op);
395    }
  /* If it is smaller than SI, ensure it ends up inside an SImode SUBREG.  */
397  op_size = GET_MODE_BITSIZE (GET_MODE (op));
398  if (op_size < 32)
399    {
400      if (start)
401	*start += 32 - op_size;
402      op_size = 32;
403    }
404  /* If it is not a MODE_INT (and/or it is smaller than SI) add a SUBREG. */
405  mode = mode_for_size (op_size, MODE_INT, 0);
406  if (mode != GET_MODE (op))
407    op = gen_rtx_SUBREG (mode, op, 0);
408  return op;
409}
410
411void
412spu_expand_extv (rtx ops[], int unsignedp)
413{
414  rtx dst = ops[0], src = ops[1];
415  HOST_WIDE_INT width = INTVAL (ops[2]);
416  HOST_WIDE_INT start = INTVAL (ops[3]);
417  HOST_WIDE_INT align_mask;
418  rtx s0, s1, mask, r0;
419
420  gcc_assert (REG_P (dst) && GET_MODE (dst) == TImode);
421
422  if (MEM_P (src))
423    {
424      /* First, determine if we need 1 TImode load or 2.  We need only 1
425         if the bits being extracted do not cross the alignment boundary
426         as determined by the MEM and its address. */
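      /* Illustrative example: with MEM_ALIGN == 128 (a 16-byte aligned
         MEM) align_mask is -128, so extracting width 16 starting at bit
         120 covers bits 120..135, which straddles the 128-bit boundary
         and takes the two-load path below.  */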
427
428      align_mask = -MEM_ALIGN (src);
429      if ((start & align_mask) == ((start + width - 1) & align_mask))
430	{
431	  /* Alignment is sufficient for 1 load. */
432	  s0 = gen_reg_rtx (TImode);
433	  r0 = spu_expand_load (s0, 0, src, start / 8);
434	  start &= 7;
435	  if (r0)
436	    emit_insn (gen_rotqby_ti (s0, s0, r0));
437	}
438      else
439	{
440	  /* Need 2 loads. */
441	  s0 = gen_reg_rtx (TImode);
442	  s1 = gen_reg_rtx (TImode);
443	  r0 = spu_expand_load (s0, s1, src, start / 8);
444	  start &= 7;
445
446	  gcc_assert (start + width <= 128);
447	  if (r0)
448	    {
449	      rtx r1 = gen_reg_rtx (SImode);
450	      mask = gen_reg_rtx (TImode);
451	      emit_move_insn (mask, GEN_INT (-1));
452	      emit_insn (gen_rotqby_ti (s0, s0, r0));
453	      emit_insn (gen_rotqby_ti (s1, s1, r0));
454	      if (GET_CODE (r0) == CONST_INT)
455		r1 = GEN_INT (INTVAL (r0) & 15);
456	      else
457		emit_insn (gen_andsi3 (r1, r0, GEN_INT (15)));
458	      emit_insn (gen_shlqby_ti (mask, mask, r1));
459	      emit_insn (gen_selb (s0, s1, s0, mask));
460	    }
461	}
462
463    }
464  else if (GET_CODE (src) == SUBREG)
465    {
466      rtx r = SUBREG_REG (src);
467      gcc_assert (REG_P (r) && SCALAR_INT_MODE_P (GET_MODE (r)));
468      s0 = gen_reg_rtx (TImode);
469      if (GET_MODE_SIZE (GET_MODE (r)) < GET_MODE_SIZE (TImode))
470	emit_insn (gen_rtx_SET (VOIDmode, s0, gen_rtx_ZERO_EXTEND (TImode, r)));
471      else
472	emit_move_insn (s0, src);
473    }
474  else
475    {
476      gcc_assert (REG_P (src) && GET_MODE (src) == TImode);
477      s0 = gen_reg_rtx (TImode);
478      emit_move_insn (s0, src);
479    }
480
481  /* Now s0 is TImode and contains the bits to extract at start. */
482
483  if (start)
484    emit_insn (gen_rotlti3 (s0, s0, GEN_INT (start)));
485
486  if (128 - width)
487    s0 = expand_shift (RSHIFT_EXPR, TImode, s0, 128 - width, s0, unsignedp);
488
489  emit_move_insn (dst, s0);
490}
491
492void
493spu_expand_insv (rtx ops[])
494{
495  HOST_WIDE_INT width = INTVAL (ops[1]);
496  HOST_WIDE_INT start = INTVAL (ops[2]);
497  HOST_WIDE_INT maskbits;
498  machine_mode dst_mode;
499  rtx dst = ops[0], src = ops[3];
500  int dst_size;
501  rtx mask;
502  rtx shift_reg;
503  int shift;
504
505
506  if (GET_CODE (ops[0]) == MEM)
507    dst = gen_reg_rtx (TImode);
508  else
509    dst = adjust_operand (dst, &start);
510  dst_mode = GET_MODE (dst);
511  dst_size = GET_MODE_BITSIZE (GET_MODE (dst));
512
513  if (CONSTANT_P (src))
514    {
515      machine_mode m =
516	(width <= 32 ? SImode : width <= 64 ? DImode : TImode);
517      src = force_reg (m, convert_to_mode (m, src, 0));
518    }
519  src = adjust_operand (src, 0);
520
521  mask = gen_reg_rtx (dst_mode);
522  shift_reg = gen_reg_rtx (dst_mode);
523  shift = dst_size - start - width;
524
525  /* It's not safe to use subreg here because the compiler assumes
526     that the SUBREG_REG is right justified in the SUBREG. */
527  convert_move (shift_reg, src, 1);
528
529  if (shift > 0)
530    {
531      switch (dst_mode)
532	{
533	case SImode:
534	  emit_insn (gen_ashlsi3 (shift_reg, shift_reg, GEN_INT (shift)));
535	  break;
536	case DImode:
537	  emit_insn (gen_ashldi3 (shift_reg, shift_reg, GEN_INT (shift)));
538	  break;
539	case TImode:
540	  emit_insn (gen_ashlti3 (shift_reg, shift_reg, GEN_INT (shift)));
541	  break;
542	default:
543	  abort ();
544	}
545    }
546  else if (shift < 0)
547    abort ();
548
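  /* Build a mask with WIDTH one bits starting at bit START, counting
     from the most significant bit.  Worked example (illustrative):
     dst_size == 32, start == 8, width == 8 gives
     maskbits = (-1 << 16) + (1 << 24) = 0x00ff0000, i.e. ones exactly
     in bit positions 8..15 from the MSB.  */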
549  switch (dst_size)
550    {
551    case 32:
552      maskbits = (-1ll << (32 - width - start));
553      if (start)
554	maskbits += (1ll << (32 - start));
555      emit_move_insn (mask, GEN_INT (maskbits));
556      break;
557    case 64:
558      maskbits = (-1ll << (64 - width - start));
559      if (start)
560	maskbits += (1ll << (64 - start));
561      emit_move_insn (mask, GEN_INT (maskbits));
562      break;
563    case 128:
564      {
565	unsigned char arr[16];
566	int i = start / 8;
567	memset (arr, 0, sizeof (arr));
568	arr[i] = 0xff >> (start & 7);
569	for (i++; i <= (start + width - 1) / 8; i++)
570	  arr[i] = 0xff;
571	arr[i - 1] &= 0xff << (7 - ((start + width - 1) & 7));
572	emit_move_insn (mask, array_to_constant (TImode, arr));
573      }
574      break;
575    default:
576      abort ();
577    }
578  if (GET_CODE (ops[0]) == MEM)
579    {
580      rtx low = gen_reg_rtx (SImode);
581      rtx rotl = gen_reg_rtx (SImode);
582      rtx mask0 = gen_reg_rtx (TImode);
583      rtx addr;
584      rtx addr0;
585      rtx addr1;
586      rtx mem;
587
588      addr = force_reg (Pmode, XEXP (ops[0], 0));
589      addr0 = gen_rtx_AND (Pmode, addr, GEN_INT (-16));
590      emit_insn (gen_andsi3 (low, addr, GEN_INT (15)));
591      emit_insn (gen_negsi2 (rotl, low));
592      emit_insn (gen_rotqby_ti (shift_reg, shift_reg, rotl));
593      emit_insn (gen_rotqmby_ti (mask0, mask, rotl));
594      mem = change_address (ops[0], TImode, addr0);
595      set_mem_alias_set (mem, 0);
596      emit_move_insn (dst, mem);
597      emit_insn (gen_selb (dst, dst, shift_reg, mask0));
598      if (start + width > MEM_ALIGN (ops[0]))
599	{
600	  rtx shl = gen_reg_rtx (SImode);
601	  rtx mask1 = gen_reg_rtx (TImode);
602	  rtx dst1 = gen_reg_rtx (TImode);
603	  rtx mem1;
604	  addr1 = plus_constant (Pmode, addr, 16);
605	  addr1 = gen_rtx_AND (Pmode, addr1, GEN_INT (-16));
606	  emit_insn (gen_subsi3 (shl, GEN_INT (16), low));
607	  emit_insn (gen_shlqby_ti (mask1, mask, shl));
608	  mem1 = change_address (ops[0], TImode, addr1);
609	  set_mem_alias_set (mem1, 0);
610	  emit_move_insn (dst1, mem1);
611	  emit_insn (gen_selb (dst1, dst1, shift_reg, mask1));
612	  emit_move_insn (mem1, dst1);
613	}
614      emit_move_insn (mem, dst);
615    }
616  else
617    emit_insn (gen_selb (dst, copy_rtx (dst), shift_reg, mask));
618}
619
620
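/* Expand a block move.  Only small copies with a constant length and
   16-byte alignment are handled here; return 1 on success, 0 to let
   the caller fall back to the generic block-move code.  ops[0] and
   ops[1] are the destination and source MEMs, ops[2] the byte count,
   ops[3] the alignment.  Illustrative example: a 20-byte copy with
   16-byte alignment becomes one V16QImode move for bytes 0..15 plus a
   load/selb/store sequence that merges the remaining 4 bytes into the
   destination quadword.  */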
621int
622spu_expand_block_move (rtx ops[])
623{
624  HOST_WIDE_INT bytes, align, offset;
625  rtx src, dst, sreg, dreg, target;
626  int i;
627  if (GET_CODE (ops[2]) != CONST_INT
628      || GET_CODE (ops[3]) != CONST_INT
629      || INTVAL (ops[2]) > (HOST_WIDE_INT) (MOVE_RATIO (optimize_insn_for_speed_p ()) * 8))
630    return 0;
631
632  bytes = INTVAL (ops[2]);
633  align = INTVAL (ops[3]);
634
635  if (bytes <= 0)
636    return 1;
637
638  dst = ops[0];
639  src = ops[1];
640
641  if (align == 16)
642    {
643      for (offset = 0; offset + 16 <= bytes; offset += 16)
644	{
645	  dst = adjust_address (ops[0], V16QImode, offset);
646	  src = adjust_address (ops[1], V16QImode, offset);
647	  emit_move_insn (dst, src);
648	}
649      if (offset < bytes)
650	{
651	  rtx mask;
652	  unsigned char arr[16] = { 0 };
653	  for (i = 0; i < bytes - offset; i++)
654	    arr[i] = 0xff;
655	  dst = adjust_address (ops[0], V16QImode, offset);
656	  src = adjust_address (ops[1], V16QImode, offset);
657	  mask = gen_reg_rtx (V16QImode);
658	  sreg = gen_reg_rtx (V16QImode);
659	  dreg = gen_reg_rtx (V16QImode);
660	  target = gen_reg_rtx (V16QImode);
661	  emit_move_insn (mask, array_to_constant (V16QImode, arr));
662	  emit_move_insn (dreg, dst);
663	  emit_move_insn (sreg, src);
664	  emit_insn (gen_selb (target, dreg, sreg, mask));
665	  emit_move_insn (dst, target);
666	}
667      return 1;
668    }
669  return 0;
670}
671
672enum spu_comp_code
673{ SPU_EQ, SPU_GT, SPU_GTU };
674
675int spu_comp_icode[12][3] = {
676 {CODE_FOR_ceq_qi, CODE_FOR_cgt_qi, CODE_FOR_clgt_qi},
677 {CODE_FOR_ceq_hi, CODE_FOR_cgt_hi, CODE_FOR_clgt_hi},
678 {CODE_FOR_ceq_si, CODE_FOR_cgt_si, CODE_FOR_clgt_si},
679 {CODE_FOR_ceq_di, CODE_FOR_cgt_di, CODE_FOR_clgt_di},
680 {CODE_FOR_ceq_ti, CODE_FOR_cgt_ti, CODE_FOR_clgt_ti},
681 {CODE_FOR_ceq_sf, CODE_FOR_cgt_sf, 0},
682 {CODE_FOR_ceq_df, CODE_FOR_cgt_df, 0},
683 {CODE_FOR_ceq_v16qi, CODE_FOR_cgt_v16qi, CODE_FOR_clgt_v16qi},
684 {CODE_FOR_ceq_v8hi,  CODE_FOR_cgt_v8hi,  CODE_FOR_clgt_v8hi},
685 {CODE_FOR_ceq_v4si,  CODE_FOR_cgt_v4si,  CODE_FOR_clgt_v4si},
686 {CODE_FOR_ceq_v4sf,  CODE_FOR_cgt_v4sf, 0},
687 {CODE_FOR_ceq_v2df,  CODE_FOR_cgt_v2df, 0},
688};
689
/* Emit the code for a compare with CODE, and then branch on or set the
   result.  GCC could figure this out too if we didn't provide all
   variations of compares, but since GCC always wants to use WORD_MODE,
   we can generate better code in most cases if we do it ourselves.  */
695void
696spu_emit_branch_or_set (int is_set, rtx cmp, rtx operands[])
697{
698  int reverse_compare = 0;
699  int reverse_test = 0;
700  rtx compare_result, eq_result;
701  rtx comp_rtx, eq_rtx;
702  machine_mode comp_mode;
703  machine_mode op_mode;
704  enum spu_comp_code scode, eq_code;
705  enum insn_code ior_code;
706  enum rtx_code code = GET_CODE (cmp);
707  rtx op0 = XEXP (cmp, 0);
708  rtx op1 = XEXP (cmp, 1);
709  int index;
710  int eq_test = 0;
711
712  /* When op1 is a CONST_INT change (X >= C) to (X > C-1),
713     and so on, to keep the constant in operand 1. */
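  /* For example, (GE:SI x 5) becomes (GT:SI x 4) and (LTU:SI x 5)
     becomes (LEU:SI x 4), provided the new constant still fits in the
     mode of x.  */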
714  if (GET_CODE (op1) == CONST_INT)
715    {
716      HOST_WIDE_INT val = INTVAL (op1) - 1;
717      if (trunc_int_for_mode (val, GET_MODE (op0)) == val)
718	switch (code)
719	  {
720	  case GE:
721	    op1 = GEN_INT (val);
722	    code = GT;
723	    break;
724	  case LT:
725	    op1 = GEN_INT (val);
726	    code = LE;
727	    break;
728	  case GEU:
729	    op1 = GEN_INT (val);
730	    code = GTU;
731	    break;
732	  case LTU:
733	    op1 = GEN_INT (val);
734	    code = LEU;
735	    break;
736	  default:
737	    break;
738	  }
739    }
740
741  /* However, if we generate an integer result, performing a reverse test
742     would require an extra negation, so avoid that where possible.  */
743  if (GET_CODE (op1) == CONST_INT && is_set == 1)
744    {
745      HOST_WIDE_INT val = INTVAL (op1) + 1;
746      if (trunc_int_for_mode (val, GET_MODE (op0)) == val)
747	switch (code)
748	  {
749	  case LE:
750	    op1 = GEN_INT (val);
751	    code = LT;
752	    break;
753	  case LEU:
754	    op1 = GEN_INT (val);
755	    code = LTU;
756	    break;
757	  default:
758	    break;
759	  }
760    }
761
762  comp_mode = SImode;
763  op_mode = GET_MODE (op0);
764
765  switch (code)
766    {
767    case GE:
768      scode = SPU_GT;
769      if (HONOR_NANS (op_mode))
770	{
771	  reverse_compare = 0;
772	  reverse_test = 0;
773	  eq_test = 1;
774	  eq_code = SPU_EQ;
775	}
776      else
777	{
778	  reverse_compare = 1;
779	  reverse_test = 1;
780	}
781      break;
782    case LE:
783      scode = SPU_GT;
784      if (HONOR_NANS (op_mode))
785	{
786	  reverse_compare = 1;
787	  reverse_test = 0;
788	  eq_test = 1;
789	  eq_code = SPU_EQ;
790	}
791      else
792	{
793	  reverse_compare = 0;
794	  reverse_test = 1;
795	}
796      break;
797    case LT:
798      reverse_compare = 1;
799      reverse_test = 0;
800      scode = SPU_GT;
801      break;
802    case GEU:
803      reverse_compare = 1;
804      reverse_test = 1;
805      scode = SPU_GTU;
806      break;
807    case LEU:
808      reverse_compare = 0;
809      reverse_test = 1;
810      scode = SPU_GTU;
811      break;
812    case LTU:
813      reverse_compare = 1;
814      reverse_test = 0;
815      scode = SPU_GTU;
816      break;
817    case NE:
818      reverse_compare = 0;
819      reverse_test = 1;
820      scode = SPU_EQ;
821      break;
822
823    case EQ:
824      scode = SPU_EQ;
825      break;
826    case GT:
827      scode = SPU_GT;
828      break;
829    case GTU:
830      scode = SPU_GTU;
831      break;
832    default:
833      scode = SPU_EQ;
834      break;
835    }
836
837  switch (op_mode)
838    {
839    case QImode:
840      index = 0;
841      comp_mode = QImode;
842      break;
843    case HImode:
844      index = 1;
845      comp_mode = HImode;
846      break;
847    case SImode:
848      index = 2;
849      break;
850    case DImode:
851      index = 3;
852      break;
853    case TImode:
854      index = 4;
855      break;
856    case SFmode:
857      index = 5;
858      break;
859    case DFmode:
860      index = 6;
861      break;
862    case V16QImode:
863      index = 7;
864      comp_mode = op_mode;
865      break;
866    case V8HImode:
867      index = 8;
868      comp_mode = op_mode;
869      break;
870    case V4SImode:
871      index = 9;
872      comp_mode = op_mode;
873      break;
874    case V4SFmode:
875      index = 10;
876      comp_mode = V4SImode;
877      break;
878    case V2DFmode:
879      index = 11;
880      comp_mode = V2DImode;
881      break;
882    case V2DImode:
883    default:
884      abort ();
885    }
886
887  if (GET_MODE (op1) == DFmode
888      && (scode != SPU_GT && scode != SPU_EQ))
889    abort ();
890
891  if (is_set == 0 && op1 == const0_rtx
892      && (GET_MODE (op0) == SImode
893	  || GET_MODE (op0) == HImode
894	  || GET_MODE (op0) == QImode) && scode == SPU_EQ)
895    {
896      /* Don't need to set a register with the result when we are
897         comparing against zero and branching. */
898      reverse_test = !reverse_test;
899      compare_result = op0;
900    }
901  else
902    {
903      compare_result = gen_reg_rtx (comp_mode);
904
905      if (reverse_compare)
906	{
907	  rtx t = op1;
908	  op1 = op0;
909	  op0 = t;
910	}
911
912      if (spu_comp_icode[index][scode] == 0)
913	abort ();
914
915      if (!(*insn_data[spu_comp_icode[index][scode]].operand[1].predicate)
916	  (op0, op_mode))
917	op0 = force_reg (op_mode, op0);
918      if (!(*insn_data[spu_comp_icode[index][scode]].operand[2].predicate)
919	  (op1, op_mode))
920	op1 = force_reg (op_mode, op1);
921      comp_rtx = GEN_FCN (spu_comp_icode[index][scode]) (compare_result,
922							 op0, op1);
923      if (comp_rtx == 0)
924	abort ();
925      emit_insn (comp_rtx);
926
927      if (eq_test)
928        {
929          eq_result = gen_reg_rtx (comp_mode);
930          eq_rtx = GEN_FCN (spu_comp_icode[index][eq_code]) (eq_result,
931							     op0, op1);
932          if (eq_rtx == 0)
933	    abort ();
934          emit_insn (eq_rtx);
935          ior_code = optab_handler (ior_optab, comp_mode);
936          gcc_assert (ior_code != CODE_FOR_nothing);
937          emit_insn (GEN_FCN (ior_code)
938		     (compare_result, compare_result, eq_result));
939        }
940    }
941
942  if (is_set == 0)
943    {
944      rtx bcomp;
945      rtx loc_ref;
946
947      /* We don't have branch on QI compare insns, so we convert the
948         QI compare result to a HI result. */
949      if (comp_mode == QImode)
950	{
951	  rtx old_res = compare_result;
952	  compare_result = gen_reg_rtx (HImode);
953	  comp_mode = HImode;
954	  emit_insn (gen_extendqihi2 (compare_result, old_res));
955	}
956
957      if (reverse_test)
958	bcomp = gen_rtx_EQ (comp_mode, compare_result, const0_rtx);
959      else
960	bcomp = gen_rtx_NE (comp_mode, compare_result, const0_rtx);
961
962      loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[3]);
963      emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
964				   gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
965							 loc_ref, pc_rtx)));
966    }
967  else if (is_set == 2)
968    {
969      rtx target = operands[0];
970      int compare_size = GET_MODE_BITSIZE (comp_mode);
971      int target_size = GET_MODE_BITSIZE (GET_MODE (target));
972      machine_mode mode = mode_for_size (target_size, MODE_INT, 0);
973      rtx select_mask;
974      rtx op_t = operands[2];
975      rtx op_f = operands[3];
976
977      /* The result of the comparison can be SI, HI or QI mode.  Create a
978         mask based on that result. */
979      if (target_size > compare_size)
980	{
981	  select_mask = gen_reg_rtx (mode);
982	  emit_insn (gen_extend_compare (select_mask, compare_result));
983	}
984      else if (target_size < compare_size)
985	select_mask =
986	  gen_rtx_SUBREG (mode, compare_result,
987			  (compare_size - target_size) / BITS_PER_UNIT);
988      else if (comp_mode != mode)
989	select_mask = gen_rtx_SUBREG (mode, compare_result, 0);
990      else
991	select_mask = compare_result;
992
993      if (GET_MODE (target) != GET_MODE (op_t)
994	  || GET_MODE (target) != GET_MODE (op_f))
995	abort ();
996
997      if (reverse_test)
998	emit_insn (gen_selb (target, op_t, op_f, select_mask));
999      else
1000	emit_insn (gen_selb (target, op_f, op_t, select_mask));
1001    }
1002  else
1003    {
1004      rtx target = operands[0];
1005      if (reverse_test)
1006	emit_insn (gen_rtx_SET (VOIDmode, compare_result,
1007				gen_rtx_NOT (comp_mode, compare_result)));
1008      if (GET_MODE (target) == SImode && GET_MODE (compare_result) == HImode)
1009	emit_insn (gen_extendhisi2 (target, compare_result));
1010      else if (GET_MODE (target) == SImode
1011	       && GET_MODE (compare_result) == QImode)
1012	emit_insn (gen_extend_compare (target, compare_result));
1013      else
1014	emit_move_insn (target, compare_result);
1015    }
1016}
1017
1018HOST_WIDE_INT
1019const_double_to_hwint (rtx x)
1020{
1021  HOST_WIDE_INT val;
1022  REAL_VALUE_TYPE rv;
1023  if (GET_MODE (x) == SFmode)
1024    {
1025      REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
1026      REAL_VALUE_TO_TARGET_SINGLE (rv, val);
1027    }
1028  else if (GET_MODE (x) == DFmode)
1029    {
1030      long l[2];
1031      REAL_VALUE_FROM_CONST_DOUBLE (rv, x);
1032      REAL_VALUE_TO_TARGET_DOUBLE (rv, l);
1033      val = l[0];
1034      val = (val << 32) | (l[1] & 0xffffffff);
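      /* l[0] holds the most significant word of the target image; for
         example, the DFmode constant 1.0 arrives here as
         0x3ff0000000000000.  */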
1035    }
1036  else
1037    abort ();
1038  return val;
1039}
1040
1041rtx
1042hwint_to_const_double (machine_mode mode, HOST_WIDE_INT v)
1043{
1044  long tv[2];
1045  REAL_VALUE_TYPE rv;
1046  gcc_assert (mode == SFmode || mode == DFmode);
1047
1048  if (mode == SFmode)
1049    tv[0] = (v << 32) >> 32;
1050  else if (mode == DFmode)
1051    {
1052      tv[1] = (v << 32) >> 32;
1053      tv[0] = v >> 32;
1054    }
1055  real_from_target (&rv, tv, mode);
1056  return CONST_DOUBLE_FROM_REAL_VALUE (rv, mode);
1057}
1058
1059void
1060print_operand_address (FILE * file, register rtx addr)
1061{
1062  rtx reg;
1063  rtx offset;
1064
1065  if (GET_CODE (addr) == AND
1066      && GET_CODE (XEXP (addr, 1)) == CONST_INT
1067      && INTVAL (XEXP (addr, 1)) == -16)
1068    addr = XEXP (addr, 0);
1069
1070  switch (GET_CODE (addr))
1071    {
1072    case REG:
1073      fprintf (file, "0(%s)", reg_names[REGNO (addr)]);
1074      break;
1075
1076    case PLUS:
1077      reg = XEXP (addr, 0);
1078      offset = XEXP (addr, 1);
1079      if (GET_CODE (offset) == REG)
1080	{
1081	  fprintf (file, "%s,%s", reg_names[REGNO (reg)],
1082		   reg_names[REGNO (offset)]);
1083	}
1084      else if (GET_CODE (offset) == CONST_INT)
1085	{
1086	  fprintf (file, HOST_WIDE_INT_PRINT_DEC "(%s)",
1087		   INTVAL (offset), reg_names[REGNO (reg)]);
1088	}
1089      else
1090	abort ();
1091      break;
1092
1093    case CONST:
1094    case LABEL_REF:
1095    case SYMBOL_REF:
1096    case CONST_INT:
1097      output_addr_const (file, addr);
1098      break;
1099
1100    default:
1101      debug_rtx (addr);
1102      abort ();
1103    }
1104}
1105
1106void
1107print_operand (FILE * file, rtx x, int code)
1108{
1109  machine_mode mode = GET_MODE (x);
1110  HOST_WIDE_INT val;
1111  unsigned char arr[16];
1112  int xcode = GET_CODE (x);
1113  int i, info;
1114  if (GET_MODE (x) == VOIDmode)
1115    switch (code)
1116      {
1117      case 'L':			/* 128 bits, signed */
1118      case 'm':			/* 128 bits, signed */
1119      case 'T':			/* 128 bits, signed */
1120      case 't':			/* 128 bits, signed */
1121	mode = TImode;
1122	break;
1123      case 'K':			/* 64 bits, signed */
1124      case 'k':			/* 64 bits, signed */
1125      case 'D':			/* 64 bits, signed */
1126      case 'd':			/* 64 bits, signed */
1127	mode = DImode;
1128	break;
1129      case 'J':			/* 32 bits, signed */
1130      case 'j':			/* 32 bits, signed */
1131      case 's':			/* 32 bits, signed */
1132      case 'S':			/* 32 bits, signed */
1133	mode = SImode;
1134	break;
1135      }
1136  switch (code)
1137    {
1138
1139    case 'j':			/* 32 bits, signed */
1140    case 'k':			/* 64 bits, signed */
1141    case 'm':			/* 128 bits, signed */
1142      if (xcode == CONST_INT
1143	  || xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
1144	{
1145	  gcc_assert (logical_immediate_p (x, mode));
1146	  constant_to_array (mode, x, arr);
1147	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1148	  val = trunc_int_for_mode (val, SImode);
1149	  switch (which_logical_immediate (val))
1150	  {
1151	  case SPU_ORI:
1152	    break;
1153	  case SPU_ORHI:
1154	    fprintf (file, "h");
1155	    break;
1156	  case SPU_ORBI:
1157	    fprintf (file, "b");
1158	    break;
1159	  default:
1160	    gcc_unreachable();
1161	  }
1162	}
1163      else
1164	gcc_unreachable();
1165      return;
1166
1167    case 'J':			/* 32 bits, signed */
1168    case 'K':			/* 64 bits, signed */
1169    case 'L':			/* 128 bits, signed */
1170      if (xcode == CONST_INT
1171	  || xcode == CONST_DOUBLE || xcode == CONST_VECTOR)
1172	{
1173	  gcc_assert (logical_immediate_p (x, mode)
1174		      || iohl_immediate_p (x, mode));
1175	  constant_to_array (mode, x, arr);
1176	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1177	  val = trunc_int_for_mode (val, SImode);
1178	  switch (which_logical_immediate (val))
1179	  {
1180	  case SPU_ORI:
1181	  case SPU_IOHL:
1182	    break;
1183	  case SPU_ORHI:
1184	    val = trunc_int_for_mode (val, HImode);
1185	    break;
1186	  case SPU_ORBI:
1187	    val = trunc_int_for_mode (val, QImode);
1188	    break;
1189	  default:
1190	    gcc_unreachable();
1191	  }
1192	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
1193	}
1194      else
1195	gcc_unreachable();
1196      return;
1197
1198    case 't':			/* 128 bits, signed */
1199    case 'd':			/* 64 bits, signed */
1200    case 's':			/* 32 bits, signed */
1201      if (CONSTANT_P (x))
1202	{
1203	  enum immediate_class c = classify_immediate (x, mode);
1204	  switch (c)
1205	    {
1206	    case IC_IL1:
1207	      constant_to_array (mode, x, arr);
1208	      val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1209	      val = trunc_int_for_mode (val, SImode);
1210	      switch (which_immediate_load (val))
1211		{
1212		case SPU_IL:
1213		  break;
1214		case SPU_ILA:
1215		  fprintf (file, "a");
1216		  break;
1217		case SPU_ILH:
1218		  fprintf (file, "h");
1219		  break;
1220		case SPU_ILHU:
1221		  fprintf (file, "hu");
1222		  break;
1223		default:
1224		  gcc_unreachable ();
1225		}
1226	      break;
1227	    case IC_CPAT:
1228	      constant_to_array (mode, x, arr);
1229	      cpat_info (arr, GET_MODE_SIZE (mode), &info, 0);
1230	      if (info == 1)
1231		fprintf (file, "b");
1232	      else if (info == 2)
1233		fprintf (file, "h");
1234	      else if (info == 4)
1235		fprintf (file, "w");
1236	      else if (info == 8)
1237		fprintf (file, "d");
1238	      break;
1239	    case IC_IL1s:
1240	      if (xcode == CONST_VECTOR)
1241		{
1242		  x = CONST_VECTOR_ELT (x, 0);
1243		  xcode = GET_CODE (x);
1244		}
1245	      if (xcode == SYMBOL_REF || xcode == LABEL_REF || xcode == CONST)
1246		fprintf (file, "a");
1247	      else if (xcode == HIGH)
1248		fprintf (file, "hu");
1249	      break;
1250	    case IC_FSMBI:
1251	    case IC_FSMBI2:
1252	    case IC_IL2:
1253	    case IC_IL2s:
1254	    case IC_POOL:
1255	      abort ();
1256	    }
1257	}
1258      else
1259	gcc_unreachable ();
1260      return;
1261
1262    case 'T':			/* 128 bits, signed */
1263    case 'D':			/* 64 bits, signed */
1264    case 'S':			/* 32 bits, signed */
1265      if (CONSTANT_P (x))
1266	{
1267	  enum immediate_class c = classify_immediate (x, mode);
1268	  switch (c)
1269	    {
1270	    case IC_IL1:
1271	      constant_to_array (mode, x, arr);
1272	      val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
1273	      val = trunc_int_for_mode (val, SImode);
1274	      switch (which_immediate_load (val))
1275		{
1276		case SPU_IL:
1277		case SPU_ILA:
1278		  break;
1279		case SPU_ILH:
1280		case SPU_ILHU:
1281		  val = trunc_int_for_mode (((arr[0] << 8) | arr[1]), HImode);
1282		  break;
1283		default:
1284		  gcc_unreachable ();
1285		}
1286	      fprintf (file, HOST_WIDE_INT_PRINT_DEC, val);
1287	      break;
1288	    case IC_FSMBI:
1289	      constant_to_array (mode, x, arr);
1290	      val = 0;
1291	      for (i = 0; i < 16; i++)
1292		{
1293		  val <<= 1;
1294		  val |= arr[i] & 1;
1295		}
1296	      print_operand (file, GEN_INT (val), 0);
1297	      break;
1298	    case IC_CPAT:
1299	      constant_to_array (mode, x, arr);
1300	      cpat_info (arr, GET_MODE_SIZE (mode), 0, &info);
1301	      fprintf (file, HOST_WIDE_INT_PRINT_DEC, (HOST_WIDE_INT)info);
1302	      break;
1303	    case IC_IL1s:
1304	      if (xcode == HIGH)
1305		x = XEXP (x, 0);
1306	      if (GET_CODE (x) == CONST_VECTOR)
1307		x = CONST_VECTOR_ELT (x, 0);
1308	      output_addr_const (file, x);
1309	      if (xcode == HIGH)
1310		fprintf (file, "@h");
1311	      break;
1312	    case IC_IL2:
1313	    case IC_IL2s:
1314	    case IC_FSMBI2:
1315	    case IC_POOL:
1316	      abort ();
1317	    }
1318	}
1319      else
1320	gcc_unreachable ();
1321      return;
1322
1323    case 'C':
1324      if (xcode == CONST_INT)
1325	{
	  /* Only the 4 least significant bits are relevant for generating
	     control word instructions. */
1328	  fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (x) & 15);
1329	  return;
1330	}
1331      break;
1332
1333    case 'M':			/* print code for c*d */
1334      if (GET_CODE (x) == CONST_INT)
1335	switch (INTVAL (x))
1336	  {
1337	  case 1:
1338	    fprintf (file, "b");
1339	    break;
1340	  case 2:
1341	    fprintf (file, "h");
1342	    break;
1343	  case 4:
1344	    fprintf (file, "w");
1345	    break;
1346	  case 8:
1347	    fprintf (file, "d");
1348	    break;
1349	  default:
1350	    gcc_unreachable();
1351	  }
1352      else
1353	gcc_unreachable();
1354      return;
1355
1356    case 'N':			/* Negate the operand */
1357      if (xcode == CONST_INT)
1358	fprintf (file, HOST_WIDE_INT_PRINT_DEC, -INTVAL (x));
1359      else if (xcode == CONST_VECTOR)
1360	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1361		 -INTVAL (CONST_VECTOR_ELT (x, 0)));
1362      return;
1363
1364    case 'I':			/* enable/disable interrupts */
1365      if (xcode == CONST_INT)
1366	fprintf (file, "%s",  INTVAL (x) == 0 ? "d" : "e");
1367      return;
1368
1369    case 'b':			/* branch modifiers */
1370      if (xcode == REG)
1371	fprintf (file, "%s", GET_MODE (x) == HImode ? "h" : "");
1372      else if (COMPARISON_P (x))
1373	fprintf (file, "%s", xcode == NE ? "n" : "");
1374      return;
1375
1376    case 'i':			/* indirect call */
1377      if (xcode == MEM)
1378	{
1379	  if (GET_CODE (XEXP (x, 0)) == REG)
1380	    /* Used in indirect function calls. */
1381	    fprintf (file, "%s", reg_names[REGNO (XEXP (x, 0))]);
1382	  else
1383	    output_address (XEXP (x, 0));
1384	}
1385      return;
1386
1387    case 'p':			/* load/store */
1388      if (xcode == MEM)
1389	{
1390	  x = XEXP (x, 0);
1391	  xcode = GET_CODE (x);
1392	}
1393      if (xcode == AND)
1394	{
1395	  x = XEXP (x, 0);
1396	  xcode = GET_CODE (x);
1397	}
1398      if (xcode == REG)
1399	fprintf (file, "d");
1400      else if (xcode == CONST_INT)
1401	fprintf (file, "a");
1402      else if (xcode == CONST || xcode == SYMBOL_REF || xcode == LABEL_REF)
1403	fprintf (file, "r");
1404      else if (xcode == PLUS || xcode == LO_SUM)
1405	{
1406	  if (GET_CODE (XEXP (x, 1)) == REG)
1407	    fprintf (file, "x");
1408	  else
1409	    fprintf (file, "d");
1410	}
1411      return;
1412
1413    case 'e':
1414      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1415      val &= 0x7;
1416      output_addr_const (file, GEN_INT (val));
1417      return;
1418
1419    case 'f':
1420      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1421      val &= 0x1f;
1422      output_addr_const (file, GEN_INT (val));
1423      return;
1424
1425    case 'g':
1426      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1427      val &= 0x3f;
1428      output_addr_const (file, GEN_INT (val));
1429      return;
1430
1431    case 'h':
1432      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1433      val = (val >> 3) & 0x1f;
1434      output_addr_const (file, GEN_INT (val));
1435      return;
1436
1437    case 'E':
1438      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1439      val = -val;
1440      val &= 0x7;
1441      output_addr_const (file, GEN_INT (val));
1442      return;
1443
1444    case 'F':
1445      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1446      val = -val;
1447      val &= 0x1f;
1448      output_addr_const (file, GEN_INT (val));
1449      return;
1450
1451    case 'G':
1452      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1453      val = -val;
1454      val &= 0x3f;
1455      output_addr_const (file, GEN_INT (val));
1456      return;
1457
1458    case 'H':
1459      val = xcode == CONST_INT ? INTVAL (x) : INTVAL (CONST_VECTOR_ELT (x, 0));
1460      val = -(val & -8ll);
1461      val = (val >> 3) & 0x1f;
1462      output_addr_const (file, GEN_INT (val));
1463      return;
1464
1465    case 'v':
1466    case 'w':
1467      constant_to_array (mode, x, arr);
1468      val = (((arr[0] << 1) + (arr[1] >> 7)) & 0xff) - 127;
1469      output_addr_const (file, GEN_INT (code == 'w' ? -val : val));
1470      return;
1471
1472    case 0:
1473      if (xcode == REG)
1474	fprintf (file, "%s", reg_names[REGNO (x)]);
1475      else if (xcode == MEM)
1476	output_address (XEXP (x, 0));
1477      else if (xcode == CONST_VECTOR)
1478	print_operand (file, CONST_VECTOR_ELT (x, 0), 0);
1479      else
1480	output_addr_const (file, x);
1481      return;
1482
1483      /* unused letters
1484	              o qr  u   yz
1485	AB            OPQR  UVWXYZ */
1486    default:
1487      output_operand_lossage ("invalid %%xn code");
1488    }
1489  gcc_unreachable ();
1490}
1491
1492/* For PIC mode we've reserved PIC_OFFSET_TABLE_REGNUM, which is a
1493   caller saved register.  For leaf functions it is more efficient to
1494   use a volatile register because we won't need to save and restore the
1495   pic register.  This routine is only valid after register allocation
1496   is completed, so we can pick an unused register.  */
1497static rtx
1498get_pic_reg (void)
1499{
1500  if (!reload_completed && !reload_in_progress)
1501    abort ();
1502
1503  /* If we've already made the decision, we need to keep with it.  Once we've
1504     decided to use LAST_ARG_REGNUM, future calls to df_regs_ever_live_p may
1505     return true since the register is now live; this should not cause us to
1506     "switch back" to using pic_offset_table_rtx.  */
1507  if (!cfun->machine->pic_reg)
1508    {
1509      if (crtl->is_leaf && !df_regs_ever_live_p (LAST_ARG_REGNUM))
1510	cfun->machine->pic_reg = gen_rtx_REG (SImode, LAST_ARG_REGNUM);
1511      else
1512	cfun->machine->pic_reg = pic_offset_table_rtx;
1513    }
1514
1515  return cfun->machine->pic_reg;
1516}
1517
1518/* Split constant addresses to handle cases that are too large.
1519   Add in the pic register when in PIC mode.
1520   Split immediates that require more than 1 instruction. */
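/* An illustrative example of the IC_IL2 case below: the SImode constant
   0x12345678 cannot be loaded with a single il/ilh/ilhu/ila, so it is
   split into "ilhu $reg,0x1234" followed by "iohl $reg,0x5678".  */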
1521int
1522spu_split_immediate (rtx * ops)
1523{
1524  machine_mode mode = GET_MODE (ops[0]);
1525  enum immediate_class c = classify_immediate (ops[1], mode);
1526
1527  switch (c)
1528    {
1529    case IC_IL2:
1530      {
1531	unsigned char arrhi[16];
1532	unsigned char arrlo[16];
1533	rtx to, temp, hi, lo;
1534	int i;
1535	machine_mode imode = mode;
1536	/* We need to do reals as ints because the constant used in the
1537	   IOR might not be a legitimate real constant. */
1538	imode = int_mode_for_mode (mode);
1539	constant_to_array (mode, ops[1], arrhi);
1540	if (imode != mode)
1541	  to = simplify_gen_subreg (imode, ops[0], mode, 0);
1542	else
1543	  to = ops[0];
1544	temp = !can_create_pseudo_p () ? to : gen_reg_rtx (imode);
1545	for (i = 0; i < 16; i += 4)
1546	  {
1547	    arrlo[i + 2] = arrhi[i + 2];
1548	    arrlo[i + 3] = arrhi[i + 3];
1549	    arrlo[i + 0] = arrlo[i + 1] = 0;
1550	    arrhi[i + 2] = arrhi[i + 3] = 0;
1551	  }
1552	hi = array_to_constant (imode, arrhi);
1553	lo = array_to_constant (imode, arrlo);
1554	emit_move_insn (temp, hi);
1555	emit_insn (gen_rtx_SET
1556		   (VOIDmode, to, gen_rtx_IOR (imode, temp, lo)));
1557	return 1;
1558      }
1559    case IC_FSMBI2:
1560      {
1561	unsigned char arr_fsmbi[16];
1562	unsigned char arr_andbi[16];
1563	rtx to, reg_fsmbi, reg_and;
1564	int i;
1565	machine_mode imode = mode;
1566	/* We need to do reals as ints because the constant used in the
1567	 * AND might not be a legitimate real constant. */
1568	imode = int_mode_for_mode (mode);
1569	constant_to_array (mode, ops[1], arr_fsmbi);
1570	if (imode != mode)
1571	  to = simplify_gen_subreg(imode, ops[0], GET_MODE (ops[0]), 0);
1572	else
1573	  to = ops[0];
1574	for (i = 0; i < 16; i++)
1575	  if (arr_fsmbi[i] != 0)
1576	    {
1577	      arr_andbi[0] = arr_fsmbi[i];
1578	      arr_fsmbi[i] = 0xff;
1579	    }
1580	for (i = 1; i < 16; i++)
1581	  arr_andbi[i] = arr_andbi[0];
1582	reg_fsmbi = array_to_constant (imode, arr_fsmbi);
1583	reg_and = array_to_constant (imode, arr_andbi);
1584	emit_move_insn (to, reg_fsmbi);
1585	emit_insn (gen_rtx_SET
1586		   (VOIDmode, to, gen_rtx_AND (imode, to, reg_and)));
1587	return 1;
1588      }
1589    case IC_POOL:
1590      if (reload_in_progress || reload_completed)
1591	{
1592	  rtx mem = force_const_mem (mode, ops[1]);
1593	  if (TARGET_LARGE_MEM)
1594	    {
1595	      rtx addr = gen_rtx_REG (Pmode, REGNO (ops[0]));
1596	      emit_move_insn (addr, XEXP (mem, 0));
1597	      mem = replace_equiv_address (mem, addr);
1598	    }
1599	  emit_move_insn (ops[0], mem);
1600	  return 1;
1601	}
1602      break;
1603    case IC_IL1s:
1604    case IC_IL2s:
1605      if (reload_completed && GET_CODE (ops[1]) != HIGH)
1606	{
1607	  if (c == IC_IL2s)
1608	    {
1609	      emit_move_insn (ops[0], gen_rtx_HIGH (mode, ops[1]));
1610	      emit_move_insn (ops[0], gen_rtx_LO_SUM (mode, ops[0], ops[1]));
1611	    }
1612	  else if (flag_pic)
1613	    emit_insn (gen_pic (ops[0], ops[1]));
1614	  if (flag_pic)
1615	    {
1616	      rtx pic_reg = get_pic_reg ();
1617	      emit_insn (gen_addsi3 (ops[0], ops[0], pic_reg));
1618	    }
1619	  return flag_pic || c == IC_IL2s;
1620	}
1621      break;
1622    case IC_IL1:
1623    case IC_FSMBI:
1624    case IC_CPAT:
1625      break;
1626    }
1627  return 0;
1628}
1629
/* SAVING is TRUE when we are generating the actual load and store
   instructions for REGNO.  When determining the size of the stack
   needed for saving registers we must allocate enough space for the
   worst case, because we don't always have the information early
   enough to avoid allocating it.  But we can at least eliminate the
   actual loads and stores during the prologue/epilogue.  */
1636static int
1637need_to_save_reg (int regno, int saving)
1638{
1639  if (df_regs_ever_live_p (regno) && !call_used_regs[regno])
1640    return 1;
1641  if (flag_pic
1642      && regno == PIC_OFFSET_TABLE_REGNUM
1643      && (!saving || cfun->machine->pic_reg == pic_offset_table_rtx))
1644    return 1;
1645  return 0;
1646}
1647
1648/* This function is only correct starting with local register
1649   allocation */
1650int
1651spu_saved_regs_size (void)
1652{
1653  int reg_save_size = 0;
1654  int regno;
1655
1656  for (regno = FIRST_PSEUDO_REGISTER - 1; regno >= 0; --regno)
1657    if (need_to_save_reg (regno, 0))
1658      reg_save_size += 0x10;
1659  return reg_save_size;
1660}
1661
1662static rtx_insn *
1663frame_emit_store (int regno, rtx addr, HOST_WIDE_INT offset)
1664{
1665  rtx reg = gen_rtx_REG (V4SImode, regno);
1666  rtx mem =
1667    gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
1668  return emit_insn (gen_movv4si (mem, reg));
1669}
1670
1671static rtx_insn *
1672frame_emit_load (int regno, rtx addr, HOST_WIDE_INT offset)
1673{
1674  rtx reg = gen_rtx_REG (V4SImode, regno);
1675  rtx mem =
1676    gen_frame_mem (V4SImode, gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)));
1677  return emit_insn (gen_movv4si (reg, mem));
1678}
1679
1680/* This happens after reload, so we need to expand it.  */
1681static rtx_insn *
1682frame_emit_add_imm (rtx dst, rtx src, HOST_WIDE_INT imm, rtx scratch)
1683{
1684  rtx_insn *insn;
1685  if (satisfies_constraint_K (GEN_INT (imm)))
1686    {
1687      insn = emit_insn (gen_addsi3 (dst, src, GEN_INT (imm)));
1688    }
1689  else
1690    {
1691      emit_insn (gen_movsi (scratch, gen_int_mode (imm, SImode)));
1692      insn = emit_insn (gen_addsi3 (dst, src, scratch));
1693      if (REGNO (src) == REGNO (scratch))
1694	abort ();
1695    }
1696  return insn;
1697}
1698
1699/* Return nonzero if this function is known to have a null epilogue.  */
1700
1701int
1702direct_return (void)
1703{
1704  if (reload_completed)
1705    {
1706      if (cfun->static_chain_decl == 0
1707	  && (spu_saved_regs_size ()
1708	      + get_frame_size ()
1709	      + crtl->outgoing_args_size
1710	      + crtl->args.pretend_args_size == 0)
1711	  && crtl->is_leaf)
1712	return 1;
1713    }
1714  return 0;
1715}
1716
1717/*
1718   The stack frame looks like this:
1719         +-------------+
1720         |  incoming   |
1721         |    args     |
1722   AP -> +-------------+
1723         | $lr save    |
1724         +-------------+
1725 prev SP | back chain  |
1726         +-------------+
1727         |  var args   |
1728         |  reg save   | crtl->args.pretend_args_size bytes
1729         +-------------+
1730         |    ...      |
1731         | saved regs  | spu_saved_regs_size() bytes
1732   FP -> +-------------+
1733         |    ...      |
1734         |   vars      | get_frame_size()  bytes
1735  HFP -> +-------------+
1736         |    ...      |
1737         |  outgoing   |
1738         |    args     | crtl->outgoing_args_size bytes
1739         +-------------+
1740         | $lr of next |
1741         |   frame     |
1742         +-------------+
1743         | back chain  |
1744   SP -> +-------------+
1745
1746*/
1747void
1748spu_expand_prologue (void)
1749{
1750  HOST_WIDE_INT size = get_frame_size (), offset, regno;
1751  HOST_WIDE_INT total_size;
1752  HOST_WIDE_INT saved_regs_size;
1753  rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
1754  rtx scratch_reg_0, scratch_reg_1;
1755  rtx_insn *insn;
1756  rtx real;
1757
1758  if (flag_pic && optimize == 0 && !cfun->machine->pic_reg)
1759    cfun->machine->pic_reg = pic_offset_table_rtx;
1760
1761  if (spu_naked_function_p (current_function_decl))
1762    return;
1763
1764  scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
1765  scratch_reg_1 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 2);
1766
1767  saved_regs_size = spu_saved_regs_size ();
1768  total_size = size + saved_regs_size
1769    + crtl->outgoing_args_size
1770    + crtl->args.pretend_args_size;
1771
1772  if (!crtl->is_leaf
1773      || cfun->calls_alloca || total_size > 0)
1774    total_size += STACK_POINTER_OFFSET;
1775
1776  /* Save this first because code after this might use the link
1777     register as a scratch register. */
1778  if (!crtl->is_leaf)
1779    {
1780      insn = frame_emit_store (LINK_REGISTER_REGNUM, sp_reg, 16);
1781      RTX_FRAME_RELATED_P (insn) = 1;
1782    }
1783
1784  if (total_size > 0)
1785    {
1786      offset = -crtl->args.pretend_args_size;
1787      for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
1788	if (need_to_save_reg (regno, 1))
1789	  {
1790	    offset -= 16;
1791	    insn = frame_emit_store (regno, sp_reg, offset);
1792	    RTX_FRAME_RELATED_P (insn) = 1;
1793	  }
1794    }
1795
1796  if (flag_pic && cfun->machine->pic_reg)
1797    {
1798      rtx pic_reg = cfun->machine->pic_reg;
1799      insn = emit_insn (gen_load_pic_offset (pic_reg, scratch_reg_0));
1800      insn = emit_insn (gen_subsi3 (pic_reg, pic_reg, scratch_reg_0));
1801    }
1802
1803  if (total_size > 0)
1804    {
1805      if (flag_stack_check)
1806	{
1807	  /* We compare against total_size-1 because
1808	     ($sp >= total_size) <=> ($sp > total_size-1) */
1809	  rtx scratch_v4si = gen_rtx_REG (V4SImode, REGNO (scratch_reg_0));
1810	  rtx sp_v4si = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
1811	  rtx size_v4si = spu_const (V4SImode, total_size - 1);
1812	  if (!satisfies_constraint_K (GEN_INT (total_size - 1)))
1813	    {
1814	      emit_move_insn (scratch_v4si, size_v4si);
1815	      size_v4si = scratch_v4si;
1816	    }
1817	  emit_insn (gen_cgt_v4si (scratch_v4si, sp_v4si, size_v4si));
1818	  emit_insn (gen_vec_extractv4si
1819		     (scratch_reg_0, scratch_v4si, GEN_INT (1)));
1820	  emit_insn (gen_spu_heq (scratch_reg_0, GEN_INT (0)));
1821	}
1822
1823      /* Adjust the stack pointer, and make sure scratch_reg_0 contains
1824         the value of the previous $sp because we save it as the back
1825         chain. */
1826      if (total_size <= 2000)
1827	{
1828	  /* In this case we save the back chain first. */
1829	  insn = frame_emit_store (STACK_POINTER_REGNUM, sp_reg, -total_size);
1830	  insn =
1831	    frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_0);
1832	}
1833      else
1834	{
1835	  insn = emit_move_insn (scratch_reg_0, sp_reg);
1836	  insn =
1837	    frame_emit_add_imm (sp_reg, sp_reg, -total_size, scratch_reg_1);
1838	}
1839      RTX_FRAME_RELATED_P (insn) = 1;
1840      real = gen_addsi3 (sp_reg, sp_reg, GEN_INT (-total_size));
1841      add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
1842
1843      if (total_size > 2000)
1844	{
1845	  /* Save the back chain ptr */
1846	  insn = frame_emit_store (REGNO (scratch_reg_0), sp_reg, 0);
1847	}
1848
1849      if (frame_pointer_needed)
1850	{
1851	  rtx fp_reg = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM);
1852	  HOST_WIDE_INT fp_offset = STACK_POINTER_OFFSET
1853	    + crtl->outgoing_args_size;
1854	  /* Set the new frame_pointer */
1855	  insn = frame_emit_add_imm (fp_reg, sp_reg, fp_offset, scratch_reg_0);
1856	  RTX_FRAME_RELATED_P (insn) = 1;
1857	  real = gen_addsi3 (fp_reg, sp_reg, GEN_INT (fp_offset));
1858	  add_reg_note (insn, REG_FRAME_RELATED_EXPR, real);
1859          REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = STACK_BOUNDARY;
1860	}
1861    }
1862
1863  if (flag_stack_usage_info)
1864    current_function_static_stack_size = total_size;
1865}
1866
1867void
1868spu_expand_epilogue (bool sibcall_p)
1869{
1870  int size = get_frame_size (), offset, regno;
1871  HOST_WIDE_INT saved_regs_size, total_size;
1872  rtx sp_reg = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM);
1873  rtx scratch_reg_0;
1874
1875  if (spu_naked_function_p (current_function_decl))
1876    return;
1877
1878  scratch_reg_0 = gen_rtx_REG (SImode, LAST_ARG_REGNUM + 1);
1879
1880  saved_regs_size = spu_saved_regs_size ();
1881  total_size = size + saved_regs_size
1882    + crtl->outgoing_args_size
1883    + crtl->args.pretend_args_size;
1884
1885  if (!crtl->is_leaf
1886      || cfun->calls_alloca || total_size > 0)
1887    total_size += STACK_POINTER_OFFSET;
1888
1889  if (total_size > 0)
1890    {
1891      if (cfun->calls_alloca)
1892	frame_emit_load (STACK_POINTER_REGNUM, sp_reg, 0);
1893      else
1894	frame_emit_add_imm (sp_reg, sp_reg, total_size, scratch_reg_0);
1895
1896
1897      if (saved_regs_size > 0)
1898	{
1899	  offset = -crtl->args.pretend_args_size;
1900	  for (regno = 0; regno < FIRST_PSEUDO_REGISTER; ++regno)
1901	    if (need_to_save_reg (regno, 1))
1902	      {
1903		offset -= 0x10;
1904		frame_emit_load (regno, sp_reg, offset);
1905	      }
1906	}
1907    }
1908
1909  if (!crtl->is_leaf)
1910    frame_emit_load (LINK_REGISTER_REGNUM, sp_reg, 16);
1911
1912  if (!sibcall_p)
1913    {
1914      emit_use (gen_rtx_REG (SImode, LINK_REGISTER_REGNUM));
1915      emit_jump_insn (gen__return ());
1916    }
1917}
1918
1919rtx
1920spu_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
1921{
1922  if (count != 0)
1923    return 0;
1924  /* This is inefficient because it ends up copying to a save-register
1925     which then gets saved even though $lr has already been saved.  But
1926     it does generate better code for leaf functions and we don't need
1927     to use RETURN_ADDRESS_POINTER_REGNUM to get it working.  It's only
1928     used for __builtin_return_address anyway, so maybe we don't care if
1929     it's inefficient. */
1930  return get_hard_reg_initial_val (Pmode, LINK_REGISTER_REGNUM);
1931}
1932
1933
1934/* Given VAL, generate a constant appropriate for MODE.
1935   If MODE is a vector mode, every element will be VAL.
1936   For TImode, VAL will be zero extended to 128 bits. */
1937rtx
1938spu_const (machine_mode mode, HOST_WIDE_INT val)
1939{
1940  rtx inner;
1941  rtvec v;
1942  int units, i;
1943
1944  gcc_assert (GET_MODE_CLASS (mode) == MODE_INT
1945	      || GET_MODE_CLASS (mode) == MODE_FLOAT
1946	      || GET_MODE_CLASS (mode) == MODE_VECTOR_INT
1947	      || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);
1948
1949  if (GET_MODE_CLASS (mode) == MODE_INT)
1950    return immed_double_const (val, 0, mode);
1951
1952  /* val is the bit representation of the float */
1953  if (GET_MODE_CLASS (mode) == MODE_FLOAT)
1954    return hwint_to_const_double (mode, val);
1955
1956  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
1957    inner = immed_double_const (val, 0, GET_MODE_INNER (mode));
1958  else
1959    inner = hwint_to_const_double (GET_MODE_INNER (mode), val);
1960
1961  units = GET_MODE_NUNITS (mode);
1962
1963  v = rtvec_alloc (units);
1964
1965  for (i = 0; i < units; ++i)
1966    RTVEC_ELT (v, i) = inner;
1967
1968  return gen_rtx_CONST_VECTOR (mode, v);
1969}
1970
1971/* Create a MODE vector constant from 4 ints. */
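	/* The ints are packed big-endian: A fills bytes 0-3 of the constant,
	   B bytes 4-7, C bytes 8-11 and D bytes 12-15. */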
1972rtx
1973spu_const_from_ints(machine_mode mode, int a, int b, int c, int d)
1974{
1975  unsigned char arr[16];
1976  arr[0] = (a >> 24) & 0xff;
1977  arr[1] = (a >> 16) & 0xff;
1978  arr[2] = (a >> 8) & 0xff;
1979  arr[3] = (a >> 0) & 0xff;
1980  arr[4] = (b >> 24) & 0xff;
1981  arr[5] = (b >> 16) & 0xff;
1982  arr[6] = (b >> 8) & 0xff;
1983  arr[7] = (b >> 0) & 0xff;
1984  arr[8] = (c >> 24) & 0xff;
1985  arr[9] = (c >> 16) & 0xff;
1986  arr[10] = (c >> 8) & 0xff;
1987  arr[11] = (c >> 0) & 0xff;
1988  arr[12] = (d >> 24) & 0xff;
1989  arr[13] = (d >> 16) & 0xff;
1990  arr[14] = (d >> 8) & 0xff;
1991  arr[15] = (d >> 0) & 0xff;
1992  return array_to_constant(mode, arr);
1993}
1994
1995	/* Branch hint handling. */
1996
1997/* An array of these is used to propagate hints to predecessor blocks. */
1998struct spu_bb_info
1999{
2000  rtx_insn *prop_jump; /* propagated from another block */
2001  int bb_index;  /* the original block. */
2002};
2003static struct spu_bb_info *spu_bb_info;
2004
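	/* 1 when INSN is a call or one of the expanded divide/modulo sequences,
	   all of which invalidate a pending branch hint.  */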
2005#define STOP_HINT_P(INSN) \
2006		(CALL_P(INSN) \
2007		 || INSN_CODE(INSN) == CODE_FOR_divmodsi4 \
2008		 || INSN_CODE(INSN) == CODE_FOR_udivmodsi4)
2009
2010/* 1 when RTX is a hinted branch or its target.  We keep track of
2011   what has been hinted so the safe-hint code can test it easily.  */
2012#define HINTED_P(RTX)						\
2013  (RTL_FLAG_CHECK3("HINTED_P", (RTX), CODE_LABEL, JUMP_INSN, CALL_INSN)->unchanging)
2014
2015/* 1 when RTX is an insn that must be scheduled on an even boundary. */
2016#define SCHED_ON_EVEN_P(RTX)						\
2017  (RTL_FLAG_CHECK2("SCHED_ON_EVEN_P", (RTX), JUMP_INSN, CALL_INSN)->in_struct)
2018
2019/* Emit a nop for INSN such that the two will dual issue.  This assumes
2020   INSN is 8-byte aligned.  When INSN is inline asm we emit an lnop.
2021   We check for TImode to handle a MULTI1 insn which has dual issued its
2022   first instruction.  get_pipe returns -1 for MULTI0 or inline asm.  */
2023static void
2024emit_nop_for_insn (rtx_insn *insn)
2025{
2026  int p;
2027  rtx_insn *new_insn;
2028
2029  /* We need to handle JUMP_TABLE_DATA separately.  */
2030  if (JUMP_TABLE_DATA_P (insn))
2031    {
2032      new_insn = emit_insn_after (gen_lnop(), insn);
2033      recog_memoized (new_insn);
2034      INSN_LOCATION (new_insn) = UNKNOWN_LOCATION;
2035      return;
2036    }
2037
2038  p = get_pipe (insn);
2039  if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
2040    new_insn = emit_insn_after (gen_lnop (), insn);
2041  else if (p == 1 && GET_MODE (insn) == TImode)
2042    {
2043      new_insn = emit_insn_before (gen_nopn (GEN_INT (127)), insn);
2044      PUT_MODE (new_insn, TImode);
2045      PUT_MODE (insn, VOIDmode);
2046    }
2047  else
2048    new_insn = emit_insn_after (gen_lnop (), insn);
2049  recog_memoized (new_insn);
2050  INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
2051}
2052
2053/* Insert nops in basic blocks to meet dual issue alignment
2054   requirements.  Also make sure hbrp and hint instructions are at least
2055   one cycle apart, possibly inserting a nop.  */
2056static void
2057pad_bb(void)
2058{
2059  rtx_insn *insn, *next_insn, *prev_insn, *hbr_insn = 0;
2060  int length;
2061  int addr;
2062
2063  /* This sets up INSN_ADDRESSES. */
2064  shorten_branches (get_insns ());
2065
2066  /* Keep track of length added by nops. */
2067  length = 0;
2068
2069  prev_insn = 0;
2070  insn = get_insns ();
2071  if (!active_insn_p (insn))
2072    insn = next_active_insn (insn);
2073  for (; insn; insn = next_insn)
2074    {
2075      next_insn = next_active_insn (insn);
2076      if (INSN_CODE (insn) == CODE_FOR_iprefetch
2077	  || INSN_CODE (insn) == CODE_FOR_hbr)
2078	{
2079	  if (hbr_insn)
2080	    {
2081	      int a0 = INSN_ADDRESSES (INSN_UID (hbr_insn));
2082	      int a1 = INSN_ADDRESSES (INSN_UID (insn));
2083	      if ((a1 - a0 == 8 && GET_MODE (insn) != TImode)
2084		  || (a1 - a0 == 4))
2085		{
2086		  prev_insn = emit_insn_before (gen_lnop (), insn);
2087		  PUT_MODE (prev_insn, GET_MODE (insn));
2088		  PUT_MODE (insn, TImode);
2089		  INSN_LOCATION (prev_insn) = INSN_LOCATION (insn);
2090		  length += 4;
2091		}
2092	    }
2093	  hbr_insn = insn;
2094	}
2095      if (INSN_CODE (insn) == CODE_FOR_blockage && next_insn)
2096	{
2097	  if (GET_MODE (insn) == TImode)
2098	    PUT_MODE (next_insn, TImode);
2099	  insn = next_insn;
2100	  next_insn = next_active_insn (insn);
2101	}
2102      addr = INSN_ADDRESSES (INSN_UID (insn));
2103      if ((CALL_P (insn) || JUMP_P (insn)) && SCHED_ON_EVEN_P (insn))
2104	{
2105	  if (((addr + length) & 7) != 0)
2106	    {
2107	      emit_nop_for_insn (prev_insn);
2108	      length += 4;
2109	    }
2110	}
2111      else if (GET_MODE (insn) == TImode
2112	       && ((next_insn && GET_MODE (next_insn) != TImode)
2113		   || get_attr_type (insn) == TYPE_MULTI0)
2114	       && ((addr + length) & 7) != 0)
2115	{
2116	  /* prev_insn will always be set because the first insn is
2117	     always 8-byte aligned. */
2118	  emit_nop_for_insn (prev_insn);
2119	  length += 4;
2120	}
2121      prev_insn = insn;
2122    }
2123}
2124
2125
2126/* Routines for branch hints. */
2127
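	/* Emit an hbr insn before BEFORE, hinting that BRANCH will branch to
	   TARGET.  DISTANCE is the number of bytes from the hint to the branch.
	   BLOCKS records the block containing BRANCH (where a new label is
	   emitted) so find_many_sub_basic_blocks can fix it up afterwards.  */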
2128static void
2129spu_emit_branch_hint (rtx_insn *before, rtx_insn *branch, rtx target,
2130		      int distance, sbitmap blocks)
2131{
2132  rtx branch_label = 0;
2133  rtx_insn *hint;
2134  rtx_insn *insn;
2135  rtx_jump_table_data *table;
2136
2137  if (before == 0 || branch == 0 || target == 0)
2138    return;
2139
2140	  /* While scheduling we require hints to be no further than 600 bytes
2141	     from the branch, so we need to enforce that here too.  */
2142  if (distance > 600)
2143    return;
2144
2145	  /* If BEFORE is a basic block note, emit the hint after the note.  */
2146  if (NOTE_INSN_BASIC_BLOCK_P (before))
2147    before = NEXT_INSN (before);
2148
2149  branch_label = gen_label_rtx ();
2150  LABEL_NUSES (branch_label)++;
2151  LABEL_PRESERVE_P (branch_label) = 1;
2152  insn = emit_label_before (branch_label, branch);
2153  branch_label = gen_rtx_LABEL_REF (VOIDmode, branch_label);
2154  bitmap_set_bit (blocks, BLOCK_FOR_INSN (branch)->index);
2155
2156  hint = emit_insn_before (gen_hbr (branch_label, target), before);
2157  recog_memoized (hint);
2158  INSN_LOCATION (hint) = INSN_LOCATION (branch);
2159  HINTED_P (branch) = 1;
2160
2161  if (GET_CODE (target) == LABEL_REF)
2162    HINTED_P (XEXP (target, 0)) = 1;
2163  else if (tablejump_p (branch, 0, &table))
2164    {
2165      rtvec vec;
2166      int j;
2167      if (GET_CODE (PATTERN (table)) == ADDR_VEC)
2168	vec = XVEC (PATTERN (table), 0);
2169      else
2170	vec = XVEC (PATTERN (table), 1);
2171      for (j = GET_NUM_ELEM (vec) - 1; j >= 0; --j)
2172	HINTED_P (XEXP (RTVEC_ELT (vec, j), 0)) = 1;
2173    }
2174
2175  if (distance >= 588)
2176    {
2177	      /* Make sure the hint isn't scheduled any earlier than this point,
2178	         which could make it too far for the branch offset to fit.  */
2179      insn = emit_insn_before (gen_blockage (), hint);
2180      recog_memoized (insn);
2181      INSN_LOCATION (insn) = INSN_LOCATION (hint);
2182    }
2183  else if (distance <= 8 * 4)
2184    {
2185      /* To guarantee at least 8 insns between the hint and branch we
2186         insert nops. */
2187      int d;
2188      for (d = distance; d < 8 * 4; d += 4)
2189	{
2190	  insn =
2191	    emit_insn_after (gen_nopn_nv (gen_rtx_REG (SImode, 127)), hint);
2192	  recog_memoized (insn);
2193	  INSN_LOCATION (insn) = INSN_LOCATION (hint);
2194	}
2195
2196      /* Make sure any nops inserted aren't scheduled before the hint. */
2197      insn = emit_insn_after (gen_blockage (), hint);
2198      recog_memoized (insn);
2199      INSN_LOCATION (insn) = INSN_LOCATION (hint);
2200
2201      /* Make sure any nops inserted aren't scheduled after the call. */
2202      if (CALL_P (branch) && distance < 8 * 4)
2203	{
2204	  insn = emit_insn_before (gen_blockage (), branch);
2205	  recog_memoized (insn);
2206	  INSN_LOCATION (insn) = INSN_LOCATION (branch);
2207	}
2208    }
2209}
2210
2211/* Returns 0 if we don't want a hint for this branch.  Otherwise return
2212   the rtx for the branch target. */
2213static rtx
2214get_branch_target (rtx_insn *branch)
2215{
2216  if (JUMP_P (branch))
2217    {
2218      rtx set, src;
2219
2220      /* Return statements */
2221      if (GET_CODE (PATTERN (branch)) == RETURN)
2222	return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
2223
2224	      /* ASM GOTOs. */
2225	      if (extract_asm_operands (PATTERN (branch)) != NULL)
2226	return NULL;
2227
2228      set = single_set (branch);
2229      src = SET_SRC (set);
2230      if (GET_CODE (SET_DEST (set)) != PC)
2231	abort ();
2232
2233      if (GET_CODE (src) == IF_THEN_ELSE)
2234	{
2235	  rtx lab = 0;
2236	  rtx note = find_reg_note (branch, REG_BR_PROB, 0);
2237	  if (note)
2238	    {
2239	      /* If the more probable case is not a fall through, then
2240	         try a branch hint.  */
2241	      int prob = XINT (note, 0);
2242	      if (prob > (REG_BR_PROB_BASE * 6 / 10)
2243		  && GET_CODE (XEXP (src, 1)) != PC)
2244		lab = XEXP (src, 1);
2245	      else if (prob < (REG_BR_PROB_BASE * 4 / 10)
2246		       && GET_CODE (XEXP (src, 2)) != PC)
2247		lab = XEXP (src, 2);
2248	    }
2249	  if (lab)
2250	    {
2251	      if (GET_CODE (lab) == RETURN)
2252		return gen_rtx_REG (SImode, LINK_REGISTER_REGNUM);
2253	      return lab;
2254	    }
2255	  return 0;
2256	}
2257
2258      return src;
2259    }
2260  else if (CALL_P (branch))
2261    {
2262      rtx call;
2263      /* All of our call patterns are in a PARALLEL and the CALL is
2264         the first pattern in the PARALLEL. */
2265      if (GET_CODE (PATTERN (branch)) != PARALLEL)
2266	abort ();
2267      call = XVECEXP (PATTERN (branch), 0, 0);
2268      if (GET_CODE (call) == SET)
2269	call = SET_SRC (call);
2270      if (GET_CODE (call) != CALL)
2271	abort ();
2272      return XEXP (XEXP (call, 0), 0);
2273    }
2274  return 0;
2275}
2276
2277/* The special $hbr register is used to prevent the insn scheduler from
2278   moving hbr insns across instructions which invalidate them.  It
2279   should only be used in a clobber, and this function searches for
2280   insns which clobber it.  */
2281static bool
2282insn_clobbers_hbr (rtx_insn *insn)
2283{
2284  if (INSN_P (insn)
2285      && GET_CODE (PATTERN (insn)) == PARALLEL)
2286    {
2287      rtx parallel = PATTERN (insn);
2288      rtx clobber;
2289      int j;
2290      for (j = XVECLEN (parallel, 0) - 1; j >= 0; j--)
2291	{
2292	  clobber = XVECEXP (parallel, 0, j);
2293	  if (GET_CODE (clobber) == CLOBBER
2294	      && GET_CODE (XEXP (clobber, 0)) == REG
2295	      && REGNO (XEXP (clobber, 0)) == HBR_REGNUM)
2296	    return 1;
2297	}
2298    }
2299  return 0;
2300}
2301
2302/* Search up to 32 insns starting at FIRST:
2303   - at any kind of hinted branch, just return
2304   - at any unconditional branch in the first 15 insns, just return
2305   - at a call or indirect branch, after the first 15 insns, force it to
2306     an even address and return
2307   - at any unconditional branch, after the first 15 insns, force it to
2308     an even address.
2309	   At the end of the search, insert an hbrp within 4 insns of FIRST,
2310	   and a second hbrp within 16 insns of FIRST.
2311 */
2312static void
2313insert_hbrp_for_ilb_runout (rtx_insn *first)
2314{
2315  rtx_insn *insn, *before_4 = 0, *before_16 = 0;
2316  int addr = 0, length, first_addr = -1;
2317  int hbrp_addr0 = 128 * 4, hbrp_addr1 = 128 * 4;
2318  int insert_lnop_after = 0;
2319  for (insn = first; insn; insn = NEXT_INSN (insn))
2320    if (INSN_P (insn))
2321      {
2322	if (first_addr == -1)
2323	  first_addr = INSN_ADDRESSES (INSN_UID (insn));
2324	addr = INSN_ADDRESSES (INSN_UID (insn)) - first_addr;
2325	length = get_attr_length (insn);
2326
2327	if (before_4 == 0 && addr + length >= 4 * 4)
2328	  before_4 = insn;
2329	/* We test for 14 instructions because the first hbrp will add
2330	   up to 2 instructions. */
2331	if (before_16 == 0 && addr + length >= 14 * 4)
2332	  before_16 = insn;
2333
2334	if (INSN_CODE (insn) == CODE_FOR_hbr)
2335	  {
2336	    /* Make sure an hbrp is at least 2 cycles away from a hint.
2337	       Insert an lnop after the hbrp when necessary. */
2338	    if (before_4 == 0 && addr > 0)
2339	      {
2340		before_4 = insn;
2341		insert_lnop_after |= 1;
2342	      }
2343	    else if (before_4 && addr <= 4 * 4)
2344	      insert_lnop_after |= 1;
2345	    if (before_16 == 0 && addr > 10 * 4)
2346	      {
2347		before_16 = insn;
2348		insert_lnop_after |= 2;
2349	      }
2350	    else if (before_16 && addr <= 14 * 4)
2351	      insert_lnop_after |= 2;
2352	  }
2353
2354	if (INSN_CODE (insn) == CODE_FOR_iprefetch)
2355	  {
2356	    if (addr < hbrp_addr0)
2357	      hbrp_addr0 = addr;
2358	    else if (addr < hbrp_addr1)
2359	      hbrp_addr1 = addr;
2360	  }
2361
2362	if (CALL_P (insn) || JUMP_P (insn))
2363	  {
2364	    if (HINTED_P (insn))
2365	      return;
2366
2367	    /* Any branch after the first 15 insns should be on an even
2368	       address to avoid a special case branch.  There might be
2369	       some nops and/or hbrps inserted, so we test after 10
2370	       insns. */
2371	    if (addr > 10 * 4)
2372	      SCHED_ON_EVEN_P (insn) = 1;
2373	  }
2374
2375	if (CALL_P (insn) || tablejump_p (insn, 0, 0))
2376	  return;
2377
2378
2379	if (addr + length >= 32 * 4)
2380	  {
2381	    gcc_assert (before_4 && before_16);
2382	    if (hbrp_addr0 > 4 * 4)
2383	      {
2384		insn =
2385		  emit_insn_before (gen_iprefetch (GEN_INT (1)), before_4);
2386		recog_memoized (insn);
2387		INSN_LOCATION (insn) = INSN_LOCATION (before_4);
2388		INSN_ADDRESSES_NEW (insn,
2389				    INSN_ADDRESSES (INSN_UID (before_4)));
2390		PUT_MODE (insn, GET_MODE (before_4));
2391		PUT_MODE (before_4, TImode);
2392		if (insert_lnop_after & 1)
2393		  {
2394		    insn = emit_insn_before (gen_lnop (), before_4);
2395		    recog_memoized (insn);
2396		    INSN_LOCATION (insn) = INSN_LOCATION (before_4);
2397		    INSN_ADDRESSES_NEW (insn,
2398					INSN_ADDRESSES (INSN_UID (before_4)));
2399		    PUT_MODE (insn, TImode);
2400		  }
2401	      }
2402	    if ((hbrp_addr0 <= 4 * 4 || hbrp_addr0 > 16 * 4)
2403		&& hbrp_addr1 > 16 * 4)
2404	      {
2405		insn =
2406		  emit_insn_before (gen_iprefetch (GEN_INT (2)), before_16);
2407		recog_memoized (insn);
2408		INSN_LOCATION (insn) = INSN_LOCATION (before_16);
2409		INSN_ADDRESSES_NEW (insn,
2410				    INSN_ADDRESSES (INSN_UID (before_16)));
2411		PUT_MODE (insn, GET_MODE (before_16));
2412		PUT_MODE (before_16, TImode);
2413		if (insert_lnop_after & 2)
2414		  {
2415		    insn = emit_insn_before (gen_lnop (), before_16);
2416		    recog_memoized (insn);
2417		    INSN_LOCATION (insn) = INSN_LOCATION (before_16);
2418		    INSN_ADDRESSES_NEW (insn,
2419					INSN_ADDRESSES (INSN_UID
2420							(before_16)));
2421		    PUT_MODE (insn, TImode);
2422		  }
2423	      }
2424	    return;
2425	  }
2426      }
2427    else if (BARRIER_P (insn))
2428      return;
2429
2430}
2431
2432/* The SPU might hang when it executes 48 inline instructions after a
2433   hinted branch jumps to its hinted target.  The beginning of a
2434   function and the return from a call might have been hinted, and
2435   must be handled as well.  To prevent a hang we insert 2 hbrps.  The
2436   first should be within 6 insns of the branch target.  The second
2437   should be within 22 insns of the branch target.  When determining
2438   if hbrps are necessary, we look for only 32 inline instructions,
2439	   because up to 12 nops and 4 hbrps could be inserted.  Similarly,
2440   when inserting new hbrps, we insert them within 4 and 16 insns of
2441   the target.  */
2442static void
2443insert_hbrp (void)
2444{
2445  rtx_insn *insn;
2446  if (TARGET_SAFE_HINTS)
2447    {
2448      shorten_branches (get_insns ());
2449      /* Insert hbrp at beginning of function */
2450      insn = next_active_insn (get_insns ());
2451      if (insn)
2452	insert_hbrp_for_ilb_runout (insn);
2453      /* Insert hbrp after hinted targets. */
2454      for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
2455	if ((LABEL_P (insn) && HINTED_P (insn)) || CALL_P (insn))
2456	  insert_hbrp_for_ilb_runout (next_active_insn (insn));
2457    }
2458}
2459
2460static int in_spu_reorg;
2461
2462static void
2463spu_var_tracking (void)
2464{
2465  if (flag_var_tracking)
2466    {
2467      df_analyze ();
2468      timevar_push (TV_VAR_TRACKING);
2469      variable_tracking_main ();
2470      timevar_pop (TV_VAR_TRACKING);
2471      df_finish_pass (false);
2472    }
2473}
2474
2475/* Insert branch hints.  There are no branch optimizations after this
2476   pass, so it's safe to set our branch hints now. */
2477static void
2478spu_machine_dependent_reorg (void)
2479{
2480  sbitmap blocks;
2481  basic_block bb;
2482  rtx_insn *branch, *insn;
2483  rtx branch_target = 0;
2484  int branch_addr = 0, insn_addr, required_dist = 0;
2485  int i;
2486  unsigned int j;
2487
2488  if (!TARGET_BRANCH_HINTS || optimize == 0)
2489    {
2490      /* We still do it for unoptimized code because an external
2491         function might have hinted a call or return. */
2492      compute_bb_for_insn ();
2493      insert_hbrp ();
2494      pad_bb ();
2495      spu_var_tracking ();
2496      free_bb_for_insn ();
2497      return;
2498    }
2499
2500  blocks = sbitmap_alloc (last_basic_block_for_fn (cfun));
2501  bitmap_clear (blocks);
2502
2503  in_spu_reorg = 1;
2504  compute_bb_for_insn ();
2505
2506  /* (Re-)discover loops so that bb->loop_father can be used
2507     in the analysis below.  */
2508  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
2509
2510  compact_blocks ();
2511
2512  spu_bb_info =
2513    (struct spu_bb_info *) xcalloc (n_basic_blocks_for_fn (cfun),
2514				    sizeof (struct spu_bb_info));
2515
2516  /* We need exact insn addresses and lengths.  */
2517  shorten_branches (get_insns ());
2518
2519  for (i = n_basic_blocks_for_fn (cfun) - 1; i >= 0; i--)
2520    {
2521      bb = BASIC_BLOCK_FOR_FN (cfun, i);
2522      branch = 0;
2523      if (spu_bb_info[i].prop_jump)
2524	{
2525	  branch = spu_bb_info[i].prop_jump;
2526	  branch_target = get_branch_target (branch);
2527	  branch_addr = INSN_ADDRESSES (INSN_UID (branch));
2528	  required_dist = spu_hint_dist;
2529	}
2530	      /* Search from the end of a block to its beginning.  In this loop,
2531	         find jumps which need a branch hint and emit the hint only when:
2532         - it's an indirect branch and we're at the insn which sets
2533         the register
2534         - we're at an insn that will invalidate the hint. e.g., a
2535         call, another hint insn, inline asm that clobbers $hbr, and
2536         some inlined operations (divmodsi4).  Don't consider jumps
2537         because they are only at the end of a block and are
2538         considered when we are deciding whether to propagate
2539         - we're getting too far away from the branch.  The hbr insns
2540         only have a signed 10 bit offset
2541         We go back as far as possible so the branch will be considered
2542         for propagation when we get to the beginning of the block.  */
2543      for (insn = BB_END (bb); insn; insn = PREV_INSN (insn))
2544	{
2545	  if (INSN_P (insn))
2546	    {
2547	      insn_addr = INSN_ADDRESSES (INSN_UID (insn));
2548	      if (branch
2549		  && ((GET_CODE (branch_target) == REG
2550		       && set_of (branch_target, insn) != NULL_RTX)
2551		      || insn_clobbers_hbr (insn)
2552		      || branch_addr - insn_addr > 600))
2553		{
2554		  rtx_insn *next = NEXT_INSN (insn);
2555		  int next_addr = INSN_ADDRESSES (INSN_UID (next));
2556		  if (insn != BB_END (bb)
2557		      && branch_addr - next_addr >= required_dist)
2558		    {
2559		      if (dump_file)
2560			fprintf (dump_file,
2561				 "hint for %i in block %i before %i\n",
2562				 INSN_UID (branch), bb->index,
2563				 INSN_UID (next));
2564		      spu_emit_branch_hint (next, branch, branch_target,
2565					    branch_addr - next_addr, blocks);
2566		    }
2567		  branch = 0;
2568		}
2569
2570	      /* JUMP_P will only be true at the end of a block.  When
2571	         branch is already set it means we've previously decided
2572	         to propagate a hint for that branch into this block. */
2573	      if (CALL_P (insn) || (JUMP_P (insn) && !branch))
2574		{
2575		  branch = 0;
2576		  if ((branch_target = get_branch_target (insn)))
2577		    {
2578		      branch = insn;
2579		      branch_addr = insn_addr;
2580		      required_dist = spu_hint_dist;
2581		    }
2582		}
2583	    }
2584	  if (insn == BB_HEAD (bb))
2585	    break;
2586	}
2587
2588      if (branch)
2589	{
2590	  /* If we haven't emitted a hint for this branch yet, it might
2591	     be profitable to emit it in one of the predecessor blocks,
2592	     especially for loops.  */
2593	  rtx_insn *bbend;
2594	  basic_block prev = 0, prop = 0, prev2 = 0;
2595	  int loop_exit = 0, simple_loop = 0;
2596	  int next_addr = INSN_ADDRESSES (INSN_UID (NEXT_INSN (insn)));
2597
2598	  for (j = 0; j < EDGE_COUNT (bb->preds); j++)
2599	    if (EDGE_PRED (bb, j)->flags & EDGE_FALLTHRU)
2600	      prev = EDGE_PRED (bb, j)->src;
2601	    else
2602	      prev2 = EDGE_PRED (bb, j)->src;
2603
2604	  for (j = 0; j < EDGE_COUNT (bb->succs); j++)
2605	    if (EDGE_SUCC (bb, j)->flags & EDGE_LOOP_EXIT)
2606	      loop_exit = 1;
2607	    else if (EDGE_SUCC (bb, j)->dest == bb)
2608	      simple_loop = 1;
2609
2610	  /* If this branch is a loop exit then propagate to previous
2611	     fallthru block. This catches the cases when it is a simple
2612	     loop or when there is an initial branch into the loop. */
2613	  if (prev && (loop_exit || simple_loop)
2614	      && bb_loop_depth (prev) <= bb_loop_depth (bb))
2615	    prop = prev;
2616
2617		  /* If there is only one adjacent predecessor and it is in the same
2618		     loop, propagate to it; don't propagate outside this loop.  */
2619	  else if (prev && single_pred_p (bb)
2620		   && prev->loop_father == bb->loop_father)
2621	    prop = prev;
2622
2623	  /* If this is the JOIN block of a simple IF-THEN then
2624	     propagate the hint to the HEADER block. */
2625	  else if (prev && prev2
2626		   && EDGE_COUNT (bb->preds) == 2
2627		   && EDGE_COUNT (prev->preds) == 1
2628		   && EDGE_PRED (prev, 0)->src == prev2
2629		   && prev2->loop_father == bb->loop_father
2630		   && GET_CODE (branch_target) != REG)
2631	    prop = prev;
2632
2633	  /* Don't propagate when:
2634	     - this is a simple loop and the hint would be too far
2635	     - this is not a simple loop and there are 16 insns in
2636	     this block already
2637	     - the predecessor block ends in a branch that will be
2638	     hinted
2639	     - the predecessor block ends in an insn that invalidates
2640	     the hint */
2641	  if (prop
2642	      && prop->index >= 0
2643	      && (bbend = BB_END (prop))
2644	      && branch_addr - INSN_ADDRESSES (INSN_UID (bbend)) <
2645	      (simple_loop ? 600 : 16 * 4) && get_branch_target (bbend) == 0
2646	      && (JUMP_P (bbend) || !insn_clobbers_hbr (bbend)))
2647	    {
2648	      if (dump_file)
2649		fprintf (dump_file, "propagate from %i to %i (loop depth %i) "
2650			 "for %i (loop_exit %i simple_loop %i dist %i)\n",
2651			 bb->index, prop->index, bb_loop_depth (bb),
2652			 INSN_UID (branch), loop_exit, simple_loop,
2653			 branch_addr - INSN_ADDRESSES (INSN_UID (bbend)));
2654
2655	      spu_bb_info[prop->index].prop_jump = branch;
2656	      spu_bb_info[prop->index].bb_index = i;
2657	    }
2658	  else if (branch_addr - next_addr >= required_dist)
2659	    {
2660	      if (dump_file)
2661		fprintf (dump_file, "hint for %i in block %i before %i\n",
2662			 INSN_UID (branch), bb->index,
2663			 INSN_UID (NEXT_INSN (insn)));
2664	      spu_emit_branch_hint (NEXT_INSN (insn), branch, branch_target,
2665				    branch_addr - next_addr, blocks);
2666	    }
2667	  branch = 0;
2668	}
2669    }
2670  free (spu_bb_info);
2671
2672  if (!bitmap_empty_p (blocks))
2673    find_many_sub_basic_blocks (blocks);
2674
2675  /* We have to schedule to make sure alignment is ok. */
2676  FOR_EACH_BB_FN (bb, cfun) bb->flags &= ~BB_DISABLE_SCHEDULE;
2677
2678	  /* The hints need to be scheduled, so run the scheduler again. */
2679  schedule_insns ();
2680  df_finish_pass (true);
2681
2682  insert_hbrp ();
2683
2684  pad_bb ();
2685
2686  for (insn = get_insns (); insn; insn = NEXT_INSN (insn))
2687    if (NONJUMP_INSN_P (insn) && INSN_CODE (insn) == CODE_FOR_hbr)
2688      {
2689	/* Adjust the LABEL_REF in a hint when we have inserted a nop
2690		   between its branch label and the branch.  We don't move the
2691	   label because GCC expects it at the beginning of the block. */
2692	rtx unspec = SET_SRC (XVECEXP (PATTERN (insn), 0, 0));
2693	rtx label_ref = XVECEXP (unspec, 0, 0);
2694	rtx_insn *label = as_a <rtx_insn *> (XEXP (label_ref, 0));
2695	rtx_insn *branch;
2696	int offset = 0;
2697	for (branch = NEXT_INSN (label);
2698	     !JUMP_P (branch) && !CALL_P (branch);
2699	     branch = NEXT_INSN (branch))
2700	  if (NONJUMP_INSN_P (branch))
2701	    offset += get_attr_length (branch);
2702	if (offset > 0)
2703	  XVECEXP (unspec, 0, 0) = plus_constant (Pmode, label_ref, offset);
2704      }
2705
2706  spu_var_tracking ();
2707
2708  loop_optimizer_finalize ();
2709
2710  free_bb_for_insn ();
2711
2712  in_spu_reorg = 0;
2713}
2714
2715
2716/* Insn scheduling routines, primarily for dual issue. */
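
	/* The SPU dual-issues at most two insns per cycle, one on each pipeline.  */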
2717static int
2718spu_sched_issue_rate (void)
2719{
2720  return 2;
2721}
2722
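	/* Return 1 when INSN uses the load/store unit, i.e. when its single set
	   reads from or writes to memory.  */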
2723static int
2724uses_ls_unit(rtx_insn *insn)
2725{
2726  rtx set = single_set (insn);
2727  if (set != 0
2728      && (GET_CODE (SET_DEST (set)) == MEM
2729	  || GET_CODE (SET_SRC (set)) == MEM))
2730    return 1;
2731  return 0;
2732}
2733
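	/* Return the pipeline INSN issues on: 0 for pipe 0, 1 for pipe 1, -1 for
	   inline asm and MULTI0, and -2 for TYPE_CONVERT insns.  */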
2734static int
2735get_pipe (rtx_insn *insn)
2736{
2737  enum attr_type t;
2738  /* Handle inline asm */
2739  if (INSN_CODE (insn) == -1)
2740    return -1;
2741  t = get_attr_type (insn);
2742  switch (t)
2743    {
2744    case TYPE_CONVERT:
2745      return -2;
2746    case TYPE_MULTI0:
2747      return -1;
2748
2749    case TYPE_FX2:
2750    case TYPE_FX3:
2751    case TYPE_SPR:
2752    case TYPE_NOP:
2753    case TYPE_FXB:
2754    case TYPE_FPD:
2755    case TYPE_FP6:
2756    case TYPE_FP7:
2757      return 0;
2758
2759    case TYPE_LNOP:
2760    case TYPE_SHUF:
2761    case TYPE_LOAD:
2762    case TYPE_STORE:
2763    case TYPE_BR:
2764    case TYPE_MULTI1:
2765    case TYPE_HBR:
2766    case TYPE_IPREFETCH:
2767      return 1;
2768    default:
2769      abort ();
2770    }
2771}
2772
2773
2774/* haifa-sched.c has a static variable that keeps track of the current
2775   cycle.  It is passed to spu_sched_reorder, and we record it here for
2776   use by spu_sched_variable_issue.  It won't be accurate if the
2777	   scheduler updates its clock_var between the two calls. */
2778static int clock_var;
2779
2780/* This is used to keep track of insn alignment.  Set to 0 at the
2781   beginning of each block and increased by the "length" attr of each
2782   insn scheduled. */
2783static int spu_sched_length;
2784
2785/* Record when we've issued pipe0 and pipe1 insns so we can reorder the
2786   ready list appropriately in spu_sched_reorder(). */
2787static int pipe0_clock;
2788static int pipe1_clock;
2789
2790static int prev_clock_var;
2791
2792static int prev_priority;
2793
2794/* The SPU needs to load the next ilb sometime during the execution of
2795   the previous ilb.  There is a potential conflict if every cycle has a
2796   load or store.  To avoid the conflict we make sure the load/store
2797   unit is free for at least one cycle during the execution of insns in
2798   the previous ilb. */
2799static int spu_ls_first;
2800static int prev_ls_clock;
2801
2802static void
2803spu_sched_init_global (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
2804		       int max_ready ATTRIBUTE_UNUSED)
2805{
2806  spu_sched_length = 0;
2807}
2808
2809static void
2810spu_sched_init (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
2811		int max_ready ATTRIBUTE_UNUSED)
2812{
2813  if (align_labels > 4 || align_loops > 4 || align_jumps > 4)
2814    {
2815	      /* When any block might be at least 8-byte aligned, assume all
2816	         blocks will be at least 8-byte aligned to make sure dual issue
2817	         works out correctly. */
2818      spu_sched_length = 0;
2819    }
2820  spu_ls_first = INT_MAX;
2821  clock_var = -1;
2822  prev_ls_clock = -1;
2823  pipe0_clock = -1;
2824  pipe1_clock = -1;
2825  prev_clock_var = -1;
2826  prev_priority = -1;
2827}
2828
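	/* Record scheduling state (length for alignment, pipe usage, load/store
	   activity) as INSN issues.  The return value tells the scheduler how
	   many more insns may issue this cycle.  */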
2829static int
2830spu_sched_variable_issue (FILE *file ATTRIBUTE_UNUSED,
2831			  int verbose ATTRIBUTE_UNUSED,
2832			  rtx_insn *insn, int more)
2833{
2834  int len;
2835  int p;
2836  if (GET_CODE (PATTERN (insn)) == USE
2837      || GET_CODE (PATTERN (insn)) == CLOBBER
2838      || (len = get_attr_length (insn)) == 0)
2839    return more;
2840
2841  spu_sched_length += len;
2842
2843  /* Reset on inline asm */
2844  if (INSN_CODE (insn) == -1)
2845    {
2846      spu_ls_first = INT_MAX;
2847      pipe0_clock = -1;
2848      pipe1_clock = -1;
2849      return 0;
2850    }
2851  p = get_pipe (insn);
2852  if (p == 0)
2853    pipe0_clock = clock_var;
2854  else
2855    pipe1_clock = clock_var;
2856
2857  if (in_spu_reorg)
2858    {
2859      if (clock_var - prev_ls_clock > 1
2860	  || INSN_CODE (insn) == CODE_FOR_iprefetch)
2861	spu_ls_first = INT_MAX;
2862      if (uses_ls_unit (insn))
2863	{
2864	  if (spu_ls_first == INT_MAX)
2865	    spu_ls_first = spu_sched_length;
2866	  prev_ls_clock = clock_var;
2867	}
2868
2869      /* The scheduler hasn't inserted the nop, but we will later on.
2870         Include those nops in spu_sched_length. */
2871      if (prev_clock_var == clock_var && (spu_sched_length & 7))
2872	spu_sched_length += 4;
2873      prev_clock_var = clock_var;
2874
2875      /* more is -1 when called from spu_sched_reorder for new insns
2876         that don't have INSN_PRIORITY */
2877      if (more >= 0)
2878	prev_priority = INSN_PRIORITY (insn);
2879    }
2880
2881  /* Always try issuing more insns.  spu_sched_reorder will decide
2882     when the cycle should be advanced. */
2883  return 1;
2884}
2885
2886/* This function is called for both TARGET_SCHED_REORDER and
2887   TARGET_SCHED_REORDER2.  */
2888static int
2889spu_sched_reorder (FILE *file ATTRIBUTE_UNUSED, int verbose ATTRIBUTE_UNUSED,
2890		   rtx_insn **ready, int *nreadyp, int clock)
2891{
2892  int i, nready = *nreadyp;
2893  int pipe_0, pipe_1, pipe_hbrp, pipe_ls, schedule_i;
2894  rtx_insn *insn;
2895
2896  clock_var = clock;
2897
2898  if (nready <= 0 || pipe1_clock >= clock)
2899    return 0;
2900
2901  /* Find any rtl insns that don't generate assembly insns and schedule
2902     them first. */
2903  for (i = nready - 1; i >= 0; i--)
2904    {
2905      insn = ready[i];
2906      if (INSN_CODE (insn) == -1
2907	  || INSN_CODE (insn) == CODE_FOR_blockage
2908	  || (INSN_P (insn) && get_attr_length (insn) == 0))
2909	{
2910	  ready[i] = ready[nready - 1];
2911	  ready[nready - 1] = insn;
2912	  return 1;
2913	}
2914    }
2915
2916  pipe_0 = pipe_1 = pipe_hbrp = pipe_ls = schedule_i = -1;
2917  for (i = 0; i < nready; i++)
2918    if (INSN_CODE (ready[i]) != -1)
2919      {
2920	insn = ready[i];
2921	switch (get_attr_type (insn))
2922	  {
2923	  default:
2924	  case TYPE_MULTI0:
2925	  case TYPE_CONVERT:
2926	  case TYPE_FX2:
2927	  case TYPE_FX3:
2928	  case TYPE_SPR:
2929	  case TYPE_NOP:
2930	  case TYPE_FXB:
2931	  case TYPE_FPD:
2932	  case TYPE_FP6:
2933	  case TYPE_FP7:
2934	    pipe_0 = i;
2935	    break;
2936	  case TYPE_LOAD:
2937	  case TYPE_STORE:
2938	    pipe_ls = i;
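		    /* Fall through: loads and stores also issue on pipe 1. */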
2939	  case TYPE_LNOP:
2940	  case TYPE_SHUF:
2941	  case TYPE_BR:
2942	  case TYPE_MULTI1:
2943	  case TYPE_HBR:
2944	    pipe_1 = i;
2945	    break;
2946	  case TYPE_IPREFETCH:
2947	    pipe_hbrp = i;
2948	    break;
2949	  }
2950      }
2951
2952  /* In the first scheduling phase, schedule loads and stores together
2953     to increase the chance they will get merged during postreload CSE. */
2954  if (!reload_completed && pipe_ls >= 0)
2955    {
2956      insn = ready[pipe_ls];
2957      ready[pipe_ls] = ready[nready - 1];
2958      ready[nready - 1] = insn;
2959      return 1;
2960    }
2961
2962  /* If there is an hbrp ready, prefer it over other pipe 1 insns. */
2963  if (pipe_hbrp >= 0)
2964    pipe_1 = pipe_hbrp;
2965
2966  /* When we have loads/stores in every cycle of the last 15 insns and
2967     we are about to schedule another load/store, emit an hbrp insn
2968     instead. */
2969  if (in_spu_reorg
2970      && spu_sched_length - spu_ls_first >= 4 * 15
2971      && !(pipe0_clock < clock && pipe_0 >= 0) && pipe_1 == pipe_ls)
2972    {
2973      insn = sched_emit_insn (gen_iprefetch (GEN_INT (3)));
2974      recog_memoized (insn);
2975      if (pipe0_clock < clock)
2976	PUT_MODE (insn, TImode);
2977      spu_sched_variable_issue (file, verbose, insn, -1);
2978      return 0;
2979    }
2980
2981	  /* In general, we want to emit nops to increase dual issue, but dual
2982	     issue isn't faster when one of the insns could be scheduled later
2983	     without affecting the critical path.  We look at INSN_PRIORITY to
2984	     make a good guess, but it isn't perfect, so -mdual-nops=n can be
2985	     used to adjust it. */
2986  if (in_spu_reorg && spu_dual_nops < 10)
2987    {
2988      /* When we are at an even address and we are not issuing nops to
2989         improve scheduling then we need to advance the cycle.  */
2990      if ((spu_sched_length & 7) == 0 && prev_clock_var == clock
2991	  && (spu_dual_nops == 0
2992	      || (pipe_1 != -1
2993		  && prev_priority >
2994		  INSN_PRIORITY (ready[pipe_1]) + spu_dual_nops)))
2995	return 0;
2996
2997      /* When at an odd address, schedule the highest priority insn
2998         without considering pipeline. */
2999      if ((spu_sched_length & 7) == 4 && prev_clock_var != clock
3000	  && (spu_dual_nops == 0
3001	      || (prev_priority >
3002		  INSN_PRIORITY (ready[nready - 1]) + spu_dual_nops)))
3003	return 1;
3004    }
3005
3006
3007  /* We haven't issued a pipe0 insn yet this cycle, if there is a
3008     pipe0 insn in the ready list, schedule it. */
3009  if (pipe0_clock < clock && pipe_0 >= 0)
3010    schedule_i = pipe_0;
3011
3012  /* Either we've scheduled a pipe0 insn already or there is no pipe0
3013     insn to schedule.  Put a pipe1 insn at the front of the ready list. */
3014  else
3015    schedule_i = pipe_1;
3016
3017  if (schedule_i > -1)
3018    {
3019      insn = ready[schedule_i];
3020      ready[schedule_i] = ready[nready - 1];
3021      ready[nready - 1] = insn;
3022      return 1;
3023    }
3024  return 0;
3025}
3026
3027	/* INSN is dependent on DEP_INSN; return the adjusted cost of the dependence. */
3028static int
3029spu_sched_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost)
3030{
3031  rtx set;
3032
3033  /* The blockage pattern is used to prevent instructions from being
3034     moved across it and has no cost. */
3035  if (INSN_CODE (insn) == CODE_FOR_blockage
3036      || INSN_CODE (dep_insn) == CODE_FOR_blockage)
3037    return 0;
3038
3039  if ((INSN_P (insn) && get_attr_length (insn) == 0)
3040      || (INSN_P (dep_insn) && get_attr_length (dep_insn) == 0))
3041    return 0;
3042
3043  /* Make sure hbrps are spread out. */
3044  if (INSN_CODE (insn) == CODE_FOR_iprefetch
3045      && INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
3046    return 8;
3047
3048  /* Make sure hints and hbrps are 2 cycles apart. */
3049  if ((INSN_CODE (insn) == CODE_FOR_iprefetch
3050       || INSN_CODE (insn) == CODE_FOR_hbr)
3051       && (INSN_CODE (dep_insn) == CODE_FOR_iprefetch
3052	   || INSN_CODE (dep_insn) == CODE_FOR_hbr))
3053    return 2;
3054
3055  /* An hbrp has no real dependency on other insns. */
3056  if (INSN_CODE (insn) == CODE_FOR_iprefetch
3057      || INSN_CODE (dep_insn) == CODE_FOR_iprefetch)
3058    return 0;
3059
3060  /* Assuming that it is unlikely an argument register will be used in
3061     the first cycle of the called function, we reduce the cost for
3062     slightly better scheduling of dep_insn.  When not hinted, the
3063     mispredicted branch would hide the cost as well.  */
3064  if (CALL_P (insn))
3065  {
3066    rtx target = get_branch_target (insn);
3067    if (GET_CODE (target) != REG || !set_of (target, insn))
3068      return cost - 2;
3069    return cost;
3070  }
3071
3072  /* And when returning from a function, let's assume the return values
3073     are completed sooner too. */
3074  if (CALL_P (dep_insn))
3075    return cost - 2;
3076
3077	  /* Make sure an instruction that loads from the back chain is scheduled
3078     away from the return instruction so a hint is more likely to get
3079     issued. */
3080  if (INSN_CODE (insn) == CODE_FOR__return
3081      && (set = single_set (dep_insn))
3082      && GET_CODE (SET_DEST (set)) == REG
3083      && REGNO (SET_DEST (set)) == LINK_REGISTER_REGNUM)
3084    return 20;
3085
3086  /* The dfa scheduler sets cost to 0 for all anti-dependencies and the
3087     scheduler makes every insn in a block anti-dependent on the final
3088     jump_insn.  We adjust here so higher cost insns will get scheduled
3089     earlier. */
3090  if (JUMP_P (insn) && REG_NOTE_KIND (link) == REG_DEP_ANTI)
3091    return insn_cost (dep_insn) - 3;
3092
3093  return cost;
3094}
3095
3096/* Create a CONST_DOUBLE from a string.  */
3097rtx
3098spu_float_const (const char *string, machine_mode mode)
3099{
3100  REAL_VALUE_TYPE value;
3101  value = REAL_VALUE_ATOF (string, mode);
3102  return CONST_DOUBLE_FROM_REAL_VALUE (value, mode);
3103}
3104
3105int
3106spu_constant_address_p (rtx x)
3107{
3108  return (GET_CODE (x) == LABEL_REF || GET_CODE (x) == SYMBOL_REF
3109	  || GET_CODE (x) == CONST_INT || GET_CODE (x) == CONST
3110	  || GET_CODE (x) == HIGH);
3111}
3112
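	/* Classify VAL according to which single il* instruction can load it.
	   Illustrative examples: 0x1234 -> SPU_IL, 0x23456 -> SPU_ILA,
	   0x12341234 -> SPU_ILH, 0x12340000 -> SPU_ILHU.  */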
3113static enum spu_immediate
3114which_immediate_load (HOST_WIDE_INT val)
3115{
3116  gcc_assert (val == trunc_int_for_mode (val, SImode));
3117
3118  if (val >= -0x8000 && val <= 0x7fff)
3119    return SPU_IL;
3120  if (val >= 0 && val <= 0x3ffff)
3121    return SPU_ILA;
3122  if ((val & 0xffff) == ((val >> 16) & 0xffff))
3123    return SPU_ILH;
3124  if ((val & 0xffff) == 0)
3125    return SPU_ILHU;
3126
3127  return SPU_NONE;
3128}
3129
3130	/* Return true when OP can be loaded by one of the il instructions, or
3131	   (before epilogue_completed) when OP can be loaded using ilhu and iohl. */
3132int
3133immediate_load_p (rtx op, machine_mode mode)
3134{
3135  if (CONSTANT_P (op))
3136    {
3137      enum immediate_class c = classify_immediate (op, mode);
3138      return c == IC_IL1 || c == IC_IL1s
3139	     || (!epilogue_completed && (c == IC_IL2 || c == IC_IL2s));
3140    }
3141  return 0;
3142}
3143
3144	/* Return true if the first SIZE bytes of ARR form a constant that can
3145	   be generated with cbd, chd, cwd or cdd.  When non-NULL, *PRUN and
3146	   *PSTART are set to the run length and start offset to use. */
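	/* The constants accepted look like the shuffle patterns those
	   instructions generate: the identity bytes 0x10..0x1f with one
	   naturally aligned run replaced by 0x03 (cbd), 0x02 0x03 (chd),
	   0x00..0x03 (cwd) or 0x00..0x07 (cdd).  */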
3147static int
3148cpat_info(unsigned char *arr, int size, int *prun, int *pstart)
3149{
3150  int cpat, run, i, start;
3151  cpat = 1;
3152  run = 0;
3153  start = -1;
3154  for (i = 0; i < size && cpat; i++)
3155    if (arr[i] != i+16)
3156      {
3157	if (!run)
3158	  {
3159	    start = i;
3160	    if (arr[i] == 3)
3161	      run = 1;
3162	    else if (arr[i] == 2 && arr[i+1] == 3)
3163	      run = 2;
3164	    else if (arr[i] == 0)
3165	      {
3166			while (i+run < 16 && arr[i+run] == run)
3167		  run++;
3168		if (run != 4 && run != 8)
3169		  cpat = 0;
3170	      }
3171	    else
3172	      cpat = 0;
3173	    if ((i & (run-1)) != 0)
3174	      cpat = 0;
3175	    i += run;
3176	  }
3177	else
3178	  cpat = 0;
3179      }
3180  if (cpat && (run || size < 16))
3181    {
3182      if (run == 0)
3183	run = 1;
3184      if (prun)
3185	*prun = run;
3186      if (pstart)
3187	*pstart = start == -1 ? 16-run : start;
3188      return 1;
3189    }
3190  return 0;
3191}
3192
3193/* OP is a CONSTANT_P.  Determine what instructions can be used to load
3194   it into a register.  MODE is only valid when OP is a CONST_INT. */
3195static enum immediate_class
3196classify_immediate (rtx op, machine_mode mode)
3197{
3198  HOST_WIDE_INT val;
3199  unsigned char arr[16];
3200  int i, j, repeated, fsmbi, repeat;
3201
3202  gcc_assert (CONSTANT_P (op));
3203
3204  if (GET_MODE (op) != VOIDmode)
3205    mode = GET_MODE (op);
3206
3207  /* A V4SI const_vector with all identical symbols is ok. */
3208  if (!flag_pic
3209      && mode == V4SImode
3210      && GET_CODE (op) == CONST_VECTOR
3211      && GET_CODE (CONST_VECTOR_ELT (op, 0)) != CONST_INT
3212      && GET_CODE (CONST_VECTOR_ELT (op, 0)) != CONST_DOUBLE
3213      && CONST_VECTOR_ELT (op, 0) == CONST_VECTOR_ELT (op, 1)
3214      && CONST_VECTOR_ELT (op, 1) == CONST_VECTOR_ELT (op, 2)
3215      && CONST_VECTOR_ELT (op, 2) == CONST_VECTOR_ELT (op, 3))
3216    op = CONST_VECTOR_ELT (op, 0);
3217
3218  switch (GET_CODE (op))
3219    {
3220    case SYMBOL_REF:
3221    case LABEL_REF:
3222      return TARGET_LARGE_MEM ? IC_IL2s : IC_IL1s;
3223
3224    case CONST:
3225      /* We can never know if the resulting address fits in 18 bits and can be
3226	 loaded with ila.  For now, assume the address will not overflow if
3227	 the displacement is "small" (fits 'K' constraint).  */
3228      if (!TARGET_LARGE_MEM && GET_CODE (XEXP (op, 0)) == PLUS)
3229	{
3230	  rtx sym = XEXP (XEXP (op, 0), 0);
3231	  rtx cst = XEXP (XEXP (op, 0), 1);
3232
3233	  if (GET_CODE (sym) == SYMBOL_REF
3234	      && GET_CODE (cst) == CONST_INT
3235	      && satisfies_constraint_K (cst))
3236	    return IC_IL1s;
3237	}
3238      return IC_IL2s;
3239
3240    case HIGH:
3241      return IC_IL1s;
3242
3243    case CONST_VECTOR:
3244      for (i = 0; i < GET_MODE_NUNITS (mode); i++)
3245	if (GET_CODE (CONST_VECTOR_ELT (op, i)) != CONST_INT
3246	    && GET_CODE (CONST_VECTOR_ELT (op, i)) != CONST_DOUBLE)
3247	  return IC_POOL;
3248      /* Fall through. */
3249
3250    case CONST_INT:
3251    case CONST_DOUBLE:
3252      constant_to_array (mode, op, arr);
3253
3254      /* Check that each 4-byte slot is identical. */
3255      repeated = 1;
3256      for (i = 4; i < 16; i += 4)
3257	for (j = 0; j < 4; j++)
3258	  if (arr[j] != arr[i + j])
3259	    repeated = 0;
3260
3261      if (repeated)
3262	{
3263	  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3264	  val = trunc_int_for_mode (val, SImode);
3265
3266	  if (which_immediate_load (val) != SPU_NONE)
3267	    return IC_IL1;
3268	}
3269
3270      /* Any mode of 2 bytes or smaller can be loaded with an il
3271         instruction. */
3272      gcc_assert (GET_MODE_SIZE (mode) > 2);
3273
3274      fsmbi = 1;
3275      repeat = 0;
3276      for (i = 0; i < 16 && fsmbi; i++)
3277	if (arr[i] != 0 && repeat == 0)
3278	  repeat = arr[i];
3279	else if (arr[i] != 0 && arr[i] != repeat)
3280	  fsmbi = 0;
3281      if (fsmbi)
3282	return repeat == 0xff ? IC_FSMBI : IC_FSMBI2;
3283
3284      if (cpat_info (arr, GET_MODE_SIZE (mode), 0, 0))
3285	return IC_CPAT;
3286
3287      if (repeated)
3288	return IC_IL2;
3289
3290      return IC_POOL;
3291    default:
3292      break;
3293    }
3294  gcc_unreachable ();
3295}
3296
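	/* Like which_immediate_load, but classify VAL for the immediate forms of
	   the logical-or instructions: ori, orhi, orbi and iohl.  */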
3297static enum spu_immediate
3298which_logical_immediate (HOST_WIDE_INT val)
3299{
3300  gcc_assert (val == trunc_int_for_mode (val, SImode));
3301
3302  if (val >= -0x200 && val <= 0x1ff)
3303    return SPU_ORI;
3304  if (val >= 0 && val <= 0xffff)
3305    return SPU_IOHL;
3306  if ((val & 0xffff) == ((val >> 16) & 0xffff))
3307    {
3308      val = trunc_int_for_mode (val, HImode);
3309      if (val >= -0x200 && val <= 0x1ff)
3310	return SPU_ORHI;
3311      if ((val & 0xff) == ((val >> 8) & 0xff))
3312	{
3313	  val = trunc_int_for_mode (val, QImode);
3314	  if (val >= -0x200 && val <= 0x1ff)
3315	    return SPU_ORBI;
3316	}
3317    }
3318  return SPU_NONE;
3319}
3320
3321/* Return TRUE when X, a CONST_VECTOR, only contains CONST_INTs or
3322   CONST_DOUBLEs. */
3323static int
3324const_vector_immediate_p (rtx x)
3325{
3326  int i;
3327  gcc_assert (GET_CODE (x) == CONST_VECTOR);
3328  for (i = 0; i < GET_MODE_NUNITS (GET_MODE (x)); i++)
3329    if (GET_CODE (CONST_VECTOR_ELT (x, i)) != CONST_INT
3330	&& GET_CODE (CONST_VECTOR_ELT (x, i)) != CONST_DOUBLE)
3331      return 0;
3332  return 1;
3333}
3334
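	/* Return 1 when OP is a constant whose repeated 32-bit value can be used
	   as the immediate of a single ori, orhi or orbi instruction.  */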
3335int
3336logical_immediate_p (rtx op, machine_mode mode)
3337{
3338  HOST_WIDE_INT val;
3339  unsigned char arr[16];
3340  int i, j;
3341
3342  gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3343	      || GET_CODE (op) == CONST_VECTOR);
3344
3345  if (GET_CODE (op) == CONST_VECTOR
3346      && !const_vector_immediate_p (op))
3347    return 0;
3348
3349  if (GET_MODE (op) != VOIDmode)
3350    mode = GET_MODE (op);
3351
3352  constant_to_array (mode, op, arr);
3353
3354  /* Check that bytes are repeated. */
3355  for (i = 4; i < 16; i += 4)
3356    for (j = 0; j < 4; j++)
3357      if (arr[j] != arr[i + j])
3358	return 0;
3359
3360  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3361  val = trunc_int_for_mode (val, SImode);
3362
3363  i = which_logical_immediate (val);
3364  return i != SPU_NONE && i != SPU_IOHL;
3365}
3366
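	/* Return 1 when OP is a constant whose repeated 32-bit value fits the
	   unsigned 16-bit immediate of iohl.  */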
3367int
3368iohl_immediate_p (rtx op, machine_mode mode)
3369{
3370  HOST_WIDE_INT val;
3371  unsigned char arr[16];
3372  int i, j;
3373
3374  gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3375	      || GET_CODE (op) == CONST_VECTOR);
3376
3377  if (GET_CODE (op) == CONST_VECTOR
3378      && !const_vector_immediate_p (op))
3379    return 0;
3380
3381  if (GET_MODE (op) != VOIDmode)
3382    mode = GET_MODE (op);
3383
3384  constant_to_array (mode, op, arr);
3385
3386  /* Check that bytes are repeated. */
3387  for (i = 4; i < 16; i += 4)
3388    for (j = 0; j < 4; j++)
3389      if (arr[j] != arr[i + j])
3390	return 0;
3391
3392  val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
3393  val = trunc_int_for_mode (val, SImode);
3394
3395  return val >= 0 && val <= 0xffff;
3396}
3397
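	/* Return 1 when OP is a constant (scalar, or vector with identical
	   elements) whose element value, truncated to its integer mode, lies
	   between LOW and HIGH inclusive.  */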
3398int
3399arith_immediate_p (rtx op, machine_mode mode,
3400		   HOST_WIDE_INT low, HOST_WIDE_INT high)
3401{
3402  HOST_WIDE_INT val;
3403  unsigned char arr[16];
3404  int bytes, i, j;
3405
3406  gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3407	      || GET_CODE (op) == CONST_VECTOR);
3408
3409  if (GET_CODE (op) == CONST_VECTOR
3410      && !const_vector_immediate_p (op))
3411    return 0;
3412
3413  if (GET_MODE (op) != VOIDmode)
3414    mode = GET_MODE (op);
3415
3416  constant_to_array (mode, op, arr);
3417
3418  if (VECTOR_MODE_P (mode))
3419    mode = GET_MODE_INNER (mode);
3420
3421  bytes = GET_MODE_SIZE (mode);
3422  mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
3423
3424  /* Check that bytes are repeated. */
3425  for (i = bytes; i < 16; i += bytes)
3426    for (j = 0; j < bytes; j++)
3427      if (arr[j] != arr[i + j])
3428	return 0;
3429
3430  val = arr[0];
3431  for (j = 1; j < bytes; j++)
3432    val = (val << 8) | arr[j];
3433
3434  val = trunc_int_for_mode (val, mode);
3435
3436  return val >= low && val <= high;
3437}
3438
3439	/* TRUE when OP is an immediate and an exact power of 2; given that
3440	   OP is 2^scale, scale >= LOW && scale <= HIGH.  When OP is a vector,
3441	   all entries must be the same. */
3442bool
3443exp2_immediate_p (rtx op, machine_mode mode, int low, int high)
3444{
3445  machine_mode int_mode;
3446  HOST_WIDE_INT val;
3447  unsigned char arr[16];
3448  int bytes, i, j;
3449
3450  gcc_assert (GET_CODE (op) == CONST_INT || GET_CODE (op) == CONST_DOUBLE
3451	      || GET_CODE (op) == CONST_VECTOR);
3452
3453  if (GET_CODE (op) == CONST_VECTOR
3454      && !const_vector_immediate_p (op))
3455    return 0;
3456
3457  if (GET_MODE (op) != VOIDmode)
3458    mode = GET_MODE (op);
3459
3460  constant_to_array (mode, op, arr);
3461
3462  if (VECTOR_MODE_P (mode))
3463    mode = GET_MODE_INNER (mode);
3464
3465  bytes = GET_MODE_SIZE (mode);
3466  int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
3467
3468  /* Check that bytes are repeated. */
3469  for (i = bytes; i < 16; i += bytes)
3470    for (j = 0; j < bytes; j++)
3471      if (arr[j] != arr[i + j])
3472	return 0;
3473
3474  val = arr[0];
3475  for (j = 1; j < bytes; j++)
3476    val = (val << 8) | arr[j];
3477
3478  val = trunc_int_for_mode (val, int_mode);
3479
3480  /* Currently, we only handle SFmode */
3481  gcc_assert (mode == SFmode);
3482  if (mode == SFmode)
3483    {
3484      int exp = (val >> 23) - 127;
3485      return val > 0 && (val & 0x007fffff) == 0
3486	     &&  exp >= low && exp <= high;
3487    }
3488  return FALSE;
3489}
3490
3491/* Return true if X is a SYMBOL_REF to an __ea qualified variable.  */
3492
3493static bool
3494ea_symbol_ref_p (const_rtx x)
3495{
3496  tree decl;
3497
3498  if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS)
3499    {
3500      rtx plus = XEXP (x, 0);
3501      rtx op0 = XEXP (plus, 0);
3502      rtx op1 = XEXP (plus, 1);
3503      if (GET_CODE (op1) == CONST_INT)
3504	x = op0;
3505    }
3506
3507  return (GET_CODE (x) == SYMBOL_REF
3508 	  && (decl = SYMBOL_REF_DECL (x)) != 0
3509 	  && TREE_CODE (decl) == VAR_DECL
3510 	  && TYPE_ADDR_SPACE (TREE_TYPE (decl)));
3511}
3512
3513/* We accept:
3514   - any 32-bit constant (SImode, SFmode)
3515   - any constant that can be generated with fsmbi (any mode)
3516   - a 64-bit constant where the high and low bits are identical
3517     (DImode, DFmode)
3518   - a 128-bit constant where the four 32-bit words match.  */
3519bool
3520spu_legitimate_constant_p (machine_mode mode, rtx x)
3521{
3522  subrtx_iterator::array_type array;
3523  if (GET_CODE (x) == HIGH)
3524    x = XEXP (x, 0);
3525
3526  /* Reject any __ea qualified reference.  These can't appear in
3527     instructions but must be forced to the constant pool.  */
3528  FOR_EACH_SUBRTX (iter, array, x, ALL)
3529    if (ea_symbol_ref_p (*iter))
3530      return 0;
3531
3532  /* V4SI with all identical symbols is valid. */
3533  if (!flag_pic
3534      && mode == V4SImode
3535      && (GET_CODE (CONST_VECTOR_ELT (x, 0)) == SYMBOL_REF
3536	  || GET_CODE (CONST_VECTOR_ELT (x, 0)) == LABEL_REF
3537	  || GET_CODE (CONST_VECTOR_ELT (x, 0)) == CONST))
3538    return CONST_VECTOR_ELT (x, 0) == CONST_VECTOR_ELT (x, 1)
3539	   && CONST_VECTOR_ELT (x, 1) == CONST_VECTOR_ELT (x, 2)
3540	   && CONST_VECTOR_ELT (x, 2) == CONST_VECTOR_ELT (x, 3);
3541
3542  if (GET_CODE (x) == CONST_VECTOR
3543      && !const_vector_immediate_p (x))
3544    return 0;
3545  return 1;
3546}
3547
3548	/* Valid addresses are:
3549   - symbol_ref, label_ref, const
3550   - reg
3551   - reg + const_int, where const_int is 16 byte aligned
3552   - reg + reg, alignment doesn't matter
3553  The alignment matters in the reg+const case because lqd and stqd
3554  ignore the 4 least significant bits of the const.  We only care about
3555  16 byte modes because the expand phase will change all smaller MEM
3556  references to TImode.  */
3557static bool
3558spu_legitimate_address_p (machine_mode mode,
3559			  rtx x, bool reg_ok_strict)
3560{
3561  int aligned = GET_MODE_SIZE (mode) >= 16;
3562  if (aligned
3563      && GET_CODE (x) == AND
3564      && GET_CODE (XEXP (x, 1)) == CONST_INT
3565      && INTVAL (XEXP (x, 1)) == (HOST_WIDE_INT) - 16)
3566    x = XEXP (x, 0);
3567  switch (GET_CODE (x))
3568    {
3569    case LABEL_REF:
3570      return !TARGET_LARGE_MEM;
3571
3572    case SYMBOL_REF:
3573    case CONST:
3574      /* Keep __ea references until reload so that spu_expand_mov can see them
3575	 in MEMs.  */
3576      if (ea_symbol_ref_p (x))
3577	return !reload_in_progress && !reload_completed;
3578      return !TARGET_LARGE_MEM;
3579
3580    case CONST_INT:
3581      return INTVAL (x) >= 0 && INTVAL (x) <= 0x3ffff;
3582
3583    case SUBREG:
3584      x = XEXP (x, 0);
3585      if (REG_P (x))
3586	return 0;
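	      /* Fall through. */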
3587
3588    case REG:
3589      return INT_REG_OK_FOR_BASE_P (x, reg_ok_strict);
3590
3591    case PLUS:
3592    case LO_SUM:
3593      {
3594	rtx op0 = XEXP (x, 0);
3595	rtx op1 = XEXP (x, 1);
3596	if (GET_CODE (op0) == SUBREG)
3597	  op0 = XEXP (op0, 0);
3598	if (GET_CODE (op1) == SUBREG)
3599	  op1 = XEXP (op1, 0);
3600	if (GET_CODE (op0) == REG
3601	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
3602	    && GET_CODE (op1) == CONST_INT
3603	    && ((INTVAL (op1) >= -0x2000 && INTVAL (op1) <= 0x1fff)
3604		/* If virtual registers are involved, the displacement will
3605		   change later on anyway, so checking would be premature.
3606		   Reload will make sure the final displacement after
3607		   register elimination is OK.  */
3608		|| op0 == arg_pointer_rtx
3609		|| op0 == frame_pointer_rtx
3610		|| op0 == virtual_stack_vars_rtx)
3611	    && (!aligned || (INTVAL (op1) & 15) == 0))
3612	  return TRUE;
3613	if (GET_CODE (op0) == REG
3614	    && INT_REG_OK_FOR_BASE_P (op0, reg_ok_strict)
3615	    && GET_CODE (op1) == REG
3616	    && INT_REG_OK_FOR_INDEX_P (op1, reg_ok_strict))
3617	  return TRUE;
3618      }
3619      break;
3620
3621    default:
3622      break;
3623    }
3624  return FALSE;
3625}
3626
3627/* Like spu_legitimate_address_p, except with named addresses.  */
3628static bool
3629spu_addr_space_legitimate_address_p (machine_mode mode, rtx x,
3630				     bool reg_ok_strict, addr_space_t as)
3631{
3632  if (as == ADDR_SPACE_EA)
3633    return (REG_P (x) && (GET_MODE (x) == EAmode));
3634
3635  else if (as != ADDR_SPACE_GENERIC)
3636    gcc_unreachable ();
3637
3638  return spu_legitimate_address_p (mode, x, reg_ok_strict);
3639}
3640
3641/* When the address is reg + const_int, force the const_int into a
3642   register.  */
3643static rtx
3644spu_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
3645			machine_mode mode ATTRIBUTE_UNUSED)
3646{
3647  rtx op0, op1;
3648  /* Make sure both operands are registers.  */
3649  if (GET_CODE (x) == PLUS)
3650    {
3651      op0 = XEXP (x, 0);
3652      op1 = XEXP (x, 1);
3653      if (ALIGNED_SYMBOL_REF_P (op0))
3654	{
3655	  op0 = force_reg (Pmode, op0);
3656	  mark_reg_pointer (op0, 128);
3657	}
3658      else if (GET_CODE (op0) != REG)
3659	op0 = force_reg (Pmode, op0);
3660      if (ALIGNED_SYMBOL_REF_P (op1))
3661	{
3662	  op1 = force_reg (Pmode, op1);
3663	  mark_reg_pointer (op1, 128);
3664	}
3665      else if (GET_CODE (op1) != REG)
3666	op1 = force_reg (Pmode, op1);
3667      x = gen_rtx_PLUS (Pmode, op0, op1);
3668    }
3669  return x;
3670}
3671
3672	/* Like spu_legitimize_address, except with named address support.  */
3673static rtx
3674spu_addr_space_legitimize_address (rtx x, rtx oldx, machine_mode mode,
3675				   addr_space_t as)
3676{
3677  if (as != ADDR_SPACE_GENERIC)
3678    return x;
3679
3680  return spu_legitimize_address (x, oldx, mode);
3681}
3682
3683/* Reload reg + const_int for out-of-range displacements.  */
3684rtx
3685spu_legitimize_reload_address (rtx ad, machine_mode mode ATTRIBUTE_UNUSED,
3686			       int opnum, int type)
3687{
3688  bool removed_and = false;
3689
3690  if (GET_CODE (ad) == AND
3691      && CONST_INT_P (XEXP (ad, 1))
3692      && INTVAL (XEXP (ad, 1)) == (HOST_WIDE_INT) - 16)
3693    {
3694      ad = XEXP (ad, 0);
3695      removed_and = true;
3696    }
3697
3698  if (GET_CODE (ad) == PLUS
3699      && REG_P (XEXP (ad, 0))
3700      && CONST_INT_P (XEXP (ad, 1))
3701      && !(INTVAL (XEXP (ad, 1)) >= -0x2000
3702	   && INTVAL (XEXP (ad, 1)) <= 0x1fff))
3703    {
3704      /* Unshare the sum.  */
3705      ad = copy_rtx (ad);
3706
3707      /* Reload the displacement.  */
3708      push_reload (XEXP (ad, 1), NULL_RTX, &XEXP (ad, 1), NULL,
3709		   BASE_REG_CLASS, GET_MODE (ad), VOIDmode, 0, 0,
3710		   opnum, (enum reload_type) type);
3711
3712      /* Add back AND for alignment if we stripped it.  */
3713      if (removed_and)
3714	ad = gen_rtx_AND (GET_MODE (ad), ad, GEN_INT (-16));
3715
3716      return ad;
3717    }
3718
3719  return NULL_RTX;
3720}
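/* For illustration only (a sketch, not from the original sources): the
   d-form displacement range is the signed 14-bit range [-0x2000, 0x1fff],
   so an address such as

       (plus (reg 131) (const_int 0x3000))

   is rejected by spu_legitimate_address_p.  This hook reloads the
   out-of-range constant into a register, producing

       (plus (reg 131) (reg <0x3000>))

   which the x-form (reg + reg) patterns accept.  If the address had been
   wrapped in an AND with -16 for an aligned access, that AND is put back
   after the displacement has been reloaded.  */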
3721
3722/* Handle an attribute requiring a FUNCTION_DECL; arguments as in
3723   struct attribute_spec.handler.  */
3724static tree
3725spu_handle_fndecl_attribute (tree * node,
3726			     tree name,
3727			     tree args ATTRIBUTE_UNUSED,
3728			     int flags ATTRIBUTE_UNUSED, bool * no_add_attrs)
3729{
3730  if (TREE_CODE (*node) != FUNCTION_DECL)
3731    {
3732      warning (0, "%qE attribute only applies to functions",
3733	       name);
3734      *no_add_attrs = true;
3735    }
3736
3737  return NULL_TREE;
3738}
3739
3740/* Handle the "vector" attribute.  */
3741static tree
3742spu_handle_vector_attribute (tree * node, tree name,
3743			     tree args ATTRIBUTE_UNUSED,
3744			     int flags ATTRIBUTE_UNUSED, bool * no_add_attrs)
3745{
3746  tree type = *node, result = NULL_TREE;
3747  machine_mode mode;
3748  int unsigned_p;
3749
3750  while (POINTER_TYPE_P (type)
3751	 || TREE_CODE (type) == FUNCTION_TYPE
3752	 || TREE_CODE (type) == METHOD_TYPE || TREE_CODE (type) == ARRAY_TYPE)
3753    type = TREE_TYPE (type);
3754
3755  mode = TYPE_MODE (type);
3756
3757  unsigned_p = TYPE_UNSIGNED (type);
3758  switch (mode)
3759    {
3760    case DImode:
3761      result = (unsigned_p ? unsigned_V2DI_type_node : V2DI_type_node);
3762      break;
3763    case SImode:
3764      result = (unsigned_p ? unsigned_V4SI_type_node : V4SI_type_node);
3765      break;
3766    case HImode:
3767      result = (unsigned_p ? unsigned_V8HI_type_node : V8HI_type_node);
3768      break;
3769    case QImode:
3770      result = (unsigned_p ? unsigned_V16QI_type_node : V16QI_type_node);
3771      break;
3772    case SFmode:
3773      result = V4SF_type_node;
3774      break;
3775    case DFmode:
3776      result = V2DF_type_node;
3777      break;
3778    default:
3779      break;
3780    }
3781
3782  /* Propagate qualifiers attached to the element type
3783     onto the vector type.  */
3784  if (result && result != type && TYPE_QUALS (type))
3785    result = build_qualified_type (result, TYPE_QUALS (type));
3786
3787  *no_add_attrs = true;		/* No need to hang on to the attribute.  */
3788
3789  if (!result)
3790    warning (0, "%qE attribute ignored", name);
3791  else
3792    *node = lang_hooks.types.reconstruct_complex_type (*node, result);
3793
3794  return NULL_TREE;
3795}
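/* For illustration: applied to a scalar element type, the handler above
   rebuilds the declaration's type as the corresponding vector type, e.g.
   int -> V4SI, unsigned short -> unsigned V8HI, float -> V4SF,
   double -> V2DF, preserving any qualifiers of the element type.  Element
   modes not listed in the switch leave RESULT null, so the attribute is
   ignored with a warning.  */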
3796
3797/* Return nonzero if FUNC is a naked function.  */
3798static int
3799spu_naked_function_p (tree func)
3800{
3801  tree a;
3802
3803  if (TREE_CODE (func) != FUNCTION_DECL)
3804    abort ();
3805
3806  a = lookup_attribute ("naked", DECL_ATTRIBUTES (func));
3807  return a != NULL_TREE;
3808}
3809
3810int
3811spu_initial_elimination_offset (int from, int to)
3812{
3813  int saved_regs_size = spu_saved_regs_size ();
3814  int sp_offset = 0;
3815  if (!crtl->is_leaf || crtl->outgoing_args_size
3816      || get_frame_size () || saved_regs_size)
3817    sp_offset = STACK_POINTER_OFFSET;
3818  if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
3819    return get_frame_size () + crtl->outgoing_args_size + sp_offset;
3820  else if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
3821    return get_frame_size ();
3822  else if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
3823    return sp_offset + crtl->outgoing_args_size
3824      + get_frame_size () + saved_regs_size + STACK_POINTER_OFFSET;
3825  else if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
3826    return get_frame_size () + saved_regs_size + sp_offset;
3827  else
3828    gcc_unreachable ();
3829}
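/* A worked example of the offsets above (illustration only, assuming
   STACK_POINTER_OFFSET is 32): for a non-leaf function with
   get_frame_size () == 48, outgoing_args_size == 16 and
   spu_saved_regs_size () == 32, sp_offset is 32 and

       FRAME_POINTER -> STACK_POINTER      : 48 + 16 + 32           =  96
       FRAME_POINTER -> HARD_FRAME_POINTER : 48
       ARG_POINTER   -> STACK_POINTER      : 32 + 16 + 48 + 32 + 32 = 160
       ARG_POINTER   -> HARD_FRAME_POINTER : 48 + 32 + 32           = 112  */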
3830
3831rtx
3832spu_function_value (const_tree type, const_tree func ATTRIBUTE_UNUSED)
3833{
3834  machine_mode mode = TYPE_MODE (type);
3835  int byte_size = ((mode == BLKmode)
3836		   ? int_size_in_bytes (type) : GET_MODE_SIZE (mode));
3837
3838  /* Make sure small structs are left justified in a register. */
3839  if ((mode == BLKmode || (type && AGGREGATE_TYPE_P (type)))
3840      && byte_size <= UNITS_PER_WORD * MAX_REGISTER_RETURN && byte_size > 0)
3841    {
3842      machine_mode smode;
3843      rtvec v;
3844      int i;
3845      int nregs = (byte_size + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
3846      int n = byte_size / UNITS_PER_WORD;
3847      v = rtvec_alloc (nregs);
3848      for (i = 0; i < n; i++)
3849	{
3850	  RTVEC_ELT (v, i) = gen_rtx_EXPR_LIST (VOIDmode,
3851						gen_rtx_REG (TImode,
3852							     FIRST_RETURN_REGNUM
3853							     + i),
3854						GEN_INT (UNITS_PER_WORD * i));
3855	  byte_size -= UNITS_PER_WORD;
3856	}
3857
3858      if (n < nregs)
3859	{
3860	  if (byte_size < 4)
3861	    byte_size = 4;
3862	  smode =
3863	    smallest_mode_for_size (byte_size * BITS_PER_UNIT, MODE_INT);
3864	  RTVEC_ELT (v, n) =
3865	    gen_rtx_EXPR_LIST (VOIDmode,
3866			       gen_rtx_REG (smode, FIRST_RETURN_REGNUM + n),
3867			       GEN_INT (UNITS_PER_WORD * n));
3868	}
3869      return gen_rtx_PARALLEL (mode, v);
3870    }
3871  return gen_rtx_REG (mode, FIRST_RETURN_REGNUM);
3872}
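/* For illustration (assuming UNITS_PER_WORD is 16 and the first return
   value register is $3): returning a 20-byte struct gives nregs == 2 and
   n == 1, so the PARALLEL contains a full TImode piece in $3 at offset 0
   and, for the remaining 4 bytes, an SImode piece in $4 at offset 16,
   keeping the struct left justified in the return registers.  */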
3873
3874static rtx
3875spu_function_arg (cumulative_args_t cum_v,
3876		  machine_mode mode,
3877		  const_tree type, bool named ATTRIBUTE_UNUSED)
3878{
3879  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
3880  int byte_size;
3881
3882  if (*cum >= MAX_REGISTER_ARGS)
3883    return 0;
3884
3885  byte_size = ((mode == BLKmode)
3886	       ? int_size_in_bytes (type) : GET_MODE_SIZE (mode));
3887
3888  /* The ABI does not allow parameters to be passed partly in a
3889     register and partly on the stack. */
3890  if ((*cum + (byte_size + 15) / 16) > MAX_REGISTER_ARGS)
3891    return 0;
3892
3893  /* Make sure small structs are left justified in a register. */
3894  if ((mode == BLKmode || (type && AGGREGATE_TYPE_P (type)))
3895      && byte_size < UNITS_PER_WORD && byte_size > 0)
3896    {
3897      machine_mode smode;
3898      rtx gr_reg;
3899      if (byte_size < 4)
3900	byte_size = 4;
3901      smode = smallest_mode_for_size (byte_size * BITS_PER_UNIT, MODE_INT);
3902      gr_reg = gen_rtx_EXPR_LIST (VOIDmode,
3903				  gen_rtx_REG (smode, FIRST_ARG_REGNUM + *cum),
3904				  const0_rtx);
3905      return gen_rtx_PARALLEL (mode, gen_rtvec (1, gr_reg));
3906    }
3907  else
3908    return gen_rtx_REG (mode, FIRST_ARG_REGNUM + *cum);
3909}
3910
3911static void
3912spu_function_arg_advance (cumulative_args_t cum_v, machine_mode mode,
3913			  const_tree type, bool named ATTRIBUTE_UNUSED)
3914{
3915  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
3916
3917  *cum += (type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST
3918	   ? 1
3919	   : mode == BLKmode
3920	   ? ((int_size_in_bytes (type) + 15) / 16)
3921	   : mode == VOIDmode
3922	   ? 1
3923	   : HARD_REGNO_NREGS (cum, mode));
3924}
3925
3926/* Variable sized types are passed by reference.  */
3927static bool
3928spu_pass_by_reference (cumulative_args_t cum ATTRIBUTE_UNUSED,
3929		       machine_mode mode ATTRIBUTE_UNUSED,
3930		       const_tree type, bool named ATTRIBUTE_UNUSED)
3931{
3932  return type && TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST;
3933}
3934
3935
3936/* Var args. */
3937
3938/* Create and return the va_list datatype.
3939
3940   On SPU, va_list is an array type equivalent to
3941
3942      typedef struct __va_list_tag
3943        {
3944            void *__args __attribute__((__aligned__(16)));
3945            void *__skip __attribute__((__aligned__(16)));
3946
3947        } va_list[1];
3948
3949   where __args points to the arg that will be returned by the next
3950   va_arg(), and __skip points to the previous stack frame such that
3951   when __args == __skip we should advance __args by 32 bytes. */
3952static tree
3953spu_build_builtin_va_list (void)
3954{
3955  tree f_args, f_skip, record, type_decl;
3956  bool owp;
3957
3958  record = (*lang_hooks.types.make_type) (RECORD_TYPE);
3959
3960  type_decl =
3961    build_decl (BUILTINS_LOCATION,
3962		TYPE_DECL, get_identifier ("__va_list_tag"), record);
3963
3964  f_args = build_decl (BUILTINS_LOCATION,
3965		       FIELD_DECL, get_identifier ("__args"), ptr_type_node);
3966  f_skip = build_decl (BUILTINS_LOCATION,
3967		       FIELD_DECL, get_identifier ("__skip"), ptr_type_node);
3968
3969  DECL_FIELD_CONTEXT (f_args) = record;
3970  DECL_ALIGN (f_args) = 128;
3971  DECL_USER_ALIGN (f_args) = 1;
3972
3973  DECL_FIELD_CONTEXT (f_skip) = record;
3974  DECL_ALIGN (f_skip) = 128;
3975  DECL_USER_ALIGN (f_skip) = 1;
3976
3977  TYPE_STUB_DECL (record) = type_decl;
3978  TYPE_NAME (record) = type_decl;
3979  TYPE_FIELDS (record) = f_args;
3980  DECL_CHAIN (f_args) = f_skip;
3981
3982  /* We know this is being padded and we want it that way.  It is an
3983     internal type, so hide the warnings from the user. */
3984  owp = warn_padded;
3985  warn_padded = false;
3986
3987  layout_type (record);
3988
3989  warn_padded = owp;
3990
3991  /* The correct type is an array type of one element.  */
3992  return build_array_type (record, build_index_type (size_zero_node));
3993}
3994
3995/* Implement va_start by filling the va_list structure VALIST.
3996   NEXTARG points to the first anonymous stack argument.
3997
3998   The following global variables are used to initialize
3999   the va_list structure:
4000
4001     crtl->args.info;
4002       the CUMULATIVE_ARGS for this function
4003
4004     crtl->args.arg_offset_rtx:
4005       holds the offset of the first anonymous stack argument
4006       (relative to the virtual arg pointer).  */
4007
4008static void
4009spu_va_start (tree valist, rtx nextarg)
4010{
4011  tree f_args, f_skip;
4012  tree args, skip, t;
4013
4014  f_args = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4015  f_skip = DECL_CHAIN (f_args);
4016
4017  valist = build_simple_mem_ref (valist);
4018  args =
4019    build3 (COMPONENT_REF, TREE_TYPE (f_args), valist, f_args, NULL_TREE);
4020  skip =
4021    build3 (COMPONENT_REF, TREE_TYPE (f_skip), valist, f_skip, NULL_TREE);
4022
4023  /* Find the __args area.  */
4024  t = make_tree (TREE_TYPE (args), nextarg);
4025  if (crtl->args.pretend_args_size > 0)
4026    t = fold_build_pointer_plus_hwi (t, -STACK_POINTER_OFFSET);
4027  t = build2 (MODIFY_EXPR, TREE_TYPE (args), args, t);
4028  TREE_SIDE_EFFECTS (t) = 1;
4029  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4030
4031  /* Find the __skip area.  */
4032  t = make_tree (TREE_TYPE (skip), virtual_incoming_args_rtx);
4033  t = fold_build_pointer_plus_hwi (t, (crtl->args.pretend_args_size
4034				       - STACK_POINTER_OFFSET));
4035  t = build2 (MODIFY_EXPR, TREE_TYPE (skip), skip, t);
4036  TREE_SIDE_EFFECTS (t) = 1;
4037  expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
4038}
4039
4040/* Gimplify va_arg by updating the va_list structure
4041   VALIST as required to retrieve an argument of type
4042   TYPE, and returning that argument.
4043
4044   ret = va_arg(VALIST, TYPE);
4045
4046   generates code equivalent to:
4047
4048    paddedsize = (sizeof(TYPE) + 15) & -16;
4049    if (VALIST.__args + paddedsize > VALIST.__skip
4050	&& VALIST.__args <= VALIST.__skip)
4051      addr = VALIST.__skip + 32;
4052    else
4053      addr = VALIST.__args;
4054    VALIST.__args = addr + paddedsize;
4055    ret = *(TYPE *)addr;
4056 */
4057static tree
4058spu_gimplify_va_arg_expr (tree valist, tree type, gimple_seq * pre_p,
4059			  gimple_seq * post_p ATTRIBUTE_UNUSED)
4060{
4061  tree f_args, f_skip;
4062  tree args, skip;
4063  HOST_WIDE_INT size, rsize;
4064  tree addr, tmp;
4065  bool pass_by_reference_p;
4066
4067  f_args = TYPE_FIELDS (TREE_TYPE (va_list_type_node));
4068  f_skip = DECL_CHAIN (f_args);
4069
4070  valist = build_simple_mem_ref (valist);
4071  args =
4072    build3 (COMPONENT_REF, TREE_TYPE (f_args), valist, f_args, NULL_TREE);
4073  skip =
4074    build3 (COMPONENT_REF, TREE_TYPE (f_skip), valist, f_skip, NULL_TREE);
4075
4076  addr = create_tmp_var (ptr_type_node, "va_arg");
4077
4078  /* If an object is dynamically sized, a pointer to it is passed
4079     instead of the object itself. */
4080  pass_by_reference_p = pass_by_reference (NULL, TYPE_MODE (type), type,
4081					   false);
4082  if (pass_by_reference_p)
4083    type = build_pointer_type (type);
4084  size = int_size_in_bytes (type);
4085  rsize = ((size + UNITS_PER_WORD - 1) / UNITS_PER_WORD) * UNITS_PER_WORD;
4086
4087  /* Build the conditional expression used to calculate ADDR.  The
4088     expression will be gimplified later. */
4089  tmp = fold_build_pointer_plus_hwi (unshare_expr (args), rsize);
4090  tmp = build2 (TRUTH_AND_EXPR, boolean_type_node,
4091		build2 (GT_EXPR, boolean_type_node, tmp, unshare_expr (skip)),
4092		build2 (LE_EXPR, boolean_type_node, unshare_expr (args),
4093		unshare_expr (skip)));
4094
4095  tmp = build3 (COND_EXPR, ptr_type_node, tmp,
4096		fold_build_pointer_plus_hwi (unshare_expr (skip), 32),
4097		unshare_expr (args));
4098
4099  gimplify_assign (addr, tmp, pre_p);
4100
4101  /* Update VALIST.__args.  */
4102  tmp = fold_build_pointer_plus_hwi (addr, rsize);
4103  gimplify_assign (unshare_expr (args), tmp, pre_p);
4104
4105  addr = fold_convert (build_pointer_type_for_mode (type, ptr_mode, true),
4106		       addr);
4107
4108  if (pass_by_reference_p)
4109    addr = build_va_arg_indirect_ref (addr);
4110
4111  return build_va_arg_indirect_ref (addr);
4112}
4113
4114/* Save parameter registers starting with the register that corresponds
4115   to the first unnamed parameter.  If the first unnamed parameter is
4116   on the stack then save no registers.  Set pretend_args_size to the
4117   amount of space needed to save the registers. */
4118static void
4119spu_setup_incoming_varargs (cumulative_args_t cum, machine_mode mode,
4120			    tree type, int *pretend_size, int no_rtl)
4121{
4122  if (!no_rtl)
4123    {
4124      rtx tmp;
4125      int regno;
4126      int offset;
4127      int ncum = *get_cumulative_args (cum);
4128
4129      /* cum currently points to the last named argument; we want to
4130         start at the next argument. */
4131      spu_function_arg_advance (pack_cumulative_args (&ncum), mode, type, true);
4132
4133      offset = -STACK_POINTER_OFFSET;
4134      for (regno = ncum; regno < MAX_REGISTER_ARGS; regno++)
4135	{
4136	  tmp = gen_frame_mem (V4SImode,
4137			       plus_constant (Pmode, virtual_incoming_args_rtx,
4138					      offset));
4139	  emit_move_insn (tmp,
4140			  gen_rtx_REG (V4SImode, FIRST_ARG_REGNUM + regno));
4141	  offset += 16;
4142	}
4143      *pretend_size = offset + STACK_POINTER_OFFSET;
4144    }
4145}
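/* For illustration (hypothetical values: FIRST_ARG_REGNUM == 3 and
   MAX_REGISTER_ARGS == 72, not taken from this file): for

       int f (int a, ...);

   the single named argument occupies one slot, so registers $4 through
   $74 are dumped 16 bytes apart starting at
   virtual_incoming_args - STACK_POINTER_OFFSET, and *pretend_size ends up
   as 71 * 16 bytes.  */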
4146
4147static void
4148spu_conditional_register_usage (void)
4149{
4150  if (flag_pic)
4151    {
4152      fixed_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
4153      call_used_regs[PIC_OFFSET_TABLE_REGNUM] = 1;
4154    }
4155}
4156
4157/* This is called any time we inspect the alignment of a register for
4158   addresses.  */
4159static int
4160reg_aligned_for_addr (rtx x)
4161{
4162  int regno =
4163    REGNO (x) < FIRST_PSEUDO_REGISTER ? ORIGINAL_REGNO (x) : REGNO (x);
4164  return REGNO_POINTER_ALIGN (regno) >= 128;
4165}
4166
4167/* Encode symbol attributes (local vs. global, tls model) of a SYMBOL_REF
4168   into its SYMBOL_REF_FLAGS.  */
4169static void
4170spu_encode_section_info (tree decl, rtx rtl, int first)
4171{
4172  default_encode_section_info (decl, rtl, first);
4173
4174  /* If a variable has a forced alignment to < 16 bytes, mark it with
4175     SYMBOL_FLAG_ALIGN1.  */
4176  if (TREE_CODE (decl) == VAR_DECL
4177      && DECL_USER_ALIGN (decl) && DECL_ALIGN (decl) < 128)
4178    SYMBOL_REF_FLAGS (XEXP (rtl, 0)) |= SYMBOL_FLAG_ALIGN1;
4179}
4180
4181/* Return TRUE if we are certain the mem refers to a complete object
4182   which is both 16-byte aligned and padded to a 16-byte boundary.  This
4183   would make it safe to store with a single instruction.
4184   We guarantee the alignment and padding for static objects by aligning
4185   all of them to 16-bytes. (DATA_ALIGNMENT and CONSTANT_ALIGNMENT.)
4186   FIXME: We currently cannot guarantee this for objects on the stack
4187   because assign_parm_setup_stack calls assign_stack_local with the
4188   alignment of the parameter mode and in that case the alignment never
4189   gets adjusted by LOCAL_ALIGNMENT. */
4190static int
4191store_with_one_insn_p (rtx mem)
4192{
4193  machine_mode mode = GET_MODE (mem);
4194  rtx addr = XEXP (mem, 0);
4195  if (mode == BLKmode)
4196    return 0;
4197  if (GET_MODE_SIZE (mode) >= 16)
4198    return 1;
4199  /* Only static objects. */
4200  if (GET_CODE (addr) == SYMBOL_REF)
4201    {
4202      /* We use the associated declaration to make sure the access is
4203         referring to the whole object.
4204         We check both MEM_EXPR and SYMBOL_REF_DECL.  I'm not sure
4205         if it is necessary.  Will there be cases where one exists, and
4206         the other does not?  Will there be cases where both exist, but
4207         have different types?  */
4208      tree decl = MEM_EXPR (mem);
4209      if (decl
4210	  && TREE_CODE (decl) == VAR_DECL
4211	  && GET_MODE (mem) == TYPE_MODE (TREE_TYPE (decl)))
4212	return 1;
4213      decl = SYMBOL_REF_DECL (addr);
4214      if (decl
4215	  && TREE_CODE (decl) == VAR_DECL
4216	  && GET_MODE (mem) == TYPE_MODE (TREE_TYPE (decl)))
4217	return 1;
4218    }
4219  return 0;
4220}
4221
4222/* Return 1 when the address is not valid for a simple load and store as
4223   required by the '_mov*' patterns.  We could make this less strict
4224   for loads, but we prefer MEMs to look the same so they are more
4225   likely to be merged.  */
4226static int
4227address_needs_split (rtx mem)
4228{
4229  if (GET_MODE_SIZE (GET_MODE (mem)) < 16
4230      && (GET_MODE_SIZE (GET_MODE (mem)) < 4
4231	  || !(store_with_one_insn_p (mem)
4232	       || mem_is_padded_component_ref (mem))))
4233    return 1;
4234
4235  return 0;
4236}
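/* For example: a 16-byte or larger access never needs a split; a 4- or
   8-byte access needs one unless store_with_one_insn_p or
   mem_is_padded_component_ref shows that the whole surrounding quadword
   may safely be accessed; and a 1- or 2-byte access is always split.  */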
4237
4238static GTY(()) rtx cache_fetch;		  /* __cache_fetch function */
4239static GTY(()) rtx cache_fetch_dirty;	  /* __cache_fetch_dirty function */
4240static alias_set_type ea_alias_set = -1;  /* alias set for __ea memory */
4241
4242/* MEM is known to be an __ea qualified memory access.  Emit a call to
4243   fetch the PPU memory into local store, and set DATA_ADDR to its
4244   address in local store.  */
4245
4246static void
4247ea_load_store (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
4248{
4249  if (is_store)
4250    {
4251      rtx ndirty = GEN_INT (GET_MODE_SIZE (GET_MODE (mem)));
4252      if (!cache_fetch_dirty)
4253	cache_fetch_dirty = init_one_libfunc ("__cache_fetch_dirty");
4254      emit_library_call_value (cache_fetch_dirty, data_addr, LCT_NORMAL, Pmode,
4255			       2, ea_addr, EAmode, ndirty, SImode);
4256    }
4257  else
4258    {
4259      if (!cache_fetch)
4260	cache_fetch = init_one_libfunc ("__cache_fetch");
4261      emit_library_call_value (cache_fetch, data_addr, LCT_NORMAL, Pmode,
4262			       1, ea_addr, EAmode);
4263    }
4264}
4265
4266/* Like ea_load_store, but do the cache tag comparison and, for stores,
4267   dirty bit marking, inline.
4268
4269   The cache control data structure is an array of
4270
4271   struct __cache_tag_array
4272     {
4273        unsigned int tag_lo[4];
4274        unsigned int tag_hi[4];
4275        void *data_pointer[4];
4276        int reserved[4];
4277        vector unsigned short dirty_bits[4];
4278     }  */
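/* A worked example of the lookup below (illustration only, assuming a
   __cache_tag_array_size of 2048, i.e. 16 sets of 128 bytes each):

       ea_addr    = 0x00012345
       index_mask = 2048 - 128      = 0x780
       tag_index  = ea_addr & 0x780 = 0x300   (byte offset of the set)
       block_off  = ea_addr & 127   = 0x45
       tag        = ea_addr & -128  = 0x12300
       tag_addr   = __cache_tag_array + 0x300

   The four tag_lo words at tag_addr are then compared against the
   splatted tag in a single V4SImode compare; a hit leaves exactly one
   slot of all 1s in tag_equal.  */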
4279
4280static void
4281ea_load_store_inline (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
4282{
4283  rtx ea_addr_si;
4284  HOST_WIDE_INT v;
4285  rtx tag_size_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array_size");
4286  rtx tag_arr_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array");
4287  rtx index_mask = gen_reg_rtx (SImode);
4288  rtx tag_arr = gen_reg_rtx (Pmode);
4289  rtx splat_mask = gen_reg_rtx (TImode);
4290  rtx splat = gen_reg_rtx (V4SImode);
4291  rtx splat_hi = NULL_RTX;
4292  rtx tag_index = gen_reg_rtx (Pmode);
4293  rtx block_off = gen_reg_rtx (SImode);
4294  rtx tag_addr = gen_reg_rtx (Pmode);
4295  rtx tag = gen_reg_rtx (V4SImode);
4296  rtx cache_tag = gen_reg_rtx (V4SImode);
4297  rtx cache_tag_hi = NULL_RTX;
4298  rtx cache_ptrs = gen_reg_rtx (TImode);
4299  rtx cache_ptrs_si = gen_reg_rtx (SImode);
4300  rtx tag_equal = gen_reg_rtx (V4SImode);
4301  rtx tag_equal_hi = NULL_RTX;
4302  rtx tag_eq_pack = gen_reg_rtx (V4SImode);
4303  rtx tag_eq_pack_si = gen_reg_rtx (SImode);
4304  rtx eq_index = gen_reg_rtx (SImode);
4305  rtx bcomp, hit_label, hit_ref, cont_label;
4306  rtx_insn *insn;
4307
4308  if (spu_ea_model != 32)
4309    {
4310      splat_hi = gen_reg_rtx (V4SImode);
4311      cache_tag_hi = gen_reg_rtx (V4SImode);
4312      tag_equal_hi = gen_reg_rtx (V4SImode);
4313    }
4314
4315  emit_move_insn (index_mask, plus_constant (Pmode, tag_size_sym, -128));
4316  emit_move_insn (tag_arr, tag_arr_sym);
4317  v = 0x0001020300010203LL;
4318  emit_move_insn (splat_mask, immed_double_const (v, v, TImode));
4319  ea_addr_si = ea_addr;
4320  if (spu_ea_model != 32)
4321    ea_addr_si = convert_to_mode (SImode, ea_addr, 1);
4322
4323  /* tag_index = ea_addr & (tag_array_size - 128)  */
4324  emit_insn (gen_andsi3 (tag_index, ea_addr_si, index_mask));
4325
4326  /* splat ea_addr to all 4 slots.  */
4327  emit_insn (gen_shufb (splat, ea_addr_si, ea_addr_si, splat_mask));
4328  /* Similarly for high 32 bits of ea_addr.  */
4329  if (spu_ea_model != 32)
4330    emit_insn (gen_shufb (splat_hi, ea_addr, ea_addr, splat_mask));
4331
4332  /* block_off = ea_addr & 127  */
4333  emit_insn (gen_andsi3 (block_off, ea_addr_si, spu_const (SImode, 127)));
4334
4335  /* tag_addr = tag_arr + tag_index  */
4336  emit_insn (gen_addsi3 (tag_addr, tag_arr, tag_index));
4337
4338  /* Read cache tags.  */
4339  emit_move_insn (cache_tag, gen_rtx_MEM (V4SImode, tag_addr));
4340  if (spu_ea_model != 32)
4341    emit_move_insn (cache_tag_hi, gen_rtx_MEM (V4SImode,
4342					       plus_constant (Pmode,
4343							      tag_addr, 16)));
4344
4345  /* tag = ea_addr & -128  */
4346  emit_insn (gen_andv4si3 (tag, splat, spu_const (V4SImode, -128)));
4347
4348  /* Read all four cache data pointers.  */
4349  emit_move_insn (cache_ptrs, gen_rtx_MEM (TImode,
4350					   plus_constant (Pmode,
4351							  tag_addr, 32)));
4352
4353  /* Compare tags.  */
4354  emit_insn (gen_ceq_v4si (tag_equal, tag, cache_tag));
4355  if (spu_ea_model != 32)
4356    {
4357      emit_insn (gen_ceq_v4si (tag_equal_hi, splat_hi, cache_tag_hi));
4358      emit_insn (gen_andv4si3 (tag_equal, tag_equal, tag_equal_hi));
4359    }
4360
4361  /* At most one of the tags compare equal, so tag_equal has one
4362     32-bit slot set to all 1's, with the other slots all zero.
4363     gbb picks off low bit from each byte in the 128-bit registers,
4364     so tag_eq_pack is one of 0xf000, 0x0f00, 0x00f0, 0x000f, assuming
4365     we have a hit.  */
4366  emit_insn (gen_spu_gbb (tag_eq_pack, spu_gen_subreg (V16QImode, tag_equal)));
4367  emit_insn (gen_spu_convert (tag_eq_pack_si, tag_eq_pack));
4368
4369  /* So counting leading zeros will set eq_index to 16, 20, 24 or 28.  */
4370  emit_insn (gen_clzsi2 (eq_index, tag_eq_pack_si));
4371
4372  /* This allows us to rotate the corresponding cache data pointer into
4373     slot 0 (rotating by eq_index mod 16 bytes).  */
4374  emit_insn (gen_rotqby_ti (cache_ptrs, cache_ptrs, eq_index));
4375  emit_insn (gen_spu_convert (cache_ptrs_si, cache_ptrs));
4376
4377  /* Add block offset to form final data address.  */
4378  emit_insn (gen_addsi3 (data_addr, cache_ptrs_si, block_off));
4379
4380  /* Check that we did hit.  */
4381  hit_label = gen_label_rtx ();
4382  hit_ref = gen_rtx_LABEL_REF (VOIDmode, hit_label);
4383  bcomp = gen_rtx_NE (SImode, tag_eq_pack_si, const0_rtx);
4384  insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
4385				      gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
4386							    hit_ref, pc_rtx)));
4387  /* Say that this branch is very likely to happen.  */
4388  v = REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 - 1;
4389  add_int_reg_note (insn, REG_BR_PROB, v);
4390
4391  ea_load_store (mem, is_store, ea_addr, data_addr);
4392  cont_label = gen_label_rtx ();
4393  emit_jump_insn (gen_jump (cont_label));
4394  emit_barrier ();
4395
4396  emit_label (hit_label);
4397
4398  if (is_store)
4399    {
4400      HOST_WIDE_INT v_hi;
4401      rtx dirty_bits = gen_reg_rtx (TImode);
4402      rtx dirty_off = gen_reg_rtx (SImode);
4403      rtx dirty_128 = gen_reg_rtx (TImode);
4404      rtx neg_block_off = gen_reg_rtx (SImode);
4405
4406      /* Set up mask with one dirty bit per byte of the mem we are
4407	 writing, starting from top bit.  */
4408      v_hi = v = -1;
4409      v <<= (128 - GET_MODE_SIZE (GET_MODE (mem))) & 63;
4410      if ((128 - GET_MODE_SIZE (GET_MODE (mem))) >= 64)
4411	{
4412	  v_hi = v;
4413	  v = 0;
4414	}
4415      emit_move_insn (dirty_bits, immed_double_const (v, v_hi, TImode));
4416
4417      /* Form index into cache dirty_bits.  eq_index is one of
4418	 0x10, 0x14, 0x18 or 0x1c.  Multiplying by 4 gives us
4419	 0x40, 0x50, 0x60 or 0x70 which just happens to be the
4420	 offset to each of the four dirty_bits elements.  */
4421      emit_insn (gen_ashlsi3 (dirty_off, eq_index, spu_const (SImode, 2)));
4422
4423      emit_insn (gen_spu_lqx (dirty_128, tag_addr, dirty_off));
4424
4425      /* Rotate bit mask to proper bit.  */
4426      emit_insn (gen_negsi2 (neg_block_off, block_off));
4427      emit_insn (gen_rotqbybi_ti (dirty_bits, dirty_bits, neg_block_off));
4428      emit_insn (gen_rotqbi_ti (dirty_bits, dirty_bits, neg_block_off));
4429
4430      /* Or in the new dirty bits.  */
4431      emit_insn (gen_iorti3 (dirty_128, dirty_bits, dirty_128));
4432
4433      /* Store.  */
4434      emit_insn (gen_spu_stqx (dirty_128, tag_addr, dirty_off));
4435    }
4436
4437  emit_label (cont_label);
4438}
4439
4440static rtx
4441expand_ea_mem (rtx mem, bool is_store)
4442{
4443  rtx ea_addr;
4444  rtx data_addr = gen_reg_rtx (Pmode);
4445  rtx new_mem;
4446
4447  ea_addr = force_reg (EAmode, XEXP (mem, 0));
4448  if (optimize_size || optimize == 0)
4449    ea_load_store (mem, is_store, ea_addr, data_addr);
4450  else
4451    ea_load_store_inline (mem, is_store, ea_addr, data_addr);
4452
4453  if (ea_alias_set == -1)
4454    ea_alias_set = new_alias_set ();
4455
4456  /* We generate a new MEM RTX to refer to the copy of the data
4457     in the cache.  We do not copy memory attributes (except the
4458     alignment) from the original MEM, as they may no longer apply
4459     to the cache copy.  */
4460  new_mem = gen_rtx_MEM (GET_MODE (mem), data_addr);
4461  set_mem_alias_set (new_mem, ea_alias_set);
4462  set_mem_align (new_mem, MIN (MEM_ALIGN (mem), 128 * 8));
4463
4464  return new_mem;
4465}
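/* For illustration: a source-level access through the __ea address space,
   for example

       __ea int *p;
       ...
       x = *p;

   reaches spu_expand_mov below with an __ea-qualified MEM.  expand_ea_mem
   then emits either the out-of-line __cache_fetch call or the inline tag
   lookup above, and the actual load or store is performed on the local
   store copy whose address ends up in data_addr.  */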
4466
4467int
4468spu_expand_mov (rtx * ops, machine_mode mode)
4469{
4470  if (GET_CODE (ops[0]) == SUBREG && !valid_subreg (ops[0]))
4471    {
4472      /* Perform the move in the destination SUBREG's inner mode.  */
4473      ops[0] = SUBREG_REG (ops[0]);
4474      mode = GET_MODE (ops[0]);
4475      ops[1] = gen_lowpart_common (mode, ops[1]);
4476      gcc_assert (ops[1]);
4477    }
4478
4479  if (GET_CODE (ops[1]) == SUBREG && !valid_subreg (ops[1]))
4480    {
4481      rtx from = SUBREG_REG (ops[1]);
4482      machine_mode imode = int_mode_for_mode (GET_MODE (from));
4483
4484      gcc_assert (GET_MODE_CLASS (mode) == MODE_INT
4485		  && GET_MODE_CLASS (imode) == MODE_INT
4486		  && subreg_lowpart_p (ops[1]));
4487
4488      if (GET_MODE_SIZE (imode) < 4)
4489	imode = SImode;
4490      if (imode != GET_MODE (from))
4491	from = gen_rtx_SUBREG (imode, from, 0);
4492
4493      if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (imode))
4494	{
4495	  enum insn_code icode = convert_optab_handler (trunc_optab,
4496							mode, imode);
4497	  emit_insn (GEN_FCN (icode) (ops[0], from));
4498	}
4499      else
4500	emit_insn (gen_extend_insn (ops[0], from, mode, imode, 1));
4501      return 1;
4502    }
4503
4504  /* At least one of the operands needs to be a register. */
4505  if ((reload_in_progress | reload_completed) == 0
4506      && !register_operand (ops[0], mode) && !register_operand (ops[1], mode))
4507    {
4508      rtx temp = force_reg (mode, ops[1]);
4509      emit_move_insn (ops[0], temp);
4510      return 1;
4511    }
4512  if (reload_in_progress || reload_completed)
4513    {
4514      if (CONSTANT_P (ops[1]))
4515	return spu_split_immediate (ops);
4516      return 0;
4517    }
4518
4519  /* Catch the SImode immediates greater than 0x7fffffff, and sign
4520     extend them. */
4521  if (GET_CODE (ops[1]) == CONST_INT)
4522    {
4523      HOST_WIDE_INT val = trunc_int_for_mode (INTVAL (ops[1]), mode);
4524      if (val != INTVAL (ops[1]))
4525	{
4526	  emit_move_insn (ops[0], GEN_INT (val));
4527	  return 1;
4528	}
4529    }
4530  if (MEM_P (ops[0]))
4531    {
4532      if (MEM_ADDR_SPACE (ops[0]))
4533	ops[0] = expand_ea_mem (ops[0], true);
4534      return spu_split_store (ops);
4535    }
4536  if (MEM_P (ops[1]))
4537    {
4538      if (MEM_ADDR_SPACE (ops[1]))
4539	ops[1] = expand_ea_mem (ops[1], false);
4540      return spu_split_load (ops);
4541    }
4542
4543  return 0;
4544}
4545
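/* Move the value held in the left slots of the TImode register SRC into
   DST, whose mode is at most 8 bytes wide: shift SRC right by 64 bits for
   DImode-sized values and by 96 bits otherwise, truncate to the matching
   integer mode, and take a subreg back to DST's mode if it is not an
   integer mode.  */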
4546static void
4547spu_convert_move (rtx dst, rtx src)
4548{
4549  machine_mode mode = GET_MODE (dst);
4550  machine_mode int_mode = mode_for_size (GET_MODE_BITSIZE (mode), MODE_INT, 0);
4551  rtx reg;
4552  gcc_assert (GET_MODE (src) == TImode);
4553  reg = int_mode != mode ? gen_reg_rtx (int_mode) : dst;
4554  emit_insn (gen_rtx_SET (VOIDmode, reg,
4555	       gen_rtx_TRUNCATE (int_mode,
4556		 gen_rtx_LSHIFTRT (TImode, src,
4557		   GEN_INT (int_mode == DImode ? 64 : 96)))));
4558  if (int_mode != mode)
4559    {
4560      reg = simplify_gen_subreg (mode, reg, int_mode, 0);
4561      emit_move_insn (dst, reg);
4562    }
4563}
4564
4565/* Load TImode values into DST0 and DST1 (when it is non-NULL) using
4566   the address from SRC and SRC+16.  Return a REG or CONST_INT that
4567   specifies how many bytes to rotate the loaded registers, plus any
4568   extra from EXTRA_ROTQBY.  The address and rotate amounts are
4569   normalized to improve merging of loads and rotate computations. */
4570static rtx
4571spu_expand_load (rtx dst0, rtx dst1, rtx src, int extra_rotby)
4572{
4573  rtx addr = XEXP (src, 0);
4574  rtx p0, p1, rot, addr0, addr1;
4575  int rot_amt;
4576
4577  rot = 0;
4578  rot_amt = 0;
4579
4580  if (MEM_ALIGN (src) >= 128)
4581    /* Address is already aligned; simply perform a TImode load.  */ ;
4582  else if (GET_CODE (addr) == PLUS)
4583    {
4584      /* 8 cases:
4585         aligned reg   + aligned reg     => lqx
4586         aligned reg   + unaligned reg   => lqx, rotqby
4587         aligned reg   + aligned const   => lqd
4588         aligned reg   + unaligned const => lqd, rotqbyi
4589         unaligned reg + aligned reg     => lqx, rotqby
4590         unaligned reg + unaligned reg   => lqx, a, rotqby (1 scratch)
4591         unaligned reg + aligned const   => lqd, rotqby
4592         unaligned reg + unaligned const -> not allowed by legitimate address
4593       */
4594      p0 = XEXP (addr, 0);
4595      p1 = XEXP (addr, 1);
4596      if (!reg_aligned_for_addr (p0))
4597	{
4598	  if (REG_P (p1) && !reg_aligned_for_addr (p1))
4599	    {
4600	      rot = gen_reg_rtx (SImode);
4601	      emit_insn (gen_addsi3 (rot, p0, p1));
4602	    }
4603	  else if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
4604	    {
4605	      if (INTVAL (p1) > 0
4606		  && REG_POINTER (p0)
4607		  && INTVAL (p1) * BITS_PER_UNIT
4608		     < REGNO_POINTER_ALIGN (REGNO (p0)))
4609		{
4610		  rot = gen_reg_rtx (SImode);
4611		  emit_insn (gen_addsi3 (rot, p0, p1));
4612		  addr = p0;
4613		}
4614	      else
4615		{
4616		  rtx x = gen_reg_rtx (SImode);
4617		  emit_move_insn (x, p1);
4618		  if (!spu_arith_operand (p1, SImode))
4619		    p1 = x;
4620		  rot = gen_reg_rtx (SImode);
4621		  emit_insn (gen_addsi3 (rot, p0, p1));
4622		  addr = gen_rtx_PLUS (Pmode, p0, x);
4623		}
4624	    }
4625	  else
4626	    rot = p0;
4627	}
4628      else
4629	{
4630	  if (GET_CODE (p1) == CONST_INT && (INTVAL (p1) & 15))
4631	    {
4632	      rot_amt = INTVAL (p1) & 15;
4633	      if (INTVAL (p1) & -16)
4634		{
4635		  p1 = GEN_INT (INTVAL (p1) & -16);
4636		  addr = gen_rtx_PLUS (SImode, p0, p1);
4637		}
4638	      else
4639		addr = p0;
4640	    }
4641	  else if (REG_P (p1) && !reg_aligned_for_addr (p1))
4642	    rot = p1;
4643	}
4644    }
4645  else if (REG_P (addr))
4646    {
4647      if (!reg_aligned_for_addr (addr))
4648	rot = addr;
4649    }
4650  else if (GET_CODE (addr) == CONST)
4651    {
4652      if (GET_CODE (XEXP (addr, 0)) == PLUS
4653	  && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
4654	  && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
4655	{
4656	  rot_amt = INTVAL (XEXP (XEXP (addr, 0), 1));
4657	  if (rot_amt & -16)
4658	    addr = gen_rtx_CONST (Pmode,
4659				  gen_rtx_PLUS (Pmode,
4660						XEXP (XEXP (addr, 0), 0),
4661						GEN_INT (rot_amt & -16)));
4662	  else
4663	    addr = XEXP (XEXP (addr, 0), 0);
4664	}
4665      else
4666	{
4667	  rot = gen_reg_rtx (Pmode);
4668	  emit_move_insn (rot, addr);
4669	}
4670    }
4671  else if (GET_CODE (addr) == CONST_INT)
4672    {
4673      rot_amt = INTVAL (addr);
4674      addr = GEN_INT (rot_amt & -16);
4675    }
4676  else if (!ALIGNED_SYMBOL_REF_P (addr))
4677    {
4678      rot = gen_reg_rtx (Pmode);
4679      emit_move_insn (rot, addr);
4680    }
4681
4682  rot_amt += extra_rotby;
4683
4684  rot_amt &= 15;
4685
4686  if (rot && rot_amt)
4687    {
4688      rtx x = gen_reg_rtx (SImode);
4689      emit_insn (gen_addsi3 (x, rot, GEN_INT (rot_amt)));
4690      rot = x;
4691      rot_amt = 0;
4692    }
4693  if (!rot && rot_amt)
4694    rot = GEN_INT (rot_amt);
4695
4696  addr0 = copy_rtx (addr);
4697  addr0 = gen_rtx_AND (SImode, copy_rtx (addr), GEN_INT (-16));
4698  emit_insn (gen__movti (dst0, change_address (src, TImode, addr0)));
4699
4700  if (dst1)
4701    {
4702      addr1 = plus_constant (SImode, copy_rtx (addr), 16);
4703      addr1 = gen_rtx_AND (SImode, addr1, GEN_INT (-16));
4704      emit_insn (gen__movti (dst1, change_address (src, TImode, addr1)));
4705    }
4706
4707  return rot;
4708}
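/* A small example of the normalization above (illustration only): for a
   4-byte load from (plus (reg P) (const_int 7)) where P is known to be
   16-byte aligned, no register rotate is needed and ROT_AMT becomes 7, so
   the quadword is loaded from (and P -16) and spu_split_load then rotates
   the result left by 7 bytes so the wanted word lands in the preferred
   slot.  */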
4709
4710int
4711spu_split_load (rtx * ops)
4712{
4713  machine_mode mode = GET_MODE (ops[0]);
4714  rtx addr, load, rot;
4715  int rot_amt;
4716
4717  if (GET_MODE_SIZE (mode) >= 16)
4718    return 0;
4719
4720  addr = XEXP (ops[1], 0);
4721  gcc_assert (GET_CODE (addr) != AND);
4722
4723  if (!address_needs_split (ops[1]))
4724    {
4725      ops[1] = change_address (ops[1], TImode, addr);
4726      load = gen_reg_rtx (TImode);
4727      emit_insn (gen__movti (load, ops[1]));
4728      spu_convert_move (ops[0], load);
4729      return 1;
4730    }
4731
4732  rot_amt = GET_MODE_SIZE (mode) < 4 ? GET_MODE_SIZE (mode) - 4 : 0;
4733
4734  load = gen_reg_rtx (TImode);
4735  rot = spu_expand_load (load, 0, ops[1], rot_amt);
4736
4737  if (rot)
4738    emit_insn (gen_rotqby_ti (load, load, rot));
4739
4740  spu_convert_move (ops[0], load);
4741  return 1;
4742}
4743
4744int
4745spu_split_store (rtx * ops)
4746{
4747  machine_mode mode = GET_MODE (ops[0]);
4748  rtx reg;
4749  rtx addr, p0, p1, p1_lo, smem;
4750  int aform;
4751  int scalar;
4752
4753  if (GET_MODE_SIZE (mode) >= 16)
4754    return 0;
4755
4756  addr = XEXP (ops[0], 0);
4757  gcc_assert (GET_CODE (addr) != AND);
4758
4759  if (!address_needs_split (ops[0]))
4760    {
4761      reg = gen_reg_rtx (TImode);
4762      emit_insn (gen_spu_convert (reg, ops[1]));
4763      ops[0] = change_address (ops[0], TImode, addr);
4764      emit_move_insn (ops[0], reg);
4765      return 1;
4766    }
4767
4768  if (GET_CODE (addr) == PLUS)
4769    {
4770      /* 8 cases:
4771         aligned reg   + aligned reg     => lqx, c?x, shuf, stqx
4772         aligned reg   + unaligned reg   => lqx, c?x, shuf, stqx
4773         aligned reg   + aligned const   => lqd, c?d, shuf, stqx
4774         aligned reg   + unaligned const => lqd, c?d, shuf, stqx
4775         unaligned reg + aligned reg     => lqx, c?x, shuf, stqx
4776         unaligned reg + unaligned reg   => lqx, c?x, shuf, stqx
4777         unaligned reg + aligned const   => lqd, c?d, shuf, stqx
4778         unaligned reg + unaligned const -> lqx, c?d, shuf, stqx
4779       */
4780      aform = 0;
4781      p0 = XEXP (addr, 0);
4782      p1 = p1_lo = XEXP (addr, 1);
4783      if (REG_P (p0) && GET_CODE (p1) == CONST_INT)
4784	{
4785	  p1_lo = GEN_INT (INTVAL (p1) & 15);
4786	  if (reg_aligned_for_addr (p0))
4787	    {
4788	      p1 = GEN_INT (INTVAL (p1) & -16);
4789	      if (p1 == const0_rtx)
4790		addr = p0;
4791	      else
4792		addr = gen_rtx_PLUS (SImode, p0, p1);
4793	    }
4794	  else
4795	    {
4796	      rtx x = gen_reg_rtx (SImode);
4797	      emit_move_insn (x, p1);
4798	      addr = gen_rtx_PLUS (SImode, p0, x);
4799	    }
4800	}
4801    }
4802  else if (REG_P (addr))
4803    {
4804      aform = 0;
4805      p0 = addr;
4806      p1 = p1_lo = const0_rtx;
4807    }
4808  else
4809    {
4810      aform = 1;
4811      p0 = gen_rtx_REG (SImode, STACK_POINTER_REGNUM);
4812      p1 = 0;			/* aform doesn't use p1 */
4813      p1_lo = addr;
4814      if (ALIGNED_SYMBOL_REF_P (addr))
4815	p1_lo = const0_rtx;
4816      else if (GET_CODE (addr) == CONST
4817	       && GET_CODE (XEXP (addr, 0)) == PLUS
4818	       && ALIGNED_SYMBOL_REF_P (XEXP (XEXP (addr, 0), 0))
4819	       && GET_CODE (XEXP (XEXP (addr, 0), 1)) == CONST_INT)
4820	{
4821	  HOST_WIDE_INT v = INTVAL (XEXP (XEXP (addr, 0), 1));
4822	  if ((v & -16) != 0)
4823	    addr = gen_rtx_CONST (Pmode,
4824				  gen_rtx_PLUS (Pmode,
4825						XEXP (XEXP (addr, 0), 0),
4826						GEN_INT (v & -16)));
4827	  else
4828	    addr = XEXP (XEXP (addr, 0), 0);
4829	  p1_lo = GEN_INT (v & 15);
4830	}
4831      else if (GET_CODE (addr) == CONST_INT)
4832	{
4833	  p1_lo = GEN_INT (INTVAL (addr) & 15);
4834	  addr = GEN_INT (INTVAL (addr) & -16);
4835	}
4836      else
4837	{
4838	  p1_lo = gen_reg_rtx (SImode);
4839	  emit_move_insn (p1_lo, addr);
4840	}
4841    }
4842
4843  gcc_assert (aform == 0 || aform == 1);
4844  reg = gen_reg_rtx (TImode);
4845
4846  scalar = store_with_one_insn_p (ops[0]);
4847  if (!scalar)
4848    {
4849      /* We could copy the flags from the ops[0] MEM to lmem here,
4850         but we don't because we want this load to be optimized away if
4851         possible, and copying the flags would prevent that in certain
4852         cases, e.g. consider the volatile flag. */
4853
4854      rtx pat = gen_reg_rtx (TImode);
4855      rtx lmem = change_address (ops[0], TImode, copy_rtx (addr));
4856      set_mem_alias_set (lmem, 0);
4857      emit_insn (gen_movti (reg, lmem));
4858
4859      if (!p0 || reg_aligned_for_addr (p0))
4860	p0 = stack_pointer_rtx;
4861      if (!p1_lo)
4862	p1_lo = const0_rtx;
4863
4864      emit_insn (gen_cpat (pat, p0, p1_lo, GEN_INT (GET_MODE_SIZE (mode))));
4865      emit_insn (gen_shufb (reg, ops[1], reg, pat));
4866    }
4867  else
4868    {
4869      if (GET_CODE (ops[1]) == REG)
4870	emit_insn (gen_spu_convert (reg, ops[1]));
4871      else if (GET_CODE (ops[1]) == SUBREG)
4872	emit_insn (gen_spu_convert (reg, SUBREG_REG (ops[1])));
4873      else
4874	abort ();
4875    }
4876
4877  if (GET_MODE_SIZE (mode) < 4 && scalar)
4878    emit_insn (gen_ashlti3
4879	       (reg, reg, GEN_INT (32 - GET_MODE_BITSIZE (mode))));
4880
4881  smem = change_address (ops[0], TImode, copy_rtx (addr));
4882  /* We can't use the previous alias set because the memory has changed
4883     size and can potentially overlap objects of other types.  */
4884  set_mem_alias_set (smem, 0);
4885
4886  emit_insn (gen_movti (smem, reg));
4887  return 1;
4888}
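/* A sketch of the split store above for an SImode store through an
   arbitrary register address R (illustration only):

       reg = load of the quadword containing R     (lqx/lqd)
       pat = cpat (cwd/cwx) insertion control for offset R & 15
       reg = shufb (value, reg, pat)               merge the new word
       store reg back to the same quadword         (stqx/stqd)

   When store_with_one_insn_p is true, the load/cpat/shufb steps are
   skipped and the value is simply converted into the preferred slot of a
   TImode register before the store.  */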
4889
4890/* Return TRUE if X is MEM which is a struct member reference
4891   and the member can safely be loaded and stored with a single
4892   instruction because it is padded. */
4893static int
4894mem_is_padded_component_ref (rtx x)
4895{
4896  tree t = MEM_EXPR (x);
4897  tree r;
4898  if (!t || TREE_CODE (t) != COMPONENT_REF)
4899    return 0;
4900  t = TREE_OPERAND (t, 1);
4901  if (!t || TREE_CODE (t) != FIELD_DECL
4902      || DECL_ALIGN (t) < 128 || AGGREGATE_TYPE_P (TREE_TYPE (t)))
4903    return 0;
4904  /* Only do this for RECORD_TYPEs, not UNION_TYPEs. */
4905  r = DECL_FIELD_CONTEXT (t);
4906  if (!r || TREE_CODE (r) != RECORD_TYPE)
4907    return 0;
4908  /* Make sure they have the same mode.  */
4909  if (GET_MODE (x) != TYPE_MODE (TREE_TYPE (t)))
4910    return 0;
4911  /* If there are no following fields then the field alignment assures
4912     the structure is padded to the alignment which means this field is
4913     padded too.  */
4914  if (TREE_CHAIN (t) == 0)
4915    return 1;
4916  /* If the following field is also aligned then this field will be
4917     padded. */
4918  t = TREE_CHAIN (t);
4919  if (TREE_CODE (t) == FIELD_DECL && DECL_ALIGN (t) >= 128)
4920    return 1;
4921  return 0;
4922}
4923
4924/* Parse the -mfixed-range= option string.  */
4925static void
4926fix_range (const char *const_str)
4927{
4928  int i, first, last;
4929  char *str, *dash, *comma;
4930
4931  /* str must be of the form REG1'-'REG2{,REG1'-'REG2} where REG1 and
4932     REG2 are either register names or register numbers.  The effect
4933     of this option is to mark the registers in the range from REG1 to
4934     REG2 as ``fixed'' so they won't be used by the compiler.  */
4935
4936  i = strlen (const_str);
4937  str = (char *) alloca (i + 1);
4938  memcpy (str, const_str, i + 1);
4939
4940  while (1)
4941    {
4942      dash = strchr (str, '-');
4943      if (!dash)
4944	{
4945	  warning (0, "value of -mfixed-range must have form REG1-REG2");
4946	  return;
4947	}
4948      *dash = '\0';
4949      comma = strchr (dash + 1, ',');
4950      if (comma)
4951	*comma = '\0';
4952
4953      first = decode_reg_name (str);
4954      if (first < 0)
4955	{
4956	  warning (0, "unknown register name: %s", str);
4957	  return;
4958	}
4959
4960      last = decode_reg_name (dash + 1);
4961      if (last < 0)
4962	{
4963	  warning (0, "unknown register name: %s", dash + 1);
4964	  return;
4965	}
4966
4967      *dash = '-';
4968
4969      if (first > last)
4970	{
4971	  warning (0, "%s-%s is an empty range", str, dash + 1);
4972	  return;
4973	}
4974
4975      for (i = first; i <= last; ++i)
4976	fixed_regs[i] = call_used_regs[i] = 1;
4977
4978      if (!comma)
4979	break;
4980
4981      *comma = ',';
4982      str = comma + 1;
4983    }
4984}
4985
4986/* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
4987   can be generated using the fsmbi instruction. */
4988int
4989fsmbi_const_p (rtx x)
4990{
4991  if (CONSTANT_P (x))
4992    {
4993      /* We can always choose TImode for CONST_INT because the high bits
4994         of an SImode will always be all 1s, i.e., valid for fsmbi. */
4995      enum immediate_class c = classify_immediate (x, TImode);
4996      return c == IC_FSMBI || (!epilogue_completed && c == IC_FSMBI2);
4997    }
4998  return 0;
4999}
5000
5001/* Return TRUE if x is a CONST_INT, CONST_DOUBLE or CONST_VECTOR that
5002   can be generated using the cbd, chd, cwd or cdd instruction. */
5003int
5004cpat_const_p (rtx x, machine_mode mode)
5005{
5006  if (CONSTANT_P (x))
5007    {
5008      enum immediate_class c = classify_immediate (x, mode);
5009      return c == IC_CPAT;
5010    }
5011  return 0;
5012}
5013
5014rtx
5015gen_cpat_const (rtx * ops)
5016{
5017  unsigned char dst[16];
5018  int i, offset, shift, isize;
5019  if (GET_CODE (ops[3]) != CONST_INT
5020      || GET_CODE (ops[2]) != CONST_INT
5021      || (GET_CODE (ops[1]) != CONST_INT
5022	  && GET_CODE (ops[1]) != REG))
5023    return 0;
5024  if (GET_CODE (ops[1]) == REG
5025      && (!REG_POINTER (ops[1])
5026	  || REGNO_POINTER_ALIGN (ORIGINAL_REGNO (ops[1])) < 128))
5027    return 0;
5028
5029  for (i = 0; i < 16; i++)
5030    dst[i] = i + 16;
5031  isize = INTVAL (ops[3]);
5032  if (isize == 1)
5033    shift = 3;
5034  else if (isize == 2)
5035    shift = 2;
5036  else
5037    shift = 0;
5038  offset = (INTVAL (ops[2]) +
5039	    (GET_CODE (ops[1]) ==
5040	     CONST_INT ? INTVAL (ops[1]) : 0)) & 15;
5041  for (i = 0; i < isize; i++)
5042    dst[offset + i] = i + shift;
5043  return array_to_constant (TImode, dst);
5044}
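/* A worked example (illustration only): for a 4-byte insertion at constant
   offset 2 with a zero base (ops[3] == 4, ops[2] == 2, ops[1] == 0), the
   array built above is

       { 0x10, 0x11, 0x00, 0x01, 0x02, 0x03, 0x16, 0x17, ..., 0x1f }

   i.e. bytes 2..5 of the result are taken from the first shufb operand
   (the value being inserted) and every other byte from the second (the
   original quadword), matching what a cwd instruction would generate.  */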
5045
5046/* Convert a CONST_INT, CONST_DOUBLE, or CONST_VECTOR into a 16 byte
5047   array.  Use MODE for CONST_INTs.  When the constant's mode is smaller
5048   than 16 bytes, the value is repeated across the rest of the array. */
5049void
5050constant_to_array (machine_mode mode, rtx x, unsigned char arr[16])
5051{
5052  HOST_WIDE_INT val;
5053  int i, j, first;
5054
5055  memset (arr, 0, 16);
5056  mode = GET_MODE (x) != VOIDmode ? GET_MODE (x) : mode;
5057  if (GET_CODE (x) == CONST_INT
5058      || (GET_CODE (x) == CONST_DOUBLE
5059	  && (mode == SFmode || mode == DFmode)))
5060    {
5061      gcc_assert (mode != VOIDmode && mode != BLKmode);
5062
5063      if (GET_CODE (x) == CONST_DOUBLE)
5064	val = const_double_to_hwint (x);
5065      else
5066	val = INTVAL (x);
5067      first = GET_MODE_SIZE (mode) - 1;
5068      for (i = first; i >= 0; i--)
5069	{
5070	  arr[i] = val & 0xff;
5071	  val >>= 8;
5072	}
5073      /* Splat the constant across the whole array. */
5074      for (j = 0, i = first + 1; i < 16; i++)
5075	{
5076	  arr[i] = arr[j];
5077	  j = (j == first) ? 0 : j + 1;
5078	}
5079    }
5080  else if (GET_CODE (x) == CONST_DOUBLE)
5081    {
5082      val = CONST_DOUBLE_LOW (x);
5083      for (i = 15; i >= 8; i--)
5084	{
5085	  arr[i] = val & 0xff;
5086	  val >>= 8;
5087	}
5088      val = CONST_DOUBLE_HIGH (x);
5089      for (i = 7; i >= 0; i--)
5090	{
5091	  arr[i] = val & 0xff;
5092	  val >>= 8;
5093	}
5094    }
5095  else if (GET_CODE (x) == CONST_VECTOR)
5096    {
5097      int units;
5098      rtx elt;
5099      mode = GET_MODE_INNER (mode);
5100      units = CONST_VECTOR_NUNITS (x);
5101      for (i = 0; i < units; i++)
5102	{
5103	  elt = CONST_VECTOR_ELT (x, i);
5104	  if (GET_CODE (elt) == CONST_INT || GET_CODE (elt) == CONST_DOUBLE)
5105	    {
5106	      if (GET_CODE (elt) == CONST_DOUBLE)
5107		val = const_double_to_hwint (elt);
5108	      else
5109		val = INTVAL (elt);
5110	      first = GET_MODE_SIZE (mode) - 1;
5111	      if (first + i * GET_MODE_SIZE (mode) > 16)
5112		abort ();
5113	      for (j = first; j >= 0; j--)
5114		{
5115		  arr[j + i * GET_MODE_SIZE (mode)] = val & 0xff;
5116		  val >>= 8;
5117		}
5118	    }
5119	}
5120    }
5121  else
5122    gcc_unreachable();
5123}
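/* A worked example (illustration only): for (const_int 0x1234) in HImode,
   the two value bytes are written at arr[0..1] and then repeated, giving

       arr = { 0x12, 0x34, 0x12, 0x34, ..., 0x12, 0x34 }

   so the 16-byte array always describes the value splatted across a full
   quadword register.  */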
5124
5125/* Convert a 16 byte array to a constant of mode MODE.  When MODE is
5126   smaller than 16 bytes, use the bytes that would represent that value
5127   in a register, e.g., for QImode return the value of arr[3].  */
5128rtx
5129array_to_constant (machine_mode mode, const unsigned char arr[16])
5130{
5131  machine_mode inner_mode;
5132  rtvec v;
5133  int units, size, i, j, k;
5134  HOST_WIDE_INT val;
5135
5136  if (GET_MODE_CLASS (mode) == MODE_INT
5137      && GET_MODE_BITSIZE (mode) <= HOST_BITS_PER_WIDE_INT)
5138    {
5139      j = GET_MODE_SIZE (mode);
5140      i = j < 4 ? 4 - j : 0;
5141      for (val = 0; i < j; i++)
5142	val = (val << 8) | arr[i];
5143      val = trunc_int_for_mode (val, mode);
5144      return GEN_INT (val);
5145    }
5146
5147  if (mode == TImode)
5148    {
5149      HOST_WIDE_INT high;
5150      for (i = high = 0; i < 8; i++)
5151	high = (high << 8) | arr[i];
5152      for (i = 8, val = 0; i < 16; i++)
5153	val = (val << 8) | arr[i];
5154      return immed_double_const (val, high, TImode);
5155    }
5156  if (mode == SFmode)
5157    {
5158      val = (arr[0] << 24) | (arr[1] << 16) | (arr[2] << 8) | arr[3];
5159      val = trunc_int_for_mode (val, SImode);
5160      return hwint_to_const_double (SFmode, val);
5161    }
5162  if (mode == DFmode)
5163    {
5164      for (i = 0, val = 0; i < 8; i++)
5165	val = (val << 8) | arr[i];
5166      return hwint_to_const_double (DFmode, val);
5167    }
5168
5169  if (!VECTOR_MODE_P (mode))
5170    abort ();
5171
5172  units = GET_MODE_NUNITS (mode);
5173  size = GET_MODE_UNIT_SIZE (mode);
5174  inner_mode = GET_MODE_INNER (mode);
5175  v = rtvec_alloc (units);
5176
5177  for (k = i = 0; i < units; ++i)
5178    {
5179      val = 0;
5180      for (j = 0; j < size; j++, k++)
5181	val = (val << 8) | arr[k];
5182
5183      if (GET_MODE_CLASS (inner_mode) == MODE_FLOAT)
5184	RTVEC_ELT (v, i) = hwint_to_const_double (inner_mode, val);
5185      else
5186	RTVEC_ELT (v, i) = GEN_INT (trunc_int_for_mode (val, inner_mode));
5187    }
5188  if (k > 16)
5189    abort ();
5190
5191  return gen_rtx_CONST_VECTOR (mode, v);
5192}
5193
5194static void
5195reloc_diagnostic (rtx x)
5196{
5197  tree decl = 0;
5198  if (!flag_pic || !(TARGET_WARN_RELOC || TARGET_ERROR_RELOC))
5199    return;
5200
5201  if (GET_CODE (x) == SYMBOL_REF)
5202    decl = SYMBOL_REF_DECL (x);
5203  else if (GET_CODE (x) == CONST
5204	   && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
5205    decl = SYMBOL_REF_DECL (XEXP (XEXP (x, 0), 0));
5206
5207  /* SYMBOL_REF_DECL is not necessarily a DECL. */
5208  if (decl && !DECL_P (decl))
5209    decl = 0;
5210
5211  /* The decl could be a string constant.  */
5212  if (decl && DECL_P (decl))
5213    {
5214      location_t loc;
5215      /* We use last_assemble_variable_decl to get line information.  It's
5216	 not always going to be right and might not even be close, but will
5217	 be right for the more common cases. */
5218      if (!last_assemble_variable_decl || in_section == ctors_section)
5219	loc = DECL_SOURCE_LOCATION (decl);
5220      else
5221	loc = DECL_SOURCE_LOCATION (last_assemble_variable_decl);
5222
5223      if (TARGET_WARN_RELOC)
5224	warning_at (loc, 0,
5225		    "creating run-time relocation for %qD", decl);
5226      else
5227	error_at (loc,
5228		  "creating run-time relocation for %qD", decl);
5229    }
5230  else
5231    {
5232      if (TARGET_WARN_RELOC)
5233	warning_at (input_location, 0, "creating run-time relocation");
5234      else
5235	error_at (input_location, "creating run-time relocation");
5236    }
5237}
5238
5239/* Hook into assemble_integer so we can generate an error for run-time
5240   relocations.  The SPU ABI disallows them. */
5241static bool
5242spu_assemble_integer (rtx x, unsigned int size, int aligned_p)
5243{
5244  /* By default run-time relocations aren't supported, but we allow them
5245     in case users support it in their own run-time loader.  And we provide
5246     a warning for those users that don't.  */
5247  if ((GET_CODE (x) == SYMBOL_REF)
5248      || GET_CODE (x) == LABEL_REF || GET_CODE (x) == CONST)
5249    reloc_diagnostic (x);
5250
5251  return default_assemble_integer (x, size, aligned_p);
5252}
5253
5254static void
5255spu_asm_globalize_label (FILE * file, const char *name)
5256{
5257  fputs ("\t.global\t", file);
5258  assemble_name (file, name);
5259  fputs ("\n", file);
5260}
5261
5262static bool
5263spu_rtx_costs (rtx x, int code, int outer_code ATTRIBUTE_UNUSED,
5264	       int opno ATTRIBUTE_UNUSED, int *total,
5265	       bool speed ATTRIBUTE_UNUSED)
5266{
5267  machine_mode mode = GET_MODE (x);
5268  int cost = COSTS_N_INSNS (2);
5269
5270  /* Folding to a CONST_VECTOR will use extra space but there might
5271     be only a small savings in cycles.  We'd like to use a CONST_VECTOR
5272     only if it allows us to fold away multiple insns.  Changing the cost
5273     of a CONST_VECTOR here (or in CONST_COSTS) doesn't help though
5274     because this cost will only be compared against a single insn.
5275     if (code == CONST_VECTOR)
5276       return spu_legitimate_constant_p (mode, x) ? cost : COSTS_N_INSNS (6);
5277   */
5278
5279  /* Use defaults for float operations.  Not accurate but good enough. */
5280  if (mode == DFmode)
5281    {
5282      *total = COSTS_N_INSNS (13);
5283      return true;
5284    }
5285  if (mode == SFmode)
5286    {
5287      *total = COSTS_N_INSNS (6);
5288      return true;
5289    }
5290  switch (code)
5291    {
5292    case CONST_INT:
5293      if (satisfies_constraint_K (x))
5294	*total = 0;
5295      else if (INTVAL (x) >= -0x80000000ll && INTVAL (x) <= 0xffffffffll)
5296	*total = COSTS_N_INSNS (1);
5297      else
5298	*total = COSTS_N_INSNS (3);
5299      return true;
5300
5301    case CONST:
5302      *total = COSTS_N_INSNS (3);
5303      return true;
5304
5305    case LABEL_REF:
5306    case SYMBOL_REF:
5307      *total = COSTS_N_INSNS (0);
5308      return true;
5309
5310    case CONST_DOUBLE:
5311      *total = COSTS_N_INSNS (5);
5312      return true;
5313
5314    case FLOAT_EXTEND:
5315    case FLOAT_TRUNCATE:
5316    case FLOAT:
5317    case UNSIGNED_FLOAT:
5318    case FIX:
5319    case UNSIGNED_FIX:
5320      *total = COSTS_N_INSNS (7);
5321      return true;
5322
5323    case PLUS:
5324      if (mode == TImode)
5325	{
5326	  *total = COSTS_N_INSNS (9);
5327	  return true;
5328	}
5329      break;
5330
5331    case MULT:
5332      cost = (GET_CODE (XEXP (x, 0)) == REG
5333	      ? COSTS_N_INSNS (12)
5334	      : COSTS_N_INSNS (7));
5335      if (mode == SImode && GET_CODE (XEXP (x, 0)) == REG)
5336	{
5337	  if (GET_CODE (XEXP (x, 1)) == CONST_INT)
5338	    {
5339	      HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
5340	      cost = COSTS_N_INSNS (14);
5341	      if ((val & 0xffff) == 0)
5342		cost = COSTS_N_INSNS (9);
5343	      else if (val > 0 && val < 0x10000)
5344		cost = COSTS_N_INSNS (11);
5345	    }
5346	}
5347      *total = cost;
5348      return true;
5349    case DIV:
5350    case UDIV:
5351    case MOD:
5352    case UMOD:
5353      *total = COSTS_N_INSNS (20);
5354      return true;
5355    case ROTATE:
5356    case ROTATERT:
5357    case ASHIFT:
5358    case ASHIFTRT:
5359    case LSHIFTRT:
5360      *total = COSTS_N_INSNS (4);
5361      return true;
5362    case UNSPEC:
5363      if (XINT (x, 1) == UNSPEC_CONVERT)
5364	*total = COSTS_N_INSNS (0);
5365      else
5366	*total = COSTS_N_INSNS (4);
5367      return true;
5368    }
5369  /* Scale cost by mode size.  Except when initializing (cfun->decl == 0). */
5370  if (GET_MODE_CLASS (mode) == MODE_INT
5371      && GET_MODE_SIZE (mode) > GET_MODE_SIZE (SImode) && cfun && cfun->decl)
5372    cost = cost * (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode))
5373      * (GET_MODE_SIZE (mode) / GET_MODE_SIZE (SImode));
5374  *total = cost;
5375  return true;
5376}
5377
5378static machine_mode
5379spu_unwind_word_mode (void)
5380{
5381  return SImode;
5382}
5383
5384/* Decide whether we can make a sibling call to a function.  DECL is the
5385   declaration of the function being targeted by the call and EXP is the
5386   CALL_EXPR representing the call.  */
5387static bool
5388spu_function_ok_for_sibcall (tree decl, tree exp ATTRIBUTE_UNUSED)
5389{
5390  return decl && !TARGET_LARGE_MEM;
5391}
5392
5393/* We need to correctly update the back chain pointer and the Available
5394   Stack Size (which is in the second slot of the sp register).  */
5395void
5396spu_allocate_stack (rtx op0, rtx op1)
5397{
5398  HOST_WIDE_INT v;
5399  rtx chain = gen_reg_rtx (V4SImode);
5400  rtx stack_bot = gen_frame_mem (V4SImode, stack_pointer_rtx);
5401  rtx sp = gen_reg_rtx (V4SImode);
5402  rtx splatted = gen_reg_rtx (V4SImode);
5403  rtx pat = gen_reg_rtx (TImode);
5404
5405  /* Copy the back chain so we can save it back again.  */
5406  emit_move_insn (chain, stack_bot);
5407
5408  op1 = force_reg (SImode, op1);
5409
5410  v = 0x1020300010203ll;
5411  emit_move_insn (pat, immed_double_const (v, v, TImode));
5412  emit_insn (gen_shufb (splatted, op1, op1, pat));
5413
5414  emit_insn (gen_spu_convert (sp, stack_pointer_rtx));
5415  emit_insn (gen_subv4si3 (sp, sp, splatted));
5416
5417  if (flag_stack_check)
5418    {
5419      rtx avail = gen_reg_rtx(SImode);
5420      rtx result = gen_reg_rtx(SImode);
5421      emit_insn (gen_vec_extractv4si (avail, sp, GEN_INT (1)));
5422      emit_insn (gen_cgt_si(result, avail, GEN_INT (-1)));
5423      emit_insn (gen_spu_heq (result, GEN_INT(0) ));
5424    }
5425
5426  emit_insn (gen_spu_convert (stack_pointer_rtx, sp));
5427
5428  emit_move_insn (stack_bot, chain);
5429
5430  emit_move_insn (op0, virtual_stack_dynamic_rtx);
5431}
5432
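/* Restore the stack pointer (and its Available Stack Size slot) from the
   save area at OP1, whose first word holds the saved back chain and whose
   second word holds the saved stack pointer, then write the adjusted back
   chain quadword to the new stack bottom.  */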
5433void
5434spu_restore_stack_nonlocal (rtx op0 ATTRIBUTE_UNUSED, rtx op1)
5435{
5436  static unsigned char arr[16] =
5437    { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
5438  rtx temp = gen_reg_rtx (SImode);
5439  rtx temp2 = gen_reg_rtx (SImode);
5440  rtx temp3 = gen_reg_rtx (V4SImode);
5441  rtx temp4 = gen_reg_rtx (V4SImode);
5442  rtx pat = gen_reg_rtx (TImode);
5443  rtx sp = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
5444
5445  /* Restore the backchain from the first word, sp from the second.  */
5446  emit_move_insn (temp2, adjust_address_nv (op1, SImode, 0));
5447  emit_move_insn (temp, adjust_address_nv (op1, SImode, 4));
5448
5449  emit_move_insn (pat, array_to_constant (TImode, arr));
5450
5451  /* Compute the Available Stack Size for sp.  */
5452  emit_insn (gen_subsi3 (temp, temp, stack_pointer_rtx));
5453  emit_insn (gen_shufb (temp3, temp, temp, pat));
5454
5455  /* Compute the Available Stack Size for the back chain.  */
5456  emit_insn (gen_subsi3 (temp2, temp2, stack_pointer_rtx));
5457  emit_insn (gen_shufb (temp4, temp2, temp2, pat));
5458  emit_insn (gen_addv4si3 (temp4, sp, temp4));
5459
5460  emit_insn (gen_addv4si3 (sp, sp, temp3));
5461  emit_move_insn (gen_frame_mem (V4SImode, stack_pointer_rtx), temp4);
5462}
5463
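/* Register the library routine names used for DImode and TImode arithmetic,
   the overflow-checking SImode/DImode operations, and the unsigned
   integer-to-double conversions.  */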
5464static void
5465spu_init_libfuncs (void)
5466{
5467  set_optab_libfunc (smul_optab, DImode, "__muldi3");
5468  set_optab_libfunc (sdiv_optab, DImode, "__divdi3");
5469  set_optab_libfunc (smod_optab, DImode, "__moddi3");
5470  set_optab_libfunc (udiv_optab, DImode, "__udivdi3");
5471  set_optab_libfunc (umod_optab, DImode, "__umoddi3");
5472  set_optab_libfunc (udivmod_optab, DImode, "__udivmoddi4");
5473  set_optab_libfunc (ffs_optab, DImode, "__ffsdi2");
5474  set_optab_libfunc (clz_optab, DImode, "__clzdi2");
5475  set_optab_libfunc (ctz_optab, DImode, "__ctzdi2");
5476  set_optab_libfunc (clrsb_optab, DImode, "__clrsbdi2");
5477  set_optab_libfunc (popcount_optab, DImode, "__popcountdi2");
5478  set_optab_libfunc (parity_optab, DImode, "__paritydi2");
5479
5480  set_conv_libfunc (ufloat_optab, DFmode, SImode, "__float_unssidf");
5481  set_conv_libfunc (ufloat_optab, DFmode, DImode, "__float_unsdidf");
5482
5483  set_optab_libfunc (addv_optab, SImode, "__addvsi3");
5484  set_optab_libfunc (subv_optab, SImode, "__subvsi3");
5485  set_optab_libfunc (smulv_optab, SImode, "__mulvsi3");
5486  set_optab_libfunc (sdivv_optab, SImode, "__divvsi3");
5487  set_optab_libfunc (negv_optab, SImode, "__negvsi2");
5488  set_optab_libfunc (absv_optab, SImode, "__absvsi2");
5489  set_optab_libfunc (addv_optab, DImode, "__addvdi3");
5490  set_optab_libfunc (subv_optab, DImode, "__subvdi3");
5491  set_optab_libfunc (smulv_optab, DImode, "__mulvdi3");
5492  set_optab_libfunc (sdivv_optab, DImode, "__divvdi3");
5493  set_optab_libfunc (negv_optab, DImode, "__negvdi2");
5494  set_optab_libfunc (absv_optab, DImode, "__absvdi2");
5495
5496  set_optab_libfunc (smul_optab, TImode, "__multi3");
5497  set_optab_libfunc (sdiv_optab, TImode, "__divti3");
5498  set_optab_libfunc (smod_optab, TImode, "__modti3");
5499  set_optab_libfunc (udiv_optab, TImode, "__udivti3");
5500  set_optab_libfunc (umod_optab, TImode, "__umodti3");
5501  set_optab_libfunc (udivmod_optab, TImode, "__udivmodti4");
5502}
5503
5504/* Make a subreg, stripping any existing subreg.  We could possibly just
5505   call simplify_subreg, but in this case we know what we want. */
5506rtx
5507spu_gen_subreg (machine_mode mode, rtx x)
5508{
5509  if (GET_CODE (x) == SUBREG)
5510    x = SUBREG_REG (x);
5511  if (GET_MODE (x) == mode)
5512    return x;
5513  return gen_rtx_SUBREG (mode, x, 0);
5514}
5515
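/* Return true if a value of TYPE must be returned in memory: BLKmode values
   whose size is not a compile-time constant or does not fit in
   MAX_REGISTER_RETURN words.  */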
5516static bool
5517spu_return_in_memory (const_tree type, const_tree fntype ATTRIBUTE_UNUSED)
5518{
5519  return (TYPE_MODE (type) == BLKmode
5520	  && ((type) == 0
5521	      || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST
5522	      || int_size_in_bytes (type) >
5523	      (MAX_REGISTER_RETURN * UNITS_PER_WORD)));
5524}
5525
5526/* Create the built-in types and functions.  */
5527
5528enum spu_function_code
5529{
5530#define DEF_BUILTIN(fcode, icode, name, type, params) fcode,
5531#include "spu-builtins.def"
5532#undef DEF_BUILTIN
5533   NUM_SPU_BUILTINS
5534};
5535
5536extern GTY(()) struct spu_builtin_description spu_builtins[NUM_SPU_BUILTINS];
5537
5538struct spu_builtin_description spu_builtins[] = {
5539#define DEF_BUILTIN(fcode, icode, name, type, params) \
5540  {fcode, icode, name, type, params},
5541#include "spu-builtins.def"
5542#undef DEF_BUILTIN
5543};
5544
5545static GTY(()) tree spu_builtin_decls[NUM_SPU_BUILTINS];
5546
5547/* Returns the spu builtin decl for CODE.  */
5548
5549static tree
5550spu_builtin_decl (unsigned code, bool initialize_p ATTRIBUTE_UNUSED)
5551{
5552  if (code >= NUM_SPU_BUILTINS)
5553    return error_mark_node;
5554
5555  return spu_builtin_decls[code];
5556}
5557
5558
5559static void
5560spu_init_builtins (void)
5561{
5562  struct spu_builtin_description *d;
5563  unsigned int i;
5564
5565  V16QI_type_node = build_vector_type (intQI_type_node, 16);
5566  V8HI_type_node = build_vector_type (intHI_type_node, 8);
5567  V4SI_type_node = build_vector_type (intSI_type_node, 4);
5568  V2DI_type_node = build_vector_type (intDI_type_node, 2);
5569  V4SF_type_node = build_vector_type (float_type_node, 4);
5570  V2DF_type_node = build_vector_type (double_type_node, 2);
5571
5572  unsigned_V16QI_type_node = build_vector_type (unsigned_intQI_type_node, 16);
5573  unsigned_V8HI_type_node = build_vector_type (unsigned_intHI_type_node, 8);
5574  unsigned_V4SI_type_node = build_vector_type (unsigned_intSI_type_node, 4);
5575  unsigned_V2DI_type_node = build_vector_type (unsigned_intDI_type_node, 2);
5576
5577  spu_builtin_types[SPU_BTI_QUADWORD] = V16QI_type_node;
5578
5579  spu_builtin_types[SPU_BTI_7] = global_trees[TI_INTSI_TYPE];
5580  spu_builtin_types[SPU_BTI_S7] = global_trees[TI_INTSI_TYPE];
5581  spu_builtin_types[SPU_BTI_U7] = global_trees[TI_INTSI_TYPE];
5582  spu_builtin_types[SPU_BTI_S10] = global_trees[TI_INTSI_TYPE];
5583  spu_builtin_types[SPU_BTI_S10_4] = global_trees[TI_INTSI_TYPE];
5584  spu_builtin_types[SPU_BTI_U14] = global_trees[TI_INTSI_TYPE];
5585  spu_builtin_types[SPU_BTI_16] = global_trees[TI_INTSI_TYPE];
5586  spu_builtin_types[SPU_BTI_S16] = global_trees[TI_INTSI_TYPE];
5587  spu_builtin_types[SPU_BTI_S16_2] = global_trees[TI_INTSI_TYPE];
5588  spu_builtin_types[SPU_BTI_U16] = global_trees[TI_INTSI_TYPE];
5589  spu_builtin_types[SPU_BTI_U16_2] = global_trees[TI_INTSI_TYPE];
5590  spu_builtin_types[SPU_BTI_U18] = global_trees[TI_INTSI_TYPE];
5591
5592  spu_builtin_types[SPU_BTI_INTQI] = global_trees[TI_INTQI_TYPE];
5593  spu_builtin_types[SPU_BTI_INTHI] = global_trees[TI_INTHI_TYPE];
5594  spu_builtin_types[SPU_BTI_INTSI] = global_trees[TI_INTSI_TYPE];
5595  spu_builtin_types[SPU_BTI_INTDI] = global_trees[TI_INTDI_TYPE];
5596  spu_builtin_types[SPU_BTI_UINTQI] = global_trees[TI_UINTQI_TYPE];
5597  spu_builtin_types[SPU_BTI_UINTHI] = global_trees[TI_UINTHI_TYPE];
5598  spu_builtin_types[SPU_BTI_UINTSI] = global_trees[TI_UINTSI_TYPE];
5599  spu_builtin_types[SPU_BTI_UINTDI] = global_trees[TI_UINTDI_TYPE];
5600
5601  spu_builtin_types[SPU_BTI_FLOAT] = global_trees[TI_FLOAT_TYPE];
5602  spu_builtin_types[SPU_BTI_DOUBLE] = global_trees[TI_DOUBLE_TYPE];
5603
5604  spu_builtin_types[SPU_BTI_VOID] = global_trees[TI_VOID_TYPE];
5605
5606  spu_builtin_types[SPU_BTI_PTR] =
5607    build_pointer_type (build_qualified_type
5608			(void_type_node,
5609			 TYPE_QUAL_CONST | TYPE_QUAL_VOLATILE));
5610
5611  /* For each builtin we build a new prototype.  The tree code will make
5612     sure nodes are shared. */
5613  for (i = 0, d = spu_builtins; i < NUM_SPU_BUILTINS; i++, d++)
5614    {
5615      tree p;
5616      char name[64];		/* add_builtin_function will make a copy.  */
5617      int parm;
5618
5619      if (d->name == 0)
5620	continue;
5621
5622      /* Find last parm.  */
5623      for (parm = 1; d->parm[parm] != SPU_BTI_END_OF_PARAMS; parm++)
5624	;
5625
5626      p = void_list_node;
5627      while (parm > 1)
5628	p = tree_cons (NULL_TREE, spu_builtin_types[d->parm[--parm]], p);
5629
5630      p = build_function_type (spu_builtin_types[d->parm[0]], p);
5631
5632      sprintf (name, "__builtin_%s", d->name);
5633      spu_builtin_decls[i] =
5634	add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE);
5635      if (d->fcode == SPU_MASK_FOR_LOAD)
5636	TREE_READONLY (spu_builtin_decls[i]) = 1;
5637
5638      /* These builtins don't throw.  */
5639      TREE_NOTHROW (spu_builtin_decls[i]) = 1;
5640    }
5641}
5642
5643void
5644spu_restore_stack_block (rtx op0 ATTRIBUTE_UNUSED, rtx op1)
5645{
5646  static unsigned char arr[16] =
5647    { 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 };
5648
5649  rtx temp = gen_reg_rtx (Pmode);
5650  rtx temp2 = gen_reg_rtx (V4SImode);
5651  rtx temp3 = gen_reg_rtx (V4SImode);
5652  rtx pat = gen_reg_rtx (TImode);
5653  rtx sp = gen_rtx_REG (V4SImode, STACK_POINTER_REGNUM);
5654
5655  emit_move_insn (pat, array_to_constant (TImode, arr));
5656
5657  /* Restore the sp.  */
5658  emit_move_insn (temp, op1);
5659  emit_move_insn (temp2, gen_frame_mem (V4SImode, stack_pointer_rtx));
5660
5661  /* Compute available stack size for sp.  */
5662  emit_insn (gen_subsi3 (temp, temp, stack_pointer_rtx));
5663  emit_insn (gen_shufb (temp3, temp, temp, pat));
5664
5665  emit_insn (gen_addv4si3 (sp, sp, temp3));
5666  emit_move_insn (gen_frame_mem (V4SImode, stack_pointer_rtx), temp2);
5667}
5668
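/* Return nonzero when TARGET_SAFE_DMA is in effect and CHANNEL is one of
   the DMA channels 21 through 27.  */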
5669int
5670spu_safe_dma (HOST_WIDE_INT channel)
5671{
5672  return TARGET_SAFE_DMA && channel >= 21 && channel <= 27;
5673}
5674
5675void
5676spu_builtin_splats (rtx ops[])
5677{
5678  machine_mode mode = GET_MODE (ops[0]);
5679  if (GET_CODE (ops[1]) == CONST_INT || GET_CODE (ops[1]) == CONST_DOUBLE)
5680    {
5681      unsigned char arr[16];
5682      constant_to_array (GET_MODE_INNER (mode), ops[1], arr);
5683      emit_move_insn (ops[0], array_to_constant (mode, arr));
5684    }
5685  else
5686    {
5687      rtx reg = gen_reg_rtx (TImode);
5688      rtx shuf;
5689      if (GET_CODE (ops[1]) != REG
5690	  && GET_CODE (ops[1]) != SUBREG)
5691	ops[1] = force_reg (GET_MODE_INNER (mode), ops[1]);
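      /* Each constant below is a shufb control word that replicates the
	 scalar in OP1's preferred slot across every element of MODE.  */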
5692      switch (mode)
5693	{
5694	case V2DImode:
5695	case V2DFmode:
5696	  shuf =
5697	    immed_double_const (0x0001020304050607ll, 0x1011121314151617ll,
5698				TImode);
5699	  break;
5700	case V4SImode:
5701	case V4SFmode:
5702	  shuf =
5703	    immed_double_const (0x0001020300010203ll, 0x0001020300010203ll,
5704				TImode);
5705	  break;
5706	case V8HImode:
5707	  shuf =
5708	    immed_double_const (0x0203020302030203ll, 0x0203020302030203ll,
5709				TImode);
5710	  break;
5711	case V16QImode:
5712	  shuf =
5713	    immed_double_const (0x0303030303030303ll, 0x0303030303030303ll,
5714				TImode);
5715	  break;
5716	default:
5717	  abort ();
5718	}
5719      emit_move_insn (reg, shuf);
5720      emit_insn (gen_shufb (ops[0], ops[1], ops[1], reg));
5721    }
5722}
5723
5724void
5725spu_builtin_extract (rtx ops[])
5726{
5727  machine_mode mode;
5728  rtx rot, from, tmp;
5729
5730  mode = GET_MODE (ops[1]);
5731
5732  if (GET_CODE (ops[2]) == CONST_INT)
5733    {
5734      switch (mode)
5735	{
5736	case V16QImode:
5737	  emit_insn (gen_vec_extractv16qi (ops[0], ops[1], ops[2]));
5738	  break;
5739	case V8HImode:
5740	  emit_insn (gen_vec_extractv8hi (ops[0], ops[1], ops[2]));
5741	  break;
5742	case V4SFmode:
5743	  emit_insn (gen_vec_extractv4sf (ops[0], ops[1], ops[2]));
5744	  break;
5745	case V4SImode:
5746	  emit_insn (gen_vec_extractv4si (ops[0], ops[1], ops[2]));
5747	  break;
5748	case V2DImode:
5749	  emit_insn (gen_vec_extractv2di (ops[0], ops[1], ops[2]));
5750	  break;
5751	case V2DFmode:
5752	  emit_insn (gen_vec_extractv2df (ops[0], ops[1], ops[2]));
5753	  break;
5754	default:
5755	  abort ();
5756	}
5757      return;
5758    }
5759
5760  from = spu_gen_subreg (TImode, ops[1]);
5761  rot = gen_reg_rtx (TImode);
5762  tmp = gen_reg_rtx (SImode);
5763
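  /* For a variable index, compute in TMP the byte rotation that brings
     element OPS[2] into the preferred scalar slot, then rotate the whole
     quadword and convert the result down to the element mode.  */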
5764  switch (mode)
5765    {
5766    case V16QImode:
5767      emit_insn (gen_addsi3 (tmp, ops[2], GEN_INT (-3)));
5768      break;
5769    case V8HImode:
5770      emit_insn (gen_addsi3 (tmp, ops[2], ops[2]));
5771      emit_insn (gen_addsi3 (tmp, tmp, GEN_INT (-2)));
5772      break;
5773    case V4SFmode:
5774    case V4SImode:
5775      emit_insn (gen_ashlsi3 (tmp, ops[2], GEN_INT (2)));
5776      break;
5777    case V2DImode:
5778    case V2DFmode:
5779      emit_insn (gen_ashlsi3 (tmp, ops[2], GEN_INT (3)));
5780      break;
5781    default:
5782      abort ();
5783    }
5784  emit_insn (gen_rotqby_ti (rot, from, tmp));
5785
5786  emit_insn (gen_spu_convert (ops[0], rot));
5787}
5788
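/* Insert the scalar OPS[1] into vector OPS[2] at element index OPS[3] and
   store the result in OPS[0].  A cpat-generated mask tells shufb which
   bytes to take from the new element and which to keep from the old
   vector.  */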
5789void
5790spu_builtin_insert (rtx ops[])
5791{
5792  machine_mode mode = GET_MODE (ops[0]);
5793  machine_mode imode = GET_MODE_INNER (mode);
5794  rtx mask = gen_reg_rtx (TImode);
5795  rtx offset;
5796
5797  if (GET_CODE (ops[3]) == CONST_INT)
5798    offset = GEN_INT (INTVAL (ops[3]) * GET_MODE_SIZE (imode));
5799  else
5800    {
5801      offset = gen_reg_rtx (SImode);
5802      emit_insn (gen_mulsi3
5803		 (offset, ops[3], GEN_INT (GET_MODE_SIZE (imode))));
5804    }
5805  emit_insn (gen_cpat
5806	     (mask, stack_pointer_rtx, offset,
5807	      GEN_INT (GET_MODE_SIZE (imode))));
5808  emit_insn (gen_shufb (ops[0], ops[1], ops[2], mask));
5809}
5810
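/* Place the scalar OPS[1] at element index OPS[2] of vector OPS[0] by
   rotating the quadword that holds the scalar in its preferred slot; the
   remaining elements are whatever the rotation leaves behind.  */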
5811void
5812spu_builtin_promote (rtx ops[])
5813{
5814  machine_mode mode, imode;
5815  rtx rot, from, offset;
5816  HOST_WIDE_INT pos;
5817
5818  mode = GET_MODE (ops[0]);
5819  imode = GET_MODE_INNER (mode);
5820
5821  from = gen_reg_rtx (TImode);
5822  rot = spu_gen_subreg (TImode, ops[0]);
5823
5824  emit_insn (gen_spu_convert (from, ops[1]));
5825
5826  if (GET_CODE (ops[2]) == CONST_INT)
5827    {
5828      pos = -GET_MODE_SIZE (imode) * INTVAL (ops[2]);
5829      if (GET_MODE_SIZE (imode) < 4)
5830	pos += 4 - GET_MODE_SIZE (imode);
5831      offset = GEN_INT (pos & 15);
5832    }
5833  else
5834    {
5835      offset = gen_reg_rtx (SImode);
5836      switch (mode)
5837	{
5838	case V16QImode:
5839	  emit_insn (gen_subsi3 (offset, GEN_INT (3), ops[2]));
5840	  break;
5841	case V8HImode:
5842	  emit_insn (gen_subsi3 (offset, GEN_INT (1), ops[2]));
5843	  emit_insn (gen_addsi3 (offset, offset, offset));
5844	  break;
5845	case V4SFmode:
5846	case V4SImode:
5847	  emit_insn (gen_subsi3 (offset, GEN_INT (0), ops[2]));
5848	  emit_insn (gen_ashlsi3 (offset, offset, GEN_INT (2)));
5849	  break;
5850	case V2DImode:
5851	case V2DFmode:
5852	  emit_insn (gen_ashlsi3 (offset, ops[2], GEN_INT (3)));
5853	  break;
5854	default:
5855	  abort ();
5856	}
5857    }
5858  emit_insn (gen_rotqby_ti (rot, from, offset));
5859}
5860
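/* Emit the trampoline code into M_TRAMP: a short instruction sequence that
   loads CXT into the static chain register and branches to FNADDR, the
   address of FNDECL.  */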
5861static void
5862spu_trampoline_init (rtx m_tramp, tree fndecl, rtx cxt)
5863{
5864  rtx fnaddr = XEXP (DECL_RTL (fndecl), 0);
5865  rtx shuf = gen_reg_rtx (V4SImode);
5866  rtx insn = gen_reg_rtx (V4SImode);
5867  rtx shufc;
5868  rtx insnc;
5869  rtx mem;
5870
5871  fnaddr = force_reg (SImode, fnaddr);
5872  cxt = force_reg (SImode, cxt);
5873
5874  if (TARGET_LARGE_MEM)
5875    {
5876      rtx rotl = gen_reg_rtx (V4SImode);
5877      rtx mask = gen_reg_rtx (V4SImode);
5878      rtx bi = gen_reg_rtx (SImode);
5879      static unsigned char const shufa[16] = {
5880	2, 3, 0, 1, 18, 19, 16, 17,
5881	0, 1, 2, 3, 16, 17, 18, 19
5882      };
5883      static unsigned char const insna[16] = {
5884	0x41, 0, 0, 79,
5885	0x41, 0, 0, STATIC_CHAIN_REGNUM,
5886	0x60, 0x80, 0, 79,
5887	0x60, 0x80, 0, STATIC_CHAIN_REGNUM
5888      };
5889
5890      shufc = force_reg (TImode, array_to_constant (TImode, shufa));
5891      insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna));
5892
5893      emit_insn (gen_shufb (shuf, fnaddr, cxt, shufc));
5894      emit_insn (gen_vrotlv4si3 (rotl, shuf, spu_const (V4SImode, 7)));
5895      emit_insn (gen_movv4si (mask, spu_const (V4SImode, 0xffff << 7)));
5896      emit_insn (gen_selb (insn, insnc, rotl, mask));
5897
5898      mem = adjust_address (m_tramp, V4SImode, 0);
5899      emit_move_insn (mem, insn);
5900
5901      emit_move_insn (bi, GEN_INT (0x35000000 + (79 << 7)));
5902      mem = adjust_address (m_tramp, Pmode, 16);
5903      emit_move_insn (mem, bi);
5904    }
5905  else
5906    {
5907      rtx scxt = gen_reg_rtx (SImode);
5908      rtx sfnaddr = gen_reg_rtx (SImode);
5909      static unsigned char const insna[16] = {
5910	0x42, 0, 0, STATIC_CHAIN_REGNUM,
5911	0x30, 0, 0, 0,
5912	0, 0, 0, 0,
5913	0, 0, 0, 0
5914      };
5915
5916      shufc = gen_reg_rtx (TImode);
5917      insnc = force_reg (V4SImode, array_to_constant (V4SImode, insna));
5918
5919      /* By or'ing all of cxt with the ila opcode we are assuming cxt
5920	 fits 18 bits and the last 4 are zeros.  This will be true if
5921	 the stack pointer is initialized to 0x3fff0 at program start,
5922	 otherwise the ila instruction will be garbage. */
5923
5924      emit_insn (gen_ashlsi3 (scxt, cxt, GEN_INT (7)));
5925      emit_insn (gen_ashlsi3 (sfnaddr, fnaddr, GEN_INT (5)));
5926      emit_insn (gen_cpat
5927		 (shufc, stack_pointer_rtx, GEN_INT (4), GEN_INT (4)));
5928      emit_insn (gen_shufb (shuf, sfnaddr, scxt, shufc));
5929      emit_insn (gen_iorv4si3 (insn, insnc, shuf));
5930
5931      mem = adjust_address (m_tramp, V4SImode, 0);
5932      emit_move_insn (mem, insn);
5933    }
5934  emit_insn (gen_sync ());
5935}
5936
5937static bool
5938spu_warn_func_return (tree decl)
5939{
5940  /* Naked functions are implemented entirely in assembly, including the
5941     return sequence, so suppress warnings about this.  */
5942  return !spu_naked_function_p (decl);
5943}
5944
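/* Sign-extend OPS[1] into the wider integer mode of OPS[0].  A shufb
   pattern keeps the original value in the low bytes of the result and
   fills the upper bytes from a register holding the sign.  */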
5945void
5946spu_expand_sign_extend (rtx ops[])
5947{
5948  unsigned char arr[16];
5949  rtx pat = gen_reg_rtx (TImode);
5950  rtx sign, c;
5951  int i, last;
5952  last = GET_MODE (ops[0]) == DImode ? 7 : 15;
5953  if (GET_MODE (ops[1]) == QImode)
5954    {
5955      sign = gen_reg_rtx (HImode);
5956      emit_insn (gen_extendqihi2 (sign, ops[1]));
5957      for (i = 0; i < 16; i++)
5958	arr[i] = 0x12;
5959      arr[last] = 0x13;
5960    }
5961  else
5962    {
5963      for (i = 0; i < 16; i++)
5964	arr[i] = 0x10;
5965      switch (GET_MODE (ops[1]))
5966	{
5967	case HImode:
5968	  sign = gen_reg_rtx (SImode);
5969	  emit_insn (gen_extendhisi2 (sign, ops[1]));
5970	  arr[last] = 0x03;
5971	  arr[last - 1] = 0x02;
5972	  break;
5973	case SImode:
5974	  sign = gen_reg_rtx (SImode);
5975	  emit_insn (gen_ashrsi3 (sign, ops[1], GEN_INT (31)));
5976	  for (i = 0; i < 4; i++)
5977	    arr[last - i] = 3 - i;
5978	  break;
5979	case DImode:
5980	  sign = gen_reg_rtx (SImode);
5981	  c = gen_reg_rtx (SImode);
5982	  emit_insn (gen_spu_convert (c, ops[1]));
5983	  emit_insn (gen_ashrsi3 (sign, c, GEN_INT (31)));
5984	  for (i = 0; i < 8; i++)
5985	    arr[last - i] = 7 - i;
5986	  break;
5987	default:
5988	  abort ();
5989	}
5990    }
5991  emit_move_insn (pat, array_to_constant (TImode, arr));
5992  emit_insn (gen_shufb (ops[0], ops[1], sign, pat));
5993}
5994
5995/* Expand vector initialization.  If there are any constant parts, load
5996   the constant parts first, then load any non-constant parts.  */
5997void
5998spu_expand_vector_init (rtx target, rtx vals)
5999{
6000  machine_mode mode = GET_MODE (target);
6001  int n_elts = GET_MODE_NUNITS (mode);
6002  int n_var = 0;
6003  bool all_same = true;
6004  rtx first, x = NULL_RTX, first_constant = NULL_RTX;
6005  int i;
6006
6007  first = XVECEXP (vals, 0, 0);
6008  for (i = 0; i < n_elts; ++i)
6009    {
6010      x = XVECEXP (vals, 0, i);
6011      if (!(CONST_INT_P (x)
6012	    || GET_CODE (x) == CONST_DOUBLE
6013	    || GET_CODE (x) == CONST_FIXED))
6014	++n_var;
6015      else
6016	{
6017	  if (first_constant == NULL_RTX)
6018	    first_constant = x;
6019	}
6020      if (i > 0 && !rtx_equal_p (x, first))
6021	all_same = false;
6022    }
6023
6024  /* If all elements are the same, use splats to repeat the element.  */
6025  if (all_same)
6026    {
6027      if (!CONSTANT_P (first)
6028	  && !register_operand (first, GET_MODE (x)))
6029	first = force_reg (GET_MODE (first), first);
6030      emit_insn (gen_spu_splats (target, first));
6031      return;
6032    }
6033
6034  /* Load the constant parts.  */
6035  if (n_var != n_elts)
6036    {
6037      if (n_var == 0)
6038	{
6039	  emit_move_insn (target,
6040			  gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
6041	}
6042      else
6043	{
6044	  rtx constant_parts_rtx = copy_rtx (vals);
6045
6046	  gcc_assert (first_constant != NULL_RTX);
6047	  /* Fill empty slots with the first constant; this increases
6048	     our chance of using splats in the recursive call below.  */
6049	  for (i = 0; i < n_elts; ++i)
6050	    {
6051	      x = XVECEXP (constant_parts_rtx, 0, i);
6052	      if (!(CONST_INT_P (x)
6053		    || GET_CODE (x) == CONST_DOUBLE
6054		    || GET_CODE (x) == CONST_FIXED))
6055		XVECEXP (constant_parts_rtx, 0, i) = first_constant;
6056	    }
6057
6058	  spu_expand_vector_init (target, constant_parts_rtx);
6059	}
6060    }
6061
6062  /* Load the variable parts.  */
6063  if (n_var != 0)
6064    {
6065      rtx insert_operands[4];
6066
6067      insert_operands[0] = target;
6068      insert_operands[2] = target;
6069      for (i = 0; i < n_elts; ++i)
6070	{
6071	  x = XVECEXP (vals, 0, i);
6072	  if (!(CONST_INT_P (x)
6073		|| GET_CODE (x) == CONST_DOUBLE
6074		|| GET_CODE (x) == CONST_FIXED))
6075	    {
6076	      if (!register_operand (x, GET_MODE (x)))
6077		x = force_reg (GET_MODE (x), x);
6078	      insert_operands[1] = x;
6079	      insert_operands[3] = GEN_INT (i);
6080	      spu_builtin_insert (insert_operands);
6081	    }
6082	}
6083    }
6084}
6085
6086/* Return the insn code of the vector compare instruction for the given
6087   CODE, DEST_MODE and OP_MODE.  Return -1 if no valid insn is available.  */
6088
6089static int
6090get_vec_cmp_insn (enum rtx_code code,
6091                  machine_mode dest_mode,
6092                  machine_mode op_mode)
6093
6094{
6095  switch (code)
6096    {
6097    case EQ:
6098      if (dest_mode == V16QImode && op_mode == V16QImode)
6099        return CODE_FOR_ceq_v16qi;
6100      if (dest_mode == V8HImode && op_mode == V8HImode)
6101        return CODE_FOR_ceq_v8hi;
6102      if (dest_mode == V4SImode && op_mode == V4SImode)
6103        return CODE_FOR_ceq_v4si;
6104      if (dest_mode == V4SImode && op_mode == V4SFmode)
6105        return CODE_FOR_ceq_v4sf;
6106      if (dest_mode == V2DImode && op_mode == V2DFmode)
6107        return CODE_FOR_ceq_v2df;
6108      break;
6109    case GT:
6110      if (dest_mode == V16QImode && op_mode == V16QImode)
6111        return CODE_FOR_cgt_v16qi;
6112      if (dest_mode == V8HImode && op_mode == V8HImode)
6113        return CODE_FOR_cgt_v8hi;
6114      if (dest_mode == V4SImode && op_mode == V4SImode)
6115        return CODE_FOR_cgt_v4si;
6116      if (dest_mode == V4SImode && op_mode == V4SFmode)
6117        return CODE_FOR_cgt_v4sf;
6118      if (dest_mode == V2DImode && op_mode == V2DFmode)
6119        return CODE_FOR_cgt_v2df;
6120      break;
6121    case GTU:
6122      if (dest_mode == V16QImode && op_mode == V16QImode)
6123        return CODE_FOR_clgt_v16qi;
6124      if (dest_mode == V8HImode && op_mode == V8HImode)
6125        return CODE_FOR_clgt_v8hi;
6126      if (dest_mode == V4SImode && op_mode == V4SImode)
6127        return CODE_FOR_clgt_v4si;
6128      break;
6129    default:
6130      break;
6131    }
6132  return -1;
6133}
6134
6135/* Emit vector compare for operands OP0 and OP1 using code RCODE.
6136   DMODE is expected destination mode. This is a recursive function.  */
6137
6138static rtx
6139spu_emit_vector_compare (enum rtx_code rcode,
6140                         rtx op0, rtx op1,
6141                         machine_mode dmode)
6142{
6143  int vec_cmp_insn;
6144  rtx mask;
6145  machine_mode dest_mode;
6146  machine_mode op_mode = GET_MODE (op1);
6147
6148  gcc_assert (GET_MODE (op0) == GET_MODE (op1));
6149
6150  /* Single-precision floating point vector compare instructions use a
6151     V4SImode destination and double-precision ones use V2DImode.  Move
6152     the result to the appropriate mode later.  */
6153  if (dmode == V4SFmode)
6154    dest_mode = V4SImode;
6155  else if (dmode == V2DFmode)
6156    dest_mode = V2DImode;
6157  else
6158    dest_mode = dmode;
6159
6160  mask = gen_reg_rtx (dest_mode);
6161  vec_cmp_insn = get_vec_cmp_insn (rcode, dest_mode, op_mode);
6162
6163  if (vec_cmp_insn == -1)
6164    {
6165      bool swap_operands = false;
6166      bool try_again = false;
6167      switch (rcode)
6168        {
6169        case LT:
6170          rcode = GT;
6171          swap_operands = true;
6172          try_again = true;
6173          break;
6174        case LTU:
6175          rcode = GTU;
6176          swap_operands = true;
6177          try_again = true;
6178          break;
6179        case NE:
6180	case UNEQ:
6181	case UNLE:
6182	case UNLT:
6183	case UNGE:
6184	case UNGT:
6185	case UNORDERED:
6186          /* Treat A != B as ~(A==B).  */
6187          {
6188	    enum rtx_code rev_code;
6189            enum insn_code nor_code;
6190	    rtx rev_mask;
6191
6192	    rev_code = reverse_condition_maybe_unordered (rcode);
6193            rev_mask = spu_emit_vector_compare (rev_code, op0, op1, dest_mode);
6194
6195            nor_code = optab_handler (one_cmpl_optab, dest_mode);
6196            gcc_assert (nor_code != CODE_FOR_nothing);
6197            emit_insn (GEN_FCN (nor_code) (mask, rev_mask));
6198            if (dmode != dest_mode)
6199              {
6200                rtx temp = gen_reg_rtx (dest_mode);
6201                convert_move (temp, mask, 0);
6202                return temp;
6203              }
6204            return mask;
6205          }
6206          break;
6207        case GE:
6208        case GEU:
6209        case LE:
6210        case LEU:
6211          /* Try GT/GTU/LT/LTU OR EQ */
6212          {
6213            rtx c_rtx, eq_rtx;
6214            enum insn_code ior_code;
6215            enum rtx_code new_code;
6216
6217            switch (rcode)
6218              {
6219              case GE:  new_code = GT;  break;
6220              case GEU: new_code = GTU; break;
6221              case LE:  new_code = LT;  break;
6222              case LEU: new_code = LTU; break;
6223              default:
6224                gcc_unreachable ();
6225              }
6226
6227            c_rtx = spu_emit_vector_compare (new_code, op0, op1, dest_mode);
6228            eq_rtx = spu_emit_vector_compare (EQ, op0, op1, dest_mode);
6229
6230            ior_code = optab_handler (ior_optab, dest_mode);
6231            gcc_assert (ior_code != CODE_FOR_nothing);
6232            emit_insn (GEN_FCN (ior_code) (mask, c_rtx, eq_rtx));
6233            if (dmode != dest_mode)
6234              {
6235                rtx temp = gen_reg_rtx (dest_mode);
6236                convert_move (temp, mask, 0);
6237                return temp;
6238              }
6239            return mask;
6240          }
6241          break;
6242        case LTGT:
6243          /* Try LT OR GT */
6244          {
6245            rtx lt_rtx, gt_rtx;
6246            enum insn_code ior_code;
6247
6248            lt_rtx = spu_emit_vector_compare (LT, op0, op1, dest_mode);
6249            gt_rtx = spu_emit_vector_compare (GT, op0, op1, dest_mode);
6250
6251            ior_code = optab_handler (ior_optab, dest_mode);
6252            gcc_assert (ior_code != CODE_FOR_nothing);
6253            emit_insn (GEN_FCN (ior_code) (mask, lt_rtx, gt_rtx));
6254            if (dmode != dest_mode)
6255              {
6256                rtx temp = gen_reg_rtx (dest_mode);
6257                convert_move (temp, mask, 0);
6258                return temp;
6259              }
6260            return mask;
6261          }
6262          break;
6263        case ORDERED:
6264          /* Implement as (A==A) & (B==B) */
6265          {
6266            rtx a_rtx, b_rtx;
6267            enum insn_code and_code;
6268
6269            a_rtx = spu_emit_vector_compare (EQ, op0, op0, dest_mode);
6270            b_rtx = spu_emit_vector_compare (EQ, op1, op1, dest_mode);
6271
6272            and_code = optab_handler (and_optab, dest_mode);
6273            gcc_assert (and_code != CODE_FOR_nothing);
6274            emit_insn (GEN_FCN (and_code) (mask, a_rtx, b_rtx));
6275            if (dmode != dest_mode)
6276              {
6277                rtx temp = gen_reg_rtx (dest_mode);
6278                convert_move (temp, mask, 0);
6279                return temp;
6280              }
6281            return mask;
6282          }
6283          break;
6284        default:
6285          gcc_unreachable ();
6286        }
6287
6288      /* You only get two chances.  */
6289      if (try_again)
6290          vec_cmp_insn = get_vec_cmp_insn (rcode, dest_mode, op_mode);
6291
6292      gcc_assert (vec_cmp_insn != -1);
6293
6294      if (swap_operands)
6295        {
6296          rtx tmp;
6297          tmp = op0;
6298          op0 = op1;
6299          op1 = tmp;
6300        }
6301    }
6302
6303  emit_insn (GEN_FCN (vec_cmp_insn) (mask, op0, op1));
6304  if (dmode != dest_mode)
6305    {
6306      rtx temp = gen_reg_rtx (dest_mode);
6307      convert_move (temp, mask, 0);
6308      return temp;
6309    }
6310  return mask;
6311}
6312
6313
6314/* Emit vector conditional expression.
6315   DEST is destination. OP1 and OP2 are two VEC_COND_EXPR operands.
6316   CC_OP0 and CC_OP1 are the two operands for the relation operation COND.  */
6317
6318int
6319spu_emit_vector_cond_expr (rtx dest, rtx op1, rtx op2,
6320                           rtx cond, rtx cc_op0, rtx cc_op1)
6321{
6322  machine_mode dest_mode = GET_MODE (dest);
6323  enum rtx_code rcode = GET_CODE (cond);
6324  rtx mask;
6325
6326  /* Get the vector mask for the given relational operations.  */
6327  mask = spu_emit_vector_compare (rcode, cc_op0, cc_op1, dest_mode);
6328
6329  emit_insn (gen_selb (dest, op2, op1, mask));
6330
6331  return 1;
6332}
6333
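/* Copy OP into a register of MODE: constants and BLKmode values go through
   convert_to_mode, same-size modes are handled with a subreg, and anything
   else is moved with spu_convert.  */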
6334static rtx
6335spu_force_reg (machine_mode mode, rtx op)
6336{
6337  rtx x, r;
6338  if (GET_MODE (op) == VOIDmode || GET_MODE (op) == BLKmode)
6339    {
6340      if ((SCALAR_INT_MODE_P (mode) && GET_CODE (op) == CONST_INT)
6341	  || GET_MODE (op) == BLKmode)
6342	return force_reg (mode, convert_to_mode (mode, op, 0));
6343      abort ();
6344    }
6345
6346  r = force_reg (GET_MODE (op), op);
6347  if (GET_MODE_SIZE (GET_MODE (op)) == GET_MODE_SIZE (mode))
6348    {
6349      x = simplify_gen_subreg (mode, r, GET_MODE (op), 0);
6350      if (x)
6351	return x;
6352    }
6353
6354  x = gen_reg_rtx (mode);
6355  emit_insn (gen_spu_convert (x, r));
6356  return x;
6357}
6358
6359static void
6360spu_check_builtin_parm (struct spu_builtin_description *d, rtx op, int p)
6361{
6362  HOST_WIDE_INT v = 0;
6363  int lsbits;
6364  /* Check the range of immediate operands. */
6365  if (p >= SPU_BTI_7 && p <= SPU_BTI_U18)
6366    {
6367      int range = p - SPU_BTI_7;
6368
6369      if (!CONSTANT_P (op))
6370	error ("%s expects an integer literal in the range [%d, %d]",
6371	       d->name,
6372	       spu_builtin_range[range].low, spu_builtin_range[range].high);
6373
6374      if (GET_CODE (op) == CONST
6375	  && (GET_CODE (XEXP (op, 0)) == PLUS
6376	      || GET_CODE (XEXP (op, 0)) == MINUS))
6377	{
6378	  v = INTVAL (XEXP (XEXP (op, 0), 1));
6379	  op = XEXP (XEXP (op, 0), 0);
6380	}
6381      else if (GET_CODE (op) == CONST_INT)
6382	v = INTVAL (op);
6383      else if (GET_CODE (op) == CONST_VECTOR
6384	       && GET_CODE (CONST_VECTOR_ELT (op, 0)) == CONST_INT)
6385	v = INTVAL (CONST_VECTOR_ELT (op, 0));
6386
6387      /* The default for v is 0 which is valid in every range. */
6388      if (v < spu_builtin_range[range].low
6389	  || v > spu_builtin_range[range].high)
6390	error ("%s expects an integer literal in the range [%d, %d]. (%wd)",
6391	       d->name,
6392	       spu_builtin_range[range].low, spu_builtin_range[range].high,
6393	       v);
6394
6395      switch (p)
6396	{
6397	case SPU_BTI_S10_4:
6398	  lsbits = 4;
6399	  break;
6400	case SPU_BTI_U16_2:
6401	  /* This is only used in lqa, and stqa.  Even though the insns
6402	     encode 16 bits of the address (all but the 2 least
6403	     significant), only 14 bits are used because it is masked to
6404	     be 16 byte aligned. */
6405	  lsbits = 4;
6406	  break;
6407	case SPU_BTI_S16_2:
6408	  /* This is used for lqr and stqr. */
6409	  lsbits = 2;
6410	  break;
6411	default:
6412	  lsbits = 0;
6413	}
6414
6415      if (GET_CODE (op) == LABEL_REF
6416	  || (GET_CODE (op) == SYMBOL_REF
6417	      && SYMBOL_REF_FUNCTION_P (op))
6418	  || (v & ((1 << lsbits) - 1)) != 0)
6419	warning (0, "%d least significant bits of %s are ignored", lsbits,
6420		 d->name);
6421    }
6422}
6423
6424
6425static int
6426expand_builtin_args (struct spu_builtin_description *d, tree exp,
6427		     rtx target, rtx ops[])
6428{
6429  enum insn_code icode = (enum insn_code) d->icode;
6430  int i = 0, a;
6431
6432  /* Expand the arguments into rtl. */
6433
6434  if (d->parm[0] != SPU_BTI_VOID)
6435    ops[i++] = target;
6436
6437  for (a = 0; d->parm[a+1] != SPU_BTI_END_OF_PARAMS; i++, a++)
6438    {
6439      tree arg = CALL_EXPR_ARG (exp, a);
6440      if (arg == 0)
6441	abort ();
6442      ops[i] = expand_expr (arg, NULL_RTX, VOIDmode, EXPAND_NORMAL);
6443    }
6444
6445  gcc_assert (i == insn_data[icode].n_generator_args);
6446  return i;
6447}
6448
6449static rtx
6450spu_expand_builtin_1 (struct spu_builtin_description *d,
6451		      tree exp, rtx target)
6452{
6453  rtx pat;
6454  rtx ops[8];
6455  enum insn_code icode = (enum insn_code) d->icode;
6456  machine_mode mode, tmode;
6457  int i, p;
6458  int n_operands;
6459  tree return_type;
6460
6461  /* Set up ops[] with values from arglist. */
6462  n_operands = expand_builtin_args (d, exp, target, ops);
6463
6464  /* Handle the target operand which must be operand 0. */
6465  i = 0;
6466  if (d->parm[0] != SPU_BTI_VOID)
6467    {
6468
6469      /* We prefer the mode specified for the match_operand; otherwise
6470         use the mode from the builtin function prototype.  */
6471      tmode = insn_data[d->icode].operand[0].mode;
6472      if (tmode == VOIDmode)
6473	tmode = TYPE_MODE (spu_builtin_types[d->parm[0]]);
6474
6475      /* Try to use TARGET, because not using it can lead to extra
6476         copies, and when all of the registers are in use those extra
6477         copies lead to extra spills.  */
6478      if (target && GET_CODE (target) == REG && GET_MODE (target) == tmode)
6479	ops[0] = target;
6480      else
6481	target = ops[0] = gen_reg_rtx (tmode);
6482
6483      if (!(*insn_data[icode].operand[0].predicate) (ops[0], tmode))
6484	abort ();
6485
6486      i++;
6487    }
6488
6489  if (d->fcode == SPU_MASK_FOR_LOAD)
6490    {
6491      machine_mode mode = insn_data[icode].operand[1].mode;
6492      tree arg;
6493      rtx addr, op, pat;
6494
6495      /* Get the address.  */
6496      arg = CALL_EXPR_ARG (exp, 0);
6497      gcc_assert (POINTER_TYPE_P (TREE_TYPE (arg)));
6498      op = expand_expr (arg, NULL_RTX, Pmode, EXPAND_NORMAL);
6499      addr = memory_address (mode, op);
6500
6501      /* Negate the address.  */
6502      op = gen_reg_rtx (GET_MODE (addr));
6503      emit_insn (gen_rtx_SET (VOIDmode, op,
6504                 gen_rtx_NEG (GET_MODE (addr), addr)));
6505      op = gen_rtx_MEM (mode, op);
6506
6507      pat = GEN_FCN (icode) (target, op);
6508      if (!pat)
6509        return 0;
6510      emit_insn (pat);
6511      return target;
6512    }
6513
6514  /* Ignore align_hint, but still expand its args in case they have
6515     side effects.  */
6516  if (icode == CODE_FOR_spu_align_hint)
6517    return 0;
6518
6519  /* Handle the rest of the operands. */
6520  for (p = 1; i < n_operands; i++, p++)
6521    {
6522      if (insn_data[d->icode].operand[i].mode != VOIDmode)
6523	mode = insn_data[d->icode].operand[i].mode;
6524      else
6525	mode = TYPE_MODE (spu_builtin_types[d->parm[i]]);
6526
6527      /* MODE can be VOIDmode here for labels.  */
6528
6529      /* For specific intrinsics with an immediate operand, e.g.,
6530         si_ai(), we sometimes need to convert the scalar argument to a
6531         vector argument by splatting the scalar. */
6532      if (VECTOR_MODE_P (mode)
6533	  && (GET_CODE (ops[i]) == CONST_INT
6534	      || GET_MODE_CLASS (GET_MODE (ops[i])) == MODE_INT
6535	      || GET_MODE_CLASS (GET_MODE (ops[i])) == MODE_FLOAT))
6536	{
6537	  if (GET_CODE (ops[i]) == CONST_INT)
6538	    ops[i] = spu_const (mode, INTVAL (ops[i]));
6539	  else
6540	    {
6541	      rtx reg = gen_reg_rtx (mode);
6542	      machine_mode imode = GET_MODE_INNER (mode);
6543	      if (!spu_nonmem_operand (ops[i], GET_MODE (ops[i])))
6544		ops[i] = force_reg (GET_MODE (ops[i]), ops[i]);
6545	      if (imode != GET_MODE (ops[i]))
6546		ops[i] = convert_to_mode (imode, ops[i],
6547					  TYPE_UNSIGNED (spu_builtin_types
6548							 [d->parm[i]]));
6549	      emit_insn (gen_spu_splats (reg, ops[i]));
6550	      ops[i] = reg;
6551	    }
6552	}
6553
6554      spu_check_builtin_parm (d, ops[i], d->parm[p]);
6555
6556      if (!(*insn_data[icode].operand[i].predicate) (ops[i], mode))
6557	ops[i] = spu_force_reg (mode, ops[i]);
6558    }
6559
6560  switch (n_operands)
6561    {
6562    case 0:
6563      pat = GEN_FCN (icode) (0);
6564      break;
6565    case 1:
6566      pat = GEN_FCN (icode) (ops[0]);
6567      break;
6568    case 2:
6569      pat = GEN_FCN (icode) (ops[0], ops[1]);
6570      break;
6571    case 3:
6572      pat = GEN_FCN (icode) (ops[0], ops[1], ops[2]);
6573      break;
6574    case 4:
6575      pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3]);
6576      break;
6577    case 5:
6578      pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3], ops[4]);
6579      break;
6580    case 6:
6581      pat = GEN_FCN (icode) (ops[0], ops[1], ops[2], ops[3], ops[4], ops[5]);
6582      break;
6583    default:
6584      abort ();
6585    }
6586
6587  if (!pat)
6588    abort ();
6589
6590  if (d->type == B_CALL || d->type == B_BISLED)
6591    emit_call_insn (pat);
6592  else if (d->type == B_JUMP)
6593    {
6594      emit_jump_insn (pat);
6595      emit_barrier ();
6596    }
6597  else
6598    emit_insn (pat);
6599
6600  return_type = spu_builtin_types[d->parm[0]];
6601  if (d->parm[0] != SPU_BTI_VOID
6602      && GET_MODE (target) != TYPE_MODE (return_type))
6603    {
6604      /* TARGET is the return value.  It should always have the mode of
6605         the builtin function prototype.  */
6606      target = spu_force_reg (TYPE_MODE (return_type), target);
6607    }
6608
6609  return target;
6610}
6611
6612rtx
6613spu_expand_builtin (tree exp,
6614		    rtx target,
6615		    rtx subtarget ATTRIBUTE_UNUSED,
6616		    machine_mode mode ATTRIBUTE_UNUSED,
6617		    int ignore ATTRIBUTE_UNUSED)
6618{
6619  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6620  unsigned int fcode = DECL_FUNCTION_CODE (fndecl);
6621  struct spu_builtin_description *d;
6622
6623  if (fcode < NUM_SPU_BUILTINS)
6624    {
6625      d = &spu_builtins[fcode];
6626
6627      return spu_expand_builtin_1 (d, exp, target);
6628    }
6629  abort ();
6630}
6631
6632/* Implement targetm.vectorize.builtin_mask_for_load.  */
6633static tree
6634spu_builtin_mask_for_load (void)
6635{
6636  return spu_builtin_decls[SPU_MASK_FOR_LOAD];
6637}
6638
6639/* Implement targetm.vectorize.builtin_vectorization_cost.  */
6640static int
6641spu_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6642                                tree vectype,
6643                                int misalign ATTRIBUTE_UNUSED)
6644{
6645  unsigned elements;
6646
6647  switch (type_of_cost)
6648    {
6649      case scalar_stmt:
6650      case vector_stmt:
6651      case vector_load:
6652      case vector_store:
6653      case vec_to_scalar:
6654      case scalar_to_vec:
6655      case cond_branch_not_taken:
6656      case vec_perm:
6657      case vec_promote_demote:
6658        return 1;
6659
6660      case scalar_store:
6661        return 10;
6662
6663      case scalar_load:
6664        /* Load + rotate.  */
6665        return 2;
6666
6667      case unaligned_load:
6668        return 2;
6669
6670      case cond_branch_taken:
6671        return 6;
6672
6673      case vec_construct:
6674	elements = TYPE_VECTOR_SUBPARTS (vectype);
6675	return elements / 2 + 1;
6676
6677      default:
6678        gcc_unreachable ();
6679    }
6680}
6681
6682/* Implement targetm.vectorize.init_cost.  */
6683
6684static void *
6685spu_init_cost (struct loop *loop_info ATTRIBUTE_UNUSED)
6686{
6687  unsigned *cost = XNEWVEC (unsigned, 3);
6688  cost[vect_prologue] = cost[vect_body] = cost[vect_epilogue] = 0;
6689  return cost;
6690}
6691
6692/* Implement targetm.vectorize.add_stmt_cost.  */
6693
6694static unsigned
6695spu_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6696		   struct _stmt_vec_info *stmt_info, int misalign,
6697		   enum vect_cost_model_location where)
6698{
6699  unsigned *cost = (unsigned *) data;
6700  unsigned retval = 0;
6701
6702  if (flag_vect_cost_model)
6703    {
6704      tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6705      int stmt_cost = spu_builtin_vectorization_cost (kind, vectype, misalign);
6706
6707      /* Statements in an inner loop relative to the loop being
6708	 vectorized are weighted more heavily.  The value here is
6709	 arbitrary and could potentially be improved with analysis.  */
6710      if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6711	count *= 50;  /* FIXME.  */
6712
6713      retval = (unsigned) (count * stmt_cost);
6714      cost[where] += retval;
6715    }
6716
6717  return retval;
6718}
6719
6720/* Implement targetm.vectorize.finish_cost.  */
6721
6722static void
6723spu_finish_cost (void *data, unsigned *prologue_cost,
6724		 unsigned *body_cost, unsigned *epilogue_cost)
6725{
6726  unsigned *cost = (unsigned *) data;
6727  *prologue_cost = cost[vect_prologue];
6728  *body_cost     = cost[vect_body];
6729  *epilogue_cost = cost[vect_epilogue];
6730}
6731
6732/* Implement targetm.vectorize.destroy_cost_data.  */
6733
6734static void
6735spu_destroy_cost_data (void *data)
6736{
6737  free (data);
6738}
6739
6740/* Return true iff a data reference of TYPE can reach vector alignment (16)
6741   after applying N iterations.  This routine does not determine how many
6742   iterations are required to reach the desired alignment.  */
6743
6744static bool
6745spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed)
6746{
6747  if (is_packed)
6748    return false;
6749
6750  /* All other types are naturally aligned.  */
6751  return true;
6752}
6753
6754/* Return the appropriate mode for a named address pointer.  */
6755static machine_mode
6756spu_addr_space_pointer_mode (addr_space_t addrspace)
6757{
6758  switch (addrspace)
6759    {
6760    case ADDR_SPACE_GENERIC:
6761      return ptr_mode;
6762    case ADDR_SPACE_EA:
6763      return EAmode;
6764    default:
6765      gcc_unreachable ();
6766    }
6767}
6768
6769/* Return the appropriate mode for an address in a named address space.  */
6770static machine_mode
6771spu_addr_space_address_mode (addr_space_t addrspace)
6772{
6773  switch (addrspace)
6774    {
6775    case ADDR_SPACE_GENERIC:
6776      return Pmode;
6777    case ADDR_SPACE_EA:
6778      return EAmode;
6779    default:
6780      gcc_unreachable ();
6781    }
6782}
6783
6784/* Determine if one named address space is a subset of another.  */
6785
6786static bool
6787spu_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
6788{
6789  gcc_assert (subset == ADDR_SPACE_GENERIC || subset == ADDR_SPACE_EA);
6790  gcc_assert (superset == ADDR_SPACE_GENERIC || superset == ADDR_SPACE_EA);
6791
6792  if (subset == superset)
6793    return true;
6794
6795  /* If we have -mno-address-space-conversion, treat __ea and generic as not
6796     being subsets but instead as disjoint address spaces.  */
6797  else if (!TARGET_ADDRESS_SPACE_CONVERSION)
6798    return false;
6799
6800  else
6801    return (subset == ADDR_SPACE_GENERIC && superset == ADDR_SPACE_EA);
6802}
6803
6804/* Convert from one address space to another.  */
6805static rtx
6806spu_addr_space_convert (rtx op, tree from_type, tree to_type)
6807{
6808  addr_space_t from_as = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
6809  addr_space_t to_as = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
6810
6811  gcc_assert (from_as == ADDR_SPACE_GENERIC || from_as == ADDR_SPACE_EA);
6812  gcc_assert (to_as == ADDR_SPACE_GENERIC || to_as == ADDR_SPACE_EA);
6813
6814  if (to_as == ADDR_SPACE_GENERIC && from_as == ADDR_SPACE_EA)
6815    {
6816      rtx result, ls;
6817
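      /* An __ea pointer is turned into a local-store pointer by subtracting
	 the local-store base address stored in the __ea_local_store
	 variable.  */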
6818      ls = gen_const_mem (DImode,
6819			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
6820      set_mem_align (ls, 128);
6821
6822      result = gen_reg_rtx (Pmode);
6823      ls = force_reg (Pmode, convert_modes (Pmode, DImode, ls, 1));
6824      op = force_reg (Pmode, convert_modes (Pmode, EAmode, op, 1));
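      /* Keep NULL pointers NULL: if OP is zero, subtract zero instead of
	 the local-store base.  */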
6825      ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
6826					  ls, const0_rtx, Pmode, 1);
6827
6828      emit_insn (gen_subsi3 (result, op, ls));
6829
6830      return result;
6831    }
6832
6833  else if (to_as == ADDR_SPACE_EA && from_as == ADDR_SPACE_GENERIC)
6834    {
6835      rtx result, ls;
6836
6837      ls = gen_const_mem (DImode,
6838			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
6839      set_mem_align (ls, 128);
6840
6841      result = gen_reg_rtx (EAmode);
6842      ls = force_reg (EAmode, convert_modes (EAmode, DImode, ls, 1));
6843      op = force_reg (Pmode, op);
6844      ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
6845					  ls, const0_rtx, EAmode, 1);
6846      op = force_reg (EAmode, convert_modes (EAmode, Pmode, op, 1));
6847
6848      if (EAmode == SImode)
6849	emit_insn (gen_addsi3 (result, op, ls));
6850      else
6851	emit_insn (gen_adddi3 (result, op, ls));
6852
6853      return result;
6854    }
6855
6856  else
6857    gcc_unreachable ();
6858}
6859
6860
6861/* Count the total number of instructions in each pipe and return the
6862   maximum, which is used as the Minimum Iteration Interval (MII)
6863   in the modulo scheduler.  get_pipe() will return -2, -1, 0, or 1;
6864   -2 marks instructions that can go in either pipe0 or pipe1.  */
6865static int
6866spu_sms_res_mii (struct ddg *g)
6867{
6868  int i;
6869  unsigned t[4] = {0, 0, 0, 0};
6870
6871  for (i = 0; i < g->num_nodes; i++)
6872    {
6873      rtx_insn *insn = g->nodes[i].insn;
6874      int p = get_pipe (insn) + 2;
6875
6876      gcc_assert (p >= 0);
6877      gcc_assert (p < 4);
6878
6879      t[p]++;
6880      if (dump_file && INSN_P (insn))
6881            fprintf (dump_file, "i%d %s %d %d\n",
6882                     INSN_UID (insn),
6883                     insn_data[INSN_CODE(insn)].name,
6884                     p, t[p]);
6885    }
6886  if (dump_file)
6887    fprintf (dump_file, "%d %d %d %d\n", t[0], t[1], t[2], t[3]);
6888
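  /* The MII can be no smaller than either pipe's own instruction count,
     nor than half the number of instructions that need an issue slot in
     pipe0 or pipe1 (including those that can go down either pipe).  */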
6889  return MAX ((t[0] + t[2] + t[3] + 1) / 2, MAX (t[2], t[3]));
6890}
6891
6892
6893void
6894spu_init_expanders (void)
6895{
6896  if (cfun)
6897    {
6898      rtx r0, r1;
6899      /* The hard frame pointer register is only 128-bit aligned when
6900         frame_pointer_needed is true.  We don't know that until we're
6901         expanding the prologue. */
6902      REGNO_POINTER_ALIGN (HARD_FRAME_POINTER_REGNUM) = 8;
6903
6904      /* A number of passes use LAST_VIRTUAL_REGISTER+1 and
6905	 LAST_VIRTUAL_REGISTER+2 to test the back-end.  We want them
6906	 to be treated as aligned, so generate them here. */
6907      r0 = gen_reg_rtx (SImode);
6908      r1 = gen_reg_rtx (SImode);
6909      mark_reg_pointer (r0, 128);
6910      mark_reg_pointer (r1, 128);
6911      gcc_assert (REGNO (r0) == LAST_VIRTUAL_REGISTER + 1
6912		  && REGNO (r1) == LAST_VIRTUAL_REGISTER + 2);
6913    }
6914}
6915
6916static machine_mode
6917spu_libgcc_cmp_return_mode (void)
6918{
6919
6920  /* For SPU, word_mode is TImode, so it is better to use SImode
6921     for compare returns.  */
6922  return SImode;
6923}
6924
6925static machine_mode
6926spu_libgcc_shift_count_mode (void)
6927{
6928  /* For SPU, word_mode is TImode, so it is better to use SImode
6929     for shift counts.  */
6930  return SImode;
6931}
6932
6933/* Implement targetm.section_type_flags.  */
6934static unsigned int
6935spu_section_type_flags (tree decl, const char *name, int reloc)
6936{
6937  /* .toe needs to have type @nobits.  */
6938  if (strcmp (name, ".toe") == 0)
6939    return SECTION_BSS;
6940  /* Don't load _ea into the current address space.  */
6941  if (strcmp (name, "._ea") == 0)
6942    return SECTION_WRITE | SECTION_DEBUG;
6943  return default_section_type_flags (decl, name, reloc);
6944}
6945
6946/* Implement targetm.select_section.  */
6947static section *
6948spu_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align)
6949{
6950  /* Variables and constants defined in the __ea address space
6951     go into a special section named "._ea".  */
6952  if (TREE_TYPE (decl) != error_mark_node
6953      && TYPE_ADDR_SPACE (TREE_TYPE (decl)) == ADDR_SPACE_EA)
6954    {
6955      /* We might get called with string constants, but get_named_section
6956	 doesn't like them as they are not DECLs.  Also, we need to set
6957	 flags in that case.  */
6958      if (!DECL_P (decl))
6959	return get_section ("._ea", SECTION_WRITE | SECTION_DEBUG, NULL);
6960
6961      return get_named_section (decl, "._ea", reloc);
6962    }
6963
6964  return default_elf_select_section (decl, reloc, align);
6965}
6966
6967/* Implement targetm.unique_section.  */
6968static void
6969spu_unique_section (tree decl, int reloc)
6970{
6971  /* We don't support unique section names in the __ea address
6972     space for now.  */
6973  if (TREE_TYPE (decl) != error_mark_node
6974      && TYPE_ADDR_SPACE (TREE_TYPE (decl)) != 0)
6975    return;
6976
6977  default_unique_section (decl, reloc);
6978}
6979
6980/* Generate a constant or register which contains 2^SCALE.  We assume
6981   the result is valid for MODE.  Currently, MODE must be V4SFmode and
6982   SCALE must be SImode. */
6983rtx
6984spu_gen_exp2 (machine_mode mode, rtx scale)
6985{
6986  gcc_assert (mode == V4SFmode);
6987  gcc_assert (GET_MODE (scale) == SImode || GET_CODE (scale) == CONST_INT);
6988  if (GET_CODE (scale) != CONST_INT)
6989    {
6990      /* unsigned int exp = (127 + scale) << 23;
6991	__vector float m = (__vector float) spu_splats (exp); */
6992      rtx reg = force_reg (SImode, scale);
6993      rtx exp = gen_reg_rtx (SImode);
6994      rtx mul = gen_reg_rtx (mode);
6995      emit_insn (gen_addsi3 (exp, reg, GEN_INT (127)));
6996      emit_insn (gen_ashlsi3 (exp, exp, GEN_INT (23)));
6997      emit_insn (gen_spu_splats (mul, gen_rtx_SUBREG (GET_MODE_INNER (mode), exp, 0)));
6998      return mul;
6999    }
7000  else
7001    {
7002      HOST_WIDE_INT exp = 127 + INTVAL (scale);
7003      unsigned char arr[16];
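      /* Build the IEEE single-precision bit pattern of 2.0**SCALE byte by
	 byte: the biased exponent (127 + SCALE) sits in bits 30..23 of each
	 element, so its high seven bits go in byte 0 and its low bit in the
	 top of byte 1; the mantissa bytes are zero.  */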
7004      arr[0] = arr[4] = arr[8] = arr[12] = exp >> 1;
7005      arr[1] = arr[5] = arr[9] = arr[13] = exp << 7;
7006      arr[2] = arr[6] = arr[10] = arr[14] = 0;
7007      arr[3] = arr[7] = arr[11] = arr[15] = 0;
7008      return array_to_constant (mode, arr);
7009    }
7010}
7011
7012/* After reload, just change the convert into a move instruction
7013   or a dead instruction. */
7014void
7015spu_split_convert (rtx ops[])
7016{
7017  if (REGNO (ops[0]) == REGNO (ops[1]))
7018    emit_note (NOTE_INSN_DELETED);
7019  else
7020    {
7021      /* Use TImode always as this might help hard reg copyprop.  */
7022      rtx op0 = gen_rtx_REG (TImode, REGNO (ops[0]));
7023      rtx op1 = gen_rtx_REG (TImode, REGNO (ops[1]));
7024      emit_insn (gen_move_insn (op0, op1));
7025    }
7026}
7027
7028void
7029spu_function_profiler (FILE * file, int labelno ATTRIBUTE_UNUSED)
7030{
7031  fprintf (file, "# profile\n");
7032  fprintf (file, "brsl $75,  _mcount\n");
7033}
7034
7035/* Implement targetm.ref_may_alias_errno.  */
7036static bool
7037spu_ref_may_alias_errno (ao_ref *ref)
7038{
7039  tree base = ao_ref_base (ref);
7040
7041  /* With SPU newlib, errno is defined as something like
7042         _impure_data._errno
7043     The default implementation of this target hook does not
7044     recognize such expressions, so special-case it here.  */
7045
7046  if (TREE_CODE (base) == VAR_DECL
7047      && !TREE_STATIC (base)
7048      && DECL_EXTERNAL (base)
7049      && TREE_CODE (TREE_TYPE (base)) == RECORD_TYPE
7050      && strcmp (IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (base)),
7051		 "_impure_data") == 0
7052      /* _errno is the first member of _impure_data.  */
7053      && ref->offset == 0)
7054    return true;
7055
7056  return default_ref_may_alias_errno (ref);
7057}
7058
7059/* Output thunk to FILE that implements a C++ virtual function call (with
7060   multiple inheritance) to FUNCTION.  The thunk adjusts the this pointer
7061   by DELTA, and unless VCALL_OFFSET is zero, applies an additional adjustment
7062   stored at VCALL_OFFSET in the vtable whose address is located at offset 0
7063   relative to the resulting this pointer.  */
7064
7065static void
7066spu_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
7067		     HOST_WIDE_INT delta, HOST_WIDE_INT vcall_offset,
7068		     tree function)
7069{
7070  rtx op[8];
7071
7072  /* Make sure unwind info is emitted for the thunk if needed.  */
7073  final_start_function (emit_barrier (), file, 1);
7074
7075  /* Operand 0 is the target function.  */
7076  op[0] = XEXP (DECL_RTL (function), 0);
7077
7078  /* Operand 1 is the 'this' pointer.  */
7079  if (aggregate_value_p (TREE_TYPE (TREE_TYPE (function)), function))
7080    op[1] = gen_rtx_REG (Pmode, FIRST_ARG_REGNUM + 1);
7081  else
7082    op[1] = gen_rtx_REG (Pmode, FIRST_ARG_REGNUM);
7083
7084  /* Operands 2/3 are the low/high halfwords of delta.  */
7085  op[2] = GEN_INT (trunc_int_for_mode (delta, HImode));
7086  op[3] = GEN_INT (trunc_int_for_mode (delta >> 16, HImode));
7087
7088  /* Operands 4/5 are the low/high halfwords of vcall_offset.  */
7089  op[4] = GEN_INT (trunc_int_for_mode (vcall_offset, HImode));
7090  op[5] = GEN_INT (trunc_int_for_mode (vcall_offset >> 16, HImode));
7091
7092  /* Operands 6/7 are temporary registers.  */
7093  op[6] = gen_rtx_REG (Pmode, 79);
7094  op[7] = gen_rtx_REG (Pmode, 78);
7095
7096  /* Add DELTA to this pointer.  */
7097  if (delta)
7098    {
7099      if (delta >= -0x200 && delta < 0x200)
7100	output_asm_insn ("ai\t%1,%1,%2", op);
7101      else if (delta >= -0x8000 && delta < 0x8000)
7102	{
7103	  output_asm_insn ("il\t%6,%2", op);
7104	  output_asm_insn ("a\t%1,%1,%6", op);
7105	}
7106      else
7107	{
7108	  output_asm_insn ("ilhu\t%6,%3", op);
7109	  output_asm_insn ("iohl\t%6,%2", op);
7110	  output_asm_insn ("a\t%1,%1,%6", op);
7111	}
7112    }
7113
7114  /* Perform vcall adjustment.  */
7115  if (vcall_offset)
7116    {
7117      output_asm_insn ("lqd\t%7,0(%1)", op);
7118      output_asm_insn ("rotqby\t%7,%7,%1", op);
7119
7120      if (vcall_offset >= -0x200 && vcall_offset < 0x200)
7121	output_asm_insn ("ai\t%7,%7,%4", op);
7122      else if (vcall_offset >= -0x8000 && vcall_offset < 0x8000)
7123	{
7124	  output_asm_insn ("il\t%6,%4", op);
7125	  output_asm_insn ("a\t%7,%7,%6", op);
7126	}
7127      else
7128	{
7129	  output_asm_insn ("ilhu\t%6,%5", op);
7130	  output_asm_insn ("iohl\t%6,%4", op);
7131	  output_asm_insn ("a\t%7,%7,%6", op);
7132	}
7133
7134      output_asm_insn ("lqd\t%6,0(%7)", op);
7135      output_asm_insn ("rotqby\t%6,%6,%7", op);
7136      output_asm_insn ("a\t%1,%1,%6", op);
7137    }
7138
7139  /* Jump to target.  */
7140  output_asm_insn ("br\t%0", op);
7141
7142  final_end_function ();
7143}
7144
7145/* Canonicalize a comparison from one we don't have to one we do have.  */
7146static void
7147spu_canonicalize_comparison (int *code, rtx *op0, rtx *op1,
7148			     bool op0_preserve_value)
7149{
7150  if (!op0_preserve_value
7151      && (*code == LE || *code == LT || *code == LEU || *code == LTU))
7152    {
7153      rtx tem = *op0;
7154      *op0 = *op1;
7155      *op1 = tem;
7156      *code = (int)swap_condition ((enum rtx_code)*code);
7157    }
7158}
7159
7160/*  Table of machine attributes.  */
7161static const struct attribute_spec spu_attribute_table[] =
7162{
7163  /* { name, min_len, max_len, decl_req, type_req, fn_type_req, handler,
7164       affects_type_identity } */
7165  { "naked",          0, 0, true,  false, false, spu_handle_fndecl_attribute,
7166    false },
7167  { "spu_vector",     0, 0, false, true,  false, spu_handle_vector_attribute,
7168    false },
7169  { NULL,             0, 0, false, false, false, NULL, false }
7170};

/* TARGET overrides.  */

#undef TARGET_ADDR_SPACE_POINTER_MODE
#define TARGET_ADDR_SPACE_POINTER_MODE spu_addr_space_pointer_mode

#undef TARGET_ADDR_SPACE_ADDRESS_MODE
#define TARGET_ADDR_SPACE_ADDRESS_MODE spu_addr_space_address_mode

#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
  spu_addr_space_legitimate_address_p

#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS spu_addr_space_legitimize_address

#undef TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P spu_addr_space_subset_p

#undef TARGET_ADDR_SPACE_CONVERT
#define TARGET_ADDR_SPACE_CONVERT spu_addr_space_convert

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS spu_init_builtins
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL spu_builtin_decl

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN spu_expand_builtin

#undef TARGET_UNWIND_WORD_MODE
#define TARGET_UNWIND_WORD_MODE spu_unwind_word_mode

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS spu_legitimize_address

/* The current assembler doesn't like .4byte foo@ppu, so use the normal .long
   and .quad for the debugger.  Once the assembler is known to be fixed,
   these overrides can be removed.  */
#undef TARGET_ASM_UNALIGNED_SI_OP
#define TARGET_ASM_UNALIGNED_SI_OP	"\t.long\t"

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP	"\t.quad\t"

/* The .8byte directive doesn't seem to work well for a 32-bit
   architecture.  */
#undef TARGET_ASM_UNALIGNED_DI_OP
#define TARGET_ASM_UNALIGNED_DI_OP NULL

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS spu_rtx_costs

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST hook_int_rtx_mode_as_bool_0

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE spu_sched_issue_rate

#undef TARGET_SCHED_INIT_GLOBAL
#define TARGET_SCHED_INIT_GLOBAL spu_sched_init_global

#undef TARGET_SCHED_INIT
#define TARGET_SCHED_INIT spu_sched_init

#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE spu_sched_variable_issue

#undef TARGET_SCHED_REORDER
#define TARGET_SCHED_REORDER spu_sched_reorder

#undef TARGET_SCHED_REORDER2
#define TARGET_SCHED_REORDER2 spu_sched_reorder

#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST spu_sched_adjust_cost

#undef  TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE spu_attribute_table

#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER spu_assemble_integer

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P	spu_scalar_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P	spu_vector_mode_supported_p

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL spu_function_ok_for_sibcall

#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL spu_asm_globalize_label

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE spu_pass_by_reference

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG spu_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE spu_function_arg_advance

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST spu_build_builtin_va_list

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START spu_va_start

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS spu_setup_incoming_varargs

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG spu_machine_dependent_reorg

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR spu_gimplify_va_arg_expr

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS spu_init_libfuncs

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY spu_return_in_memory

#undef  TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO spu_encode_section_info

#undef TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD
#define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD spu_builtin_mask_for_load

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST spu_builtin_vectorization_cost

#undef TARGET_VECTORIZE_INIT_COST
#define TARGET_VECTORIZE_INIT_COST spu_init_cost

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST spu_add_stmt_cost

#undef TARGET_VECTORIZE_FINISH_COST
#define TARGET_VECTORIZE_FINISH_COST spu_finish_cost

#undef TARGET_VECTORIZE_DESTROY_COST_DATA
#define TARGET_VECTORIZE_DESTROY_COST_DATA spu_destroy_cost_data

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_SHIFT_COUNT_MODE
#define TARGET_LIBGCC_SHIFT_COUNT_MODE spu_libgcc_shift_count_mode

#undef TARGET_SCHED_SMS_RES_MII
#define TARGET_SCHED_SMS_RES_MII spu_sms_res_mii

#undef TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS spu_section_type_flags

#undef TARGET_ASM_SELECT_SECTION
#define TARGET_ASM_SELECT_SECTION  spu_select_section

#undef TARGET_ASM_UNIQUE_SECTION
#define TARGET_ASM_UNIQUE_SECTION  spu_unique_section

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P spu_legitimate_address_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P spu_legitimate_constant_p

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT spu_trampoline_init

#undef TARGET_WARN_FUNC_RETURN
#define TARGET_WARN_FUNC_RETURN spu_warn_func_return

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE spu_option_override

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE spu_conditional_register_usage

#undef TARGET_REF_MAY_ALIAS_ERRNO
#define TARGET_REF_MAY_ALIAS_ERRNO spu_ref_may_alias_errno

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK spu_output_mi_thunk
#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK hook_bool_const_tree_hwi_hwi_const_tree_true

/* Variable tracking should be run after all optimizations that
   change the order of insns.  It also needs a valid CFG.  */
#undef TARGET_DELAY_VARTRACK
#define TARGET_DELAY_VARTRACK true

#undef TARGET_CANONICALIZE_COMPARISON
#define TARGET_CANONICALIZE_COMPARISON spu_canonicalize_comparison

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-spu.h"
