x86-tune-sched.c revision 1.1.1.1
/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2018 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "recog.h"
#include "target.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}
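
/* Illustrative note (not from the original sources): this value feeds the
   TARGET_SCHED_ISSUE_RATE hook and, through ia32_multipass_dfa_lookahead
   below, bounds how many candidate instructions the haifa scheduler
   considers per simulated cycle after reload.  */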

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
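/* Illustrative example (not from the original sources): with SET_INSN
   "addl $8, %ebx" and USE_INSN "movl (%ebx), %eax", the load address uses
   the freshly written %ebx -- the classic address-generation-interlock
   (AGI) pattern penalized for Pentium/Lakemont in ix86_adjust_cost.  */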

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */
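/* Illustrative example (not from the original sources): DEP_INSN
   "cmpl %esi, %edi" writes only the flags and INSN "jge .L2" reads only
   the flags, so the PROCESSOR_PENTIUM case in ix86_adjust_cost can pair
   them and drop the cost to zero.  */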

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if addr is found in insn.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there is an exact dependency between STORE and LOAD, i.e.
   the same memory address is used in both.  */
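/* Illustrative example (not from the original sources): STORE
   "movw %ax, 4(%esp)" followed by LOAD "movzwl 4(%esp), %ecx" references
   the identical memory address, which is the store-forwarding pattern
   penalized for QImode/HImode in ix86_adjust_cost below.  */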
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}

/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
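/* Illustrative example (not from the original sources): for INSN
   "addl 12(%eax), %edx", a DEP_INSN producing the address register %eax must
   be ready a full load latency before the addition, whereas a DEP_INSN
   producing only %edx can overlap with the address computation and the load;
   hence most per-CPU cases below subtract an estimated load latency from
   COST when ix86_agi_dependent returns false.  */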

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load
             at a time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of integer and
             floating unit pipeline preparation stages, the memory operands
             for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase cost of ld/st for short int types only
                     because of the store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase cost of the store/load pair if an exact
                         dependence exists and INSN is the load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
     number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".  */
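/* Illustrative example (not from the original sources): on cores with
   TARGET_FUSE_CMP_AND_BRANCH, the pair "cmpl $100, %eax" + "jl .L3" can
   decode as a single fused compare-and-branch uop, so the scheduler keeps
   CONDGEN and CONDJMP adjacent whenever this predicate returns true.  */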

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  if (get_attr_type (condgen) != TYPE_TEST
      && get_attr_type (condgen) != TYPE_ICMP
      && get_attr_type (condgen) != TYPE_INCDEC
      && get_attr_type (condgen) != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX
      && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0))
       && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1))
          && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE
          || ccode == GT
          || ccode == LE
          || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (get_attr_type (condgen) == TYPE_TEST
      || get_attr_type (condgen) == TYPE_ICMP)
    return true;

  /* The following handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (get_attr_type (condgen) == TYPE_INCDEC
      && (ccode == GEU
          || ccode == GTU
          || ccode == LEU
          || ccode == LTU))
    return false;

  return true;
}