/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2020 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}
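/* A minimal sketch of how this value is typically consumed: the function is
   normally installed as the issue-rate scheduler hook (assuming the usual
   target-macro convention; the actual definition lives with the other
   TARGET_SCHED_* macros in the port):

     #undef TARGET_SCHED_ISSUE_RATE
     #define TARGET_SCHED_ISSUE_RATE ix86_issue_rate  */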

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */
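/* A hypothetical example of such a pair at the assembly level:

     set_insn:  addl $4, %ebx        ; writes %ebx
     use_insn:  movl (%ebx), %eax    ; memory address depends on %ebx

   On in-order cores such as the original Pentium this pattern causes an
   Address Generation Interlock (AGI) stall.  */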

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
        rtx addr = XEXP (recog_data.operand[i], 0);
        if (modified_in_p (addr, set_insn) != 0)
          {
            /* No AGI stall if SET_INSN is a push or pop and USE_INSN
               has SP based memory (unless index reg is modified in a pop).  */
            rtx set = single_set (set_insn);
            if (set
                && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
                    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
              {
                struct ix86_address parts;
                if (ix86_decompose_address (addr, &parts)
                    && parts.base == stack_pointer_rtx
                    && (parts.index == NULL_RTX
                        || MEM_P (SET_DEST (set))
                        || !modified_in_p (parts.index, set_insn)))
                  return false;
              }
            return true;
          }
        return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */
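/* For instance (a hypothetical pair), DEP_INSN "cmpl %eax, %ebx" sets only
   the flags, and INSN "jne .L1" or "sete %cl" reads only those flags; such
   compare + jump/setcc pairs are what the Pentium pairing logic below
   treats as free.  */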

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
           && XVECLEN (PATTERN (dep_insn), 0) == 2
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
           && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
        return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case CC0:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
        {
        case 'e':
          if (exact_dependency_1 (addr, XEXP (insn, i)))
            return true;
          break;
        case 'E':
          for (j = 0; j < XVECLEN (insn, i); j++)
            if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
              return true;
          break;
        }
    }
  return false;
}

/* Return true if there is an exact dependency between the store and the
   load, i.e. the same memory address is used in both.  */
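/* For example (hypothetical insns), the store "movw %ax, 6(%esp)" followed
   by the load "movw 6(%esp), %cx" uses the same memory reference in both,
   which is the store-forwarding pattern this predicate looks for.  */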
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */
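/* For instance (a hypothetical dependence), with

     dep_insn:  imull %esi, %edi       ; produces %edi
     insn:      addl 16(%esp), %edi    ; load + op, address does not use %edi

   INSN's load can start while DEP_INSN is still executing, since %edi is
   needed only for the add itself, so several cases below reduce the cost;
   had %edi been part of INSN's memory address (see ix86_agi_dependent),
   the full latency would be kept.  */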

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
                  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
        {
          rtx addr = PATTERN (insn);

          if (GET_CODE (addr) == PARALLEL)
            addr = XVECEXP (addr, 0, 0);

          gcc_assert (GET_CODE (addr) == SET);

          addr = SET_SRC (addr);
          if (modified_in_p (addr, dep_insn))
            cost += 1;
        }
      else if (ix86_agi_dependent (dep_insn, insn))
        cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
        cost = 0;

      /* Floating point stores require the value to be ready one cycle
         earlier.  */
      if (insn_type == TYPE_FMOV
          && get_attr_memory (insn) == MEMORY_STORE
          && !ix86_agi_dependent (dep_insn, insn))
        cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
          && (set = single_set (dep_insn)) != NULL_RTX
          && (set2 = single_set (insn)) != NULL_RTX
          && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
          && MEM_P (SET_DEST (set2)))
        cost += 1;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 1)
            cost--;
        }
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
         the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
        cost += 5;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          /* Claim moves take one cycle, as the core can issue one load at a
             time and the next load can start a cycle later.  */
          if (dep_insn_type == TYPE_IMOV
              || dep_insn_type == TYPE_FMOV)
            cost = 1;
          else if (cost > 2)
            cost -= 2;
          else
            cost = 1;
        }
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost = 3;

          /* Because of the difference between the length of the integer and
             floating point unit pipeline preparation stages, the memory
             operands for floating point are cheaper.

             ??? For Athlon the difference is most probably 2.  */
          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 3;
          else
            loadcost = TARGET_ATHLON ? 2 : 0;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          enum attr_unit unit = get_attr_unit (insn);
          int loadcost;

          if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
            loadcost = 4;
          else
            loadcost = 7;

          if (cost >= loadcost)
            cost -= loadcost;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
         in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
          && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
        return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
         by executing it in parallel with the previous instruction when the
         previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
          && !ix86_agi_dependent (dep_insn, insn))
        {
          if (cost >= 4)
            cost -= 4;
          else
            cost = 0;
        }
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
        return cost;

      /* Increase the cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
        {
          enum attr_unit unit = get_attr_unit (dep_insn);
          if (unit == UNIT_INTEGER && cost == 1)
            {
              if (memory == MEMORY_LOAD)
                cost = 3;
              else
                {
                  /* Increase the cost of load/store only for short integer
                     types, because of the store forwarding issue.  */
                  rtx set = single_set (dep_insn);
                  if (set && (GET_MODE (SET_DEST (set)) == QImode
                              || GET_MODE (SET_DEST (set)) == HImode))
                    {
                      /* Increase the cost if an exact store/load dependence
                         exists and INSN is a load.  */
                      enum attr_memory insn_memory = get_attr_memory (insn);
                      if (insn_memory == MEMORY_LOAD
                          && exact_store_load_dependency (dep_insn, insn))
                        cost = 3;
                    }
                }
            }
        }
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */
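/* A minimal sketch of the usual hook wiring (assuming the standard
   target-macro convention; the actual #define lives with the other
   TARGET_SCHED_* macros in the port):

     #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
     #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
       ia32_multipass_dfa_lookahead  */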

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far as the
     number of instructions that can be executed in one cycle, i.e.,
     issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */
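/* Roughly speaking (the exact rules are per-microarchitecture), pairs such as

     cmpl $1, %eax        testl %edx, %edx        decl %ecx
     jne  .L1             je    .L2               jnz  .L3

   can fuse, while a compare between memory and an immediate, a compare with
   a RIP-relative address, or inc/dec followed by an unsigned-condition jump
   cannot; those restrictions are what the checks below implement.  */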

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
         a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
           && condgen_type != TYPE_ICMP
           && condgen_type != TYPE_INCDEC
           && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
        if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
          {
            rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
            if (GET_CODE (set_src) == COMPARE)
              compare_set = XVECEXP (pat, 0, i);
            else
              alu_set = XVECEXP (pat, 0, i);
          }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
        return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}