/* Scheduler hooks for IA-32 which implement CPU specific logic.
   Copyright (C) 1988-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.

GCC is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "tm_p.h"
#include "target.h"
#include "insn-config.h"
#include "insn-attr.h"
#include "insn-opinit.h"
#include "recog.h"

/* Return the maximum number of instructions a cpu can issue.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
      return 3;

    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_GENERIC:
      return 4;

    default:
      return 1;
    }
}

/* Return true iff USE_INSN has a memory address with operands set by
   SET_INSN.  */

bool
ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
{
  int i;
  extract_insn_cached (use_insn);
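  /* Scan the operands of USE_INSN for a memory operand whose address is
     modified by SET_INSN.  */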
  for (i = recog_data.n_operands - 1; i >= 0; --i)
    if (MEM_P (recog_data.operand[i]))
      {
	rtx addr = XEXP (recog_data.operand[i], 0);
	if (modified_in_p (addr, set_insn) != 0)
	  {
	    /* No AGI stall if SET_INSN is a push or pop and USE_INSN
	       has SP based memory (unless index reg is modified in a pop).  */
	    rtx set = single_set (set_insn);
	    if (set
		&& (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
		    || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
	      {
		struct ix86_address parts;
		if (ix86_decompose_address (addr, &parts)
		    && parts.base == stack_pointer_rtx
		    && (parts.index == NULL_RTX
			|| MEM_P (SET_DEST (set))
			|| !modified_in_p (parts.index, set_insn)))
		  return false;
	      }
	    return true;
	  }
	return false;
      }
  return false;
}

/* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
   by DEP_INSN and nothing else set by DEP_INSN.  */

static bool
ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
{
  rtx set, set2;

  /* Simplify the test for uninteresting insns.  */
  if (insn_type != TYPE_SETCC
      && insn_type != TYPE_ICMOV
      && insn_type != TYPE_FCMOV
      && insn_type != TYPE_IBR)
    return false;

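  /* Find what DEP_INSN writes: either a single SET destination, or the two
     destinations of a two element PARALLEL of SETs.  */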
  if ((set = single_set (dep_insn)) != 0)
    {
      set = SET_DEST (set);
      set2 = NULL_RTX;
    }
  else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
	   && XVECLEN (PATTERN (dep_insn), 0) == 2
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
	   && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
    {
      set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
      set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
    }
  else
    return false;

  if (!REG_P (set) || REGNO (set) != FLAGS_REG)
    return false;

  /* This test is true if the dependent insn reads the flags but
     not any other potentially set register.  */
  if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
    return false;

  if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
    return false;

  return true;
}

/* Helper function for exact_store_load_dependency.
   Return true if ADDR is found in INSN.  */
static bool
exact_dependency_1 (rtx addr, rtx insn)
{
  enum rtx_code code;
  const char *format_ptr;
  int i, j;

  code = GET_CODE (insn);
  switch (code)
    {
    case MEM:
      if (rtx_equal_p (addr, insn))
	return true;
      break;
    case REG:
    CASE_CONST_ANY:
    case SYMBOL_REF:
    case CODE_LABEL:
    case PC:
    case EXPR_LIST:
      return false;
    default:
      break;
    }

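  /* Recurse into the operands of INSN looking for a MEM that matches ADDR.  */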
  format_ptr = GET_RTX_FORMAT (code);
  for (i = 0; i < GET_RTX_LENGTH (code); i++)
    {
      switch (*format_ptr++)
	{
	case 'e':
	  if (exact_dependency_1 (addr, XEXP (insn, i)))
	    return true;
	  break;
	case 'E':
	  for (j = 0; j < XVECLEN (insn, i); j++)
	    if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
	      return true;
	  break;
	}
    }
  return false;
}

/* Return true if there exists an exact dependency between STORE and LOAD,
   i.e. the same memory address is used in both.  */
static bool
exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
{
  rtx set1, set2;

  set1 = single_set (store);
  if (!set1)
    return false;
  if (!MEM_P (SET_DEST (set1)))
    return false;
  set2 = single_set (load);
  if (!set2)
    return false;
  if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
    return true;
  return false;
}


/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute the address of a memory operand need to be ready
   earlier than values of registers used in the actual operation.  */

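/* For example, in the Znver cases below, when INSN loads from memory and the
   value produced by DEP_INSN is not needed to compute the address, the load
   can start while DEP_INSN is still executing, so the dependence cost is
   reduced by the modelled load latency.  */
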
int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
		  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
	{
	  rtx addr = PATTERN (insn);

	  if (GET_CODE (addr) == PARALLEL)
	    addr = XVECEXP (addr, 0, 0);

	  gcc_assert (GET_CODE (addr) == SET);

	  addr = SET_SRC (addr);
	  if (modified_in_p (addr, dep_insn))
	    cost += 1;
	}
      else if (ix86_agi_dependent (dep_insn, insn))
	cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
	cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
	  && get_attr_memory (insn) == MEMORY_STORE
	  && !ix86_agi_dependent (dep_insn, insn))
	cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
	  && (set = single_set (dep_insn)) != NULL_RTX
	  && (set2 = single_set (insn)) != NULL_RTX
	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
	  && MEM_P (SET_DEST (set2)))
	cost += 1;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
	 by executing it in parallel with the previous instruction when the
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 1)
	    cost--;
	}
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
	 the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
	 by executing it in parallel with the previous instruction when the
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as the core can issue one load
	     at a time and the next load can start a cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 2)
	    cost -= 2;
	  else
	    cost = 1;
	}
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
	 by executing it in parallel with the previous instruction when the
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost = 3;

	  /* Because of the difference between the length of the integer and
	     floating point unit pipeline preparation stages, the memory
	     operands for floating point are cheaper.

	     ??? For Athlon the difference is most probably 2.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 3;
	  else
	    loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
	 by executing it in parallel with the previous instruction when the
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost;

	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 4;
	  else
	    loadcost = 7;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_GENERIC:
      /* The stack engine allows push and pop instructions to execute
	 in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show the ability of the reorder buffer to hide the latency of a load
	 by executing it in parallel with the previous instruction when the
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  if (cost >= 4)
	    cost -= 4;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SILVERMONT:
    case PROCESSOR_KNL:
    case PROCESSOR_KNM:
    case PROCESSOR_INTEL:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase cost of ld/st for short int types only
		     because of the store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase the cost of the store/load pair if an
			 exact dependence exists and INSN is the load.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}
      break;

    default:
      break;
    }

  return cost;
}

/* How many alternative schedules to try.  This should be as wide as the
   scheduling freedom in the DFA, but no wider.  Making this value too
   large results in extra work for the scheduler.  */

int
ia32_multipass_dfa_lookahead (void)
{
  /* Generally, we want haifa-sched:max_issue() to look ahead as far
     as the number of instructions that can be executed in a cycle,
     i.e., issue_rate.  */
  if (reload_completed)
    return ix86_issue_rate ();
  /* Don't use lookahead for the pre-reload schedule to save compile time.  */
  return 0;
}

/* Return true if the target platform supports macro-fusion.  */

bool
ix86_macro_fusion_p ()
{
  return TARGET_FUSE_CMP_AND_BRANCH;
}

/* Check whether the current microarchitecture supports macro fusion
   for the insn pair "CONDGEN + CONDJMP".  Refer to the
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  rtx src, dest;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
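  /* Macro fusion requires CONDJMP to read the condition-code register
     that CONDGEN writes.  */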
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
	 a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  else if (condgen_type != TYPE_TEST
	   && condgen_type != TYPE_ICMP
	   && condgen_type != TYPE_INCDEC
	   && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
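      /* CONDGEN is a PARALLEL; pick out the flags-setting COMPARE and the
	 accompanying ALU set.  */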
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if ((MEM_P (XEXP (src, 0)) && CONST_INT_P (XEXP (src, 1)))
      || (MEM_P (XEXP (src, 1)) && CONST_INT_P (XEXP (src, 0))))
    return false;

  /* No fusion for RIP-relative address.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);

  if (addr)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }

 handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether the conditional jump uses the Sign or Overflow flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* What follows handles macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || !alu_set)
    return false;

  /* No fusion for alu op with memory destination operand.  */
  dest = SET_DEST (alu_set);
  if (MEM_P (dest))
    return false;

  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported.  */
  if (condgen_type == TYPE_INCDEC
      && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU))
    return false;

  return true;
}
