1;; Copyright (C) 2016-2020 Free Software Foundation, Inc.
2
3;; This file is free software; you can redistribute it and/or modify it under
4;; the terms of the GNU General Public License as published by the Free
5;; Software Foundation; either version 3 of the License, or (at your option)
6;; any later version.
7
8;; This file is distributed in the hope that it will be useful, but WITHOUT
9;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10;; FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
11;; for more details.
12
13;; You should have received a copy of the GNU General Public License
14;; along with GCC; see the file COPYING3.  If not see
15;; <http://www.gnu.org/licenses/>.
16
17;;- See file "rtl.def" for documentation on define_insn, match_*, et. al.
18
19(include "predicates.md")
20(include "constraints.md")
21
22;; {{{ Constants and enums
23
24; Named registers
; Hard-register numbers for the GCN register file.  SGPRs occupy 0-101,
; the architectural special registers (flat_scratch, xnack, vcc, tba, tma,
; ttmp, m0, exec, and the status bits vccz/execz/scc) follow, and VGPRs
; start at 160.  CC_SAVE_REG (s22) is a scratch SGPR that the long-branch
; sequences in "jump"/"cjump" below use to preserve SCC.
(define_constants
  [(FIRST_SGPR_REG		 0)
   (CC_SAVE_REG			 22)
   (LAST_SGPR_REG		 101)
   (FLAT_SCRATCH_REG		 102)
   (FLAT_SCRATCH_LO_REG		 102)
   (FLAT_SCRATCH_HI_REG		 103)
   (XNACK_MASK_REG		 104)
   (XNACK_MASK_LO_REG		 104)
   (XNACK_MASK_HI_REG		 105)
   (VCC_REG			 106)
   (VCC_LO_REG			 106)
   (VCC_HI_REG			 107)
   (VCCZ_REG			 108)
   (TBA_REG			 109)
   (TBA_LO_REG			 109)
   (TBA_HI_REG			 110)
   (TMA_REG			 111)
   (TMA_LO_REG			 111)
   (TMA_HI_REG			 112)
   (TTMP0_REG			 113)
   (TTMP11_REG			 124)
   (M0_REG			 125)
   (EXEC_REG			 126)
   (EXEC_LO_REG			 126)
   (EXEC_HI_REG			 127)
   (EXECZ_REG			 128)
   (SCC_REG			 129)
   (FIRST_VGPR_REG		 160)
   (LAST_VGPR_REG		 415)])
55
; ABI register assignments.  The stack pointer and link register live in
; real SGPRs (s16, s18 -- the call patterns below hard-code s[18:19]).
; AP_REGNUM and FP_REGNUM lie beyond LAST_VGPR_REG (415), so they are
; presumably software pseudo-registers that can never be allocated as
; hardware registers -- confirm against the target's register macros.
(define_constants
  [(SP_REGNUM 16)
   (LR_REGNUM 18)
   (AP_REGNUM 416)
   (FP_REGNUM 418)])
61
; Volatile unspec codes: prologue-use marker, kernel return, workgroup
; barrier, atomic operations, and instruction-cache invalidation.
(define_c_enum "unspecv" [
  UNSPECV_PROLOGUE_USE
  UNSPECV_KERNEL_RETURN
  UNSPECV_BARRIER
  UNSPECV_ATOMIC
  UNSPECV_ICACHE_INV])
68
; Non-volatile unspec codes.  The *_DPP_SHR codes presumably represent
; reduction steps implemented with DPP row-shift operations (see the
; vop_dpp notes below) -- confirm against the patterns that use them.
(define_c_enum "unspec" [
  UNSPEC_VECTOR
  UNSPEC_BPERMUTE
  UNSPEC_SGPRBASE
  UNSPEC_MEMORY_BARRIER
  UNSPEC_SMIN_DPP_SHR UNSPEC_SMAX_DPP_SHR
  UNSPEC_UMIN_DPP_SHR UNSPEC_UMAX_DPP_SHR
  UNSPEC_PLUS_DPP_SHR
  UNSPEC_PLUS_CARRY_DPP_SHR UNSPEC_PLUS_CARRY_IN_DPP_SHR
  UNSPEC_AND_DPP_SHR UNSPEC_IOR_DPP_SHR UNSPEC_XOR_DPP_SHR
  UNSPEC_MOV_DPP_SHR
  UNSPEC_MOV_FROM_LANE63
  UNSPEC_GATHER
  UNSPEC_SCATTER])
83
84;; }}}
85;; {{{ Attributes
86
87; Instruction type (encoding) as described in the ISA specification.
88; The following table summarizes possible operands of individual instruction
89; types and corresponding constraints.
90;
91; sop2 - scalar, two inputs, one output
92;	 ssrc0/ssrc1: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
;		      vccz,execz,scc,inline immediate,fp inline immediate
94;	 sdst: sgpr 0-102; flat_scratch,xnack,vcc,tba,tma,ttmp0-11,exec
95;
96;	 Constraints "=SD, SD", "SSA,SSB","SSB,SSA"
97;
98; sopk - scalar, inline constant input, one output
99;	 simm16: 16bit inline constant
100;	 sdst: same as sop2/ssrc0
101;
102;	 Constraints "=SD", "J"
103;
104; sop1 - scalar, one input, one output
;	 ssrc0: same as sop2/ssrc0.  FIXME: manual omits VCCZ
106;	 sdst: same as sop2/sdst
107;
108;	 Constraints "=SD", "SSA"
109;
; sopc - scalar, two inputs, one comparison
;	 ssrc0: same as sop2/ssrc0.
112;
113;	 Constraints "SSI,SSA","SSA,SSI"
114;
115; sopp - scalar, one constant input, one special
116;	 simm16
117;
118; smem - scalar memory
119;	 sbase: aligned pair of sgprs.  Specify {size[15:0], base[47:0]} in
120;               dwords
121;	 sdata: sgpr0-102, flat_scratch, xnack, vcc, tba, tma
122;	 offset: sgpr or 20bit unsigned byte offset
123;
124; vop2 - vector, two inputs, one output
125;	 vsrc0: sgpr0-102,flat_scratch,xnack,vcc,tba,ttmp0-11,m0,exec,
126;		inline constant -16 to -64, fp inline immediate, vccz, execz,
127;		scc, lds, literal constant, vgpr0-255
128;	 vsrc1: vgpr0-255
129;	 vdst: vgpr0-255
130;	 Limitations: At most one SGPR, at most one constant
131;		      if constant is used, SGPR must be M0
132;		      Only SRC0 can be LDS_DIRECT
133;
134;	 constraints: "=v", "vBSv", "v"
135;
136; vop1 - vector, one input, one output
137;	 vsrc0: same as vop2/src0
138;	 vdst: vgpr0-255
139;
140;	 constraints: "=v", "vBSv"
141;
; vopc - vector, two inputs, one comparison output;
143;	 vsrc0: same as vop2/src0
144;	 vsrc1: vgpr0-255
145;	 vdst:
146;
147;	 constraints: "vASv", "v"
148;
149; vop3a - vector, three inputs, one output
150;	 vdst: vgpr0-255, for v_cmp sgpr or vcc
151;	 abs,clamp
152;	 vsrc0: sgpr0-102,vcc,tba,ttmp0-11,m0,exec,
153;		inline constant -16 to -64, fp inline immediate, vccz, execz,
154;		scc, lds_direct
155;		FIXME: really missing 1/pi? really 104 SGPRs
156;
157; vop3b - vector, three inputs, one vector output, one scalar output
158;	 vsrc0,vsrc1,vsrc2: same as vop3a vsrc0
159;	 vdst: vgpr0-255
160;	 sdst: sgpr0-103/vcc/tba/tma/ttmp0-11
161;
162; vop_sdwa - second dword for vop1/vop2/vopc for specifying sub-dword address
163;	 src0: vgpr0-255
164;	 dst_sel: BYTE_0-3, WORD_0-1, DWORD
165;	 dst_unused: UNUSED_PAD, UNUSED_SEXT, UNUSED_PRESERVE
166;	 clamp: true/false
167;	 src0_sel: BYTE_0-3, WORD_0-1, DWORD
168;	 flags: src0_sext, src0_neg, src0_abs, src1_sel, src1_sext, src1_neg,
;		src1_abs
170;
171; vop_dpp - second dword for vop1/vop2/vopc for specifying data-parallel ops
172;	 src0: vgpr0-255
173;	 dpp_ctrl: quad_perm, row_sl0-15, row_sr0-15, row_rr0-15, wf_sl1,
174;		  wf_rl1, wf_sr1, wf_rr1, row_mirror, row_half_mirror,
175;		  bcast15, bcast31
176;	 flags: src0_neg, src0_abs, src1_neg, src1_abs
177;	 bank_mask: 4-bit mask
178;	 row_mask: 4-bit mask
179;
180; ds - Local and global data share instructions.
181;	 offset0: 8-bit constant
182;	 offset1: 8-bit constant
183;	 flag: gds
184;	 addr: vgpr0-255
185;	 data0: vgpr0-255
186;	 data1: vgpr0-255
187;	 vdst: vgpr0-255
188;
189; mubuf - Untyped memory buffer operation. First word with LDS, second word
190;	  non-LDS.
191;	 offset: 12-bit constant
192;	 vaddr: vgpr0-255
193;	 vdata: vgpr0-255
194;	 srsrc: sgpr0-102
195;	 soffset: sgpr0-102
196;	 flags: offen, idxen, glc, lds, slc, tfe
197;
198; mtbuf - Typed memory buffer operation. Two words
199;	 offset: 12-bit constant
200;	 dfmt: 4-bit constant
201;	 nfmt: 3-bit constant
202;	 vaddr: vgpr0-255
203;	 vdata: vgpr0-255
204;	 srsrc: sgpr0-102
205;	 soffset: sgpr0-102
206;	 flags: offen, idxen, glc, lds, slc, tfe
207;
208; flat - flat or global memory operations
209;	 flags: glc, slc
210;	 addr: vgpr0-255
211;	 data: vgpr0-255
212;	 vdst: vgpr0-255
213;
214; mult - expands to multiple instructions (pseudo encoding)
215;
216; vmult - as mult, when a vector instruction is used.
217
; Encoding class of each insn; the big comment above describes the operand
; kinds and constraints corresponding to each value.
(define_attr "type"
	     "unknown,sop1,sop2,sopk,sopc,sopp,smem,ds,vop2,vop1,vopc,
	      vop3a,vop3b,vop_sdwa,vop_dpp,mubuf,mtbuf,flat,mult,vmult"
	     (const_string "unknown"))
222
223; Set if instruction is executed in scalar or vector unit
224
; Derive the executing unit from the encoding class: s* encodings and
; scalar multi-insn sequences run on the scalar unit, v*/ds/flat and
; vector multi-insn sequences on the vector unit.
(define_attr "unit" "unknown,scalar,vector"
  (cond [(eq_attr "type" "sop1,sop2,sopk,sopc,sopp,smem,mult")
	    (const_string "scalar")
	 (eq_attr "type" "vop2,vop1,vopc,vop3a,vop3b,ds,
			  vop_sdwa,vop_dpp,flat,vmult")
	    (const_string "vector")]
	 (const_string "unknown")))
232
233; All vector instructions run as 64 threads as predicated by the EXEC
234; register.  Scalar operations in vector register require a single lane
235; enabled, vector moves require a full set of lanes enabled, and most vector
236; operations handle the lane masking themselves.
237; The md_reorg pass is responsible for ensuring that EXEC is set appropriately
238; according to the following settings:
239;   auto   - md_reorg will inspect def/use to determine what to do.
240;   none   - exec is not needed.
241;   single - disable all but lane zero.
242;   full   - enable all lanes.
243
; EXEC-mask requirement consumed by md_reorg; see the comment above for
; the meaning of each value.  "auto" lets md_reorg decide per insn.
(define_attr "exec" "auto,none,single,full"
   (const_string "auto"))
246
247; Infer the (worst-case) length from the instruction type by default.  Many
248; types can have an optional immediate word following, which we include here.
249; "Multiple" types are counted as two 64-bit instructions.  This is just a
250; default fallback: it can be overridden per-alternative in insn patterns for
251; greater accuracy.
252
; Worst-case byte length per encoding class: 4-byte base encoding plus a
; possible immediate dword (hence 8 for most), 16 for two-insn "mult"
; sequences, 4 for anything else (e.g. sopp).
(define_attr "length" ""
  (cond [(eq_attr "type" "sop1") (const_int 8)
	 (eq_attr "type" "sop2") (const_int 8)
	 (eq_attr "type" "sopk") (const_int 8)
	 (eq_attr "type" "sopc") (const_int 8)
	 (eq_attr "type" "sopp") (const_int 4)
	 (eq_attr "type" "smem") (const_int 8)
	 (eq_attr "type" "ds")   (const_int 8)
	 (eq_attr "type" "vop1") (const_int 8)
	 (eq_attr "type" "vop2") (const_int 8)
	 (eq_attr "type" "vopc") (const_int 8)
	 (eq_attr "type" "vop3a") (const_int 8)
	 (eq_attr "type" "vop3b") (const_int 8)
	 (eq_attr "type" "vop_sdwa") (const_int 8)
	 (eq_attr "type" "vop_dpp") (const_int 8)
	 (eq_attr "type" "flat") (const_int 8)
	 (eq_attr "type" "mult") (const_int 16)
	 (eq_attr "type" "vmult") (const_int 16)]
	(const_int 4)))
272
273; Disable alternatives that only apply to specific ISA variants.
274
275(define_attr "gcn_version" "gcn3,gcn5" (const_string "gcn3"))
276
; Standard "enabled" attribute: gcn3 alternatives are always usable;
; gcn5 alternatives only when compiling for TARGET_GCN5_PLUS.
(define_attr "enabled" ""
  (cond [(eq_attr "gcn_version" "gcn3") (const_int 1)
	 (and (eq_attr "gcn_version" "gcn5")
	      (ne (symbol_ref "TARGET_GCN5_PLUS") (const_int 0)))
	   (const_int 1)]
	(const_int 0)))
283
284; We need to be able to identify v_readlane and v_writelane with
285; SGPR lane selection in order to handle "Manually Inserted Wait States".
286
287(define_attr "laneselect" "yes,no" (const_string "no"))
288
289; Identify instructions that require a "Manually Inserted Wait State" if
290; their inputs are overwritten by subsequent instructions.
291
292(define_attr "delayeduse" "yes,no" (const_string "no"))
293
294;; }}}
;; {{{ Iterators useful across the whole machine description
296
; Common scalar mode groupings: 32/64-bit integer, 32/64-bit float,
; mixed int/float pairs, sub-word integers, and the FP modes (FP_1REG
; being those that fit a single 32-bit register).
(define_mode_iterator SIDI [SI DI])
(define_mode_iterator SFDF [SF DF])
(define_mode_iterator SISF [SI SF])
(define_mode_iterator QIHI [QI HI])
(define_mode_iterator DIDF [DI DF])
(define_mode_iterator FP [HF SF DF])
(define_mode_iterator FP_1REG [HF SF])
304
305;; }}}
306;; {{{ Attributes.
307
308; Translate RTX code into GCN instruction mnemonics with and without
309; suffixes such as _b32, etc.
310
; RTX code -> instruction mnemonic.  The %i/%b/%B/%u sequences are
; presumably print-operand punctuation expanded to size/signedness
; suffixes (_i32, _b32, ...) by the target's output code -- confirm
; against gcn_print_operand.
(define_code_attr mnemonic
  [(minus "sub%i")
   (plus "add%i")
   (ashift "lshl%b")
   (lshiftrt "lshr%b")
   (ashiftrt "ashr%i")
   (and "and%B")
   (ior "or%B")
   (xor "xor%B")
   (mult "mul%i")
   (smin "min%i")
   (smax "max%i")
   (umin "min%u")
   (umax "max%u")
   (not "not%B")
   (popcount "bcnt_u32%b")])
327
; RTX code -> mnemonic stem with no size/type suffix.
(define_code_attr bare_mnemonic
  [(plus "add")
   (minus "sub")
   (and "and")
   (ior "or")
   (xor "xor")])
334
; RTX code -> scalar-unit mnemonic, for codes whose scalar spelling
; differs from the vector one in "mnemonic" above.
(define_code_attr s_mnemonic
  [(not "not%b")
   (popcount "bcnt1_i32%b")
   (clz "flbit_i32%b")
   (ctz "ff1_i32%b")])
340
; RTX code -> reversed-operand mnemonic (subrev/lshlrev/...), for the
; non-commutative codes where operands may be emitted swapped.
(define_code_attr revmnemonic
  [(minus "subrev%i")
   (ashift "lshlrev%b")
   (lshiftrt "lshrrev%b")
   (ashiftrt "ashrrev%i")])
346
347; Translate RTX code into corresponding expander name.
348
; RTX code -> standard expander name stem (e.g. plus -> "add" for
; "add<mode>3"), used to build pattern names from code iterators.
(define_code_attr expander
  [(and "and")
   (ior "ior")
   (xor "xor")
   (plus "add")
   (minus "sub")
   (ashift "ashl")
   (lshiftrt "lshr")
   (ashiftrt "ashr")
   (mult "mul")
   (smin "smin")
   (smax "smax")
   (umin "umin")
   (umax "umax")
   (not "one_cmpl")
   (popcount "popcount")
   (clz "clz")
   (ctz "ctz")
   (sign_extend "extend")
   (zero_extend "zero_extend")])
369
370;; }}}
371;; {{{ Miscellaneous instructions
372
; Standard no-op; s_nop's immediate gives extra wait cycles, 0x0 = none.
(define_insn "nop"
  [(const_int 0)]
  ""
  "s_nop\t0x0"
  [(set_attr "type" "sopp")])
378
379; FIXME: What should the value of the immediate be? Zero is disallowed, so
380; pick 1 for now.
; Standard unconditional trap pattern.
(define_insn "trap"
  [(trap_if (const_int 1) (const_int 0))]
  ""
  "s_trap\t1"
  [(set_attr "type" "sopp")])
386
387;; }}}
388;; {{{ Moves
389
390;; All scalar modes we support moves in.
391(define_mode_iterator MOV_MODE [BI QI HI SI DI TI SF DF])
392
393; This is the entry point for creating all kinds of scalar moves,
394; including reloads and symbols.
395
; Expander for all scalar moves.  Responsibilities, in order:
; (1) rewrite (subreg:SI (reg:BI)) in either operand so BImode registers
;     are always accessed whole (see the comment below about LRA);
; (2) force the source into a register for stores;
; (3) route moves that hardware cannot do directly through a fresh pseudo;
; (4) dispatch DImode symbol/label loads to the movdi_symbol patterns,
;     using the SCC-saving variant while LRA is in progress.
(define_expand "mov<mode>"
  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
	(match_operand:MOV_MODE 1 "general_operand"))]
  ""
  {
    if (SUBREG_P (operands[1])
	&& GET_MODE (operands[1]) == SImode
	&& GET_MODE (SUBREG_REG (operands[1])) == BImode)
    {
      /* (reg:BI VCC) has nregs==2 to ensure it gets clobbered as a whole,
	 but (subreg:SI (reg:BI VCC)) doesn't, which causes the LRA liveness
	 checks to assert.  Transform this:
	   (set (reg:SI) (subreg:SI (reg:BI)))
	 to this:
	   (set (subreg:BI (reg:SI)) (reg:BI))  */
      operands[0] = gen_rtx_SUBREG (BImode, operands[0], 0);
      operands[1] = SUBREG_REG (operands[1]);
    }
    if (SUBREG_P (operands[0])
	&& GET_MODE (operands[0]) == SImode
	&& GET_MODE (SUBREG_REG (operands[0])) == BImode)
      {
	/* Likewise, transform this:
	     (set (subreg:SI (reg:BI)) (reg:SI))
	   to this:
	     (set (reg:BI) (subreg:BI (reg:SI))) */
	operands[0] = SUBREG_REG (operands[0]);
	operands[1] = gen_rtx_SUBREG (BImode, operands[1], 0);
      }

    /* No mem-to-mem moves; load the source first.  */
    if (MEM_P (operands[0]))
      operands[1] = force_reg (<MODE>mode, operands[1]);

    if (!lra_in_progress && !reload_completed
	&& !gcn_valid_move_p (<MODE>mode, operands[0], operands[1]))
      {
	/* Something is probably trying to generate a move
	   which can only work indirectly.
	   E.g. Move from LDS memory to SGPR hardreg
	     or MEM:QI to SGPR.  */
	rtx tmpreg = gen_reg_rtx (<MODE>mode);
	emit_insn (gen_mov<mode> (tmpreg, operands[1]));
	emit_insn (gen_mov<mode> (operands[0], tmpreg));
	DONE;
      }

    if (<MODE>mode == DImode
	&& (GET_CODE (operands[1]) == SYMBOL_REF
	    || GET_CODE (operands[1]) == LABEL_REF))
      {
	/* Symbol loads clobber SCC; during LRA that must be saved.  */
	if (lra_in_progress)
	  emit_insn (gen_movdi_symbol_save_scc (operands[0], operands[1]));
	else
	  emit_insn (gen_movdi_symbol (operands[0], operands[1]));
	DONE;
      }
  })
453
454; Split invalid moves into two valid moves
455
; Split invalid moves into two valid moves via a fresh intermediate
; register.  This mirrors the indirect-move fallback in the mov<mode>
; expander above, and only runs before register allocation (new pseudos
; are still permitted then).
(define_split
  [(set (match_operand:MOV_MODE 0 "nonimmediate_operand")
	(match_operand:MOV_MODE 1 "general_operand"))]
  "!reload_completed && !lra_in_progress
   && !gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
  [(set (match_dup 2) (match_dup 1))
   (set (match_dup 0) (match_dup 2))]
  {
    operands[2] = gen_reg_rtx (<MODE>mode);
  })
466
467; We need BImode move so we can reload flags registers.
468
; BImode moves, needed so the flag registers (SCC, VCC) can be reloaded.
; Several alternatives hand-assemble the instruction with .byte/.long
; because the assembler rejects SCC as an operand; the magic bytes
; presumably encode s_mov_b32/v_mov_b32 with SCC (0xfd) as the source --
; confirm against the GCN3 ISA encoding tables before changing them.
(define_insn "*movbi"
  [(set (match_operand:BI 0 "nonimmediate_operand"
				    "=Sg,   v,Sg,cs,cV,cV,Sm,RS, v,RF, v,RM")
	(match_operand:BI 1 "gcn_load_operand"
				    "SSA,vSvA, v,SS, v,SS,RS,Sm,RF, v,RM, v"))]
  ""
  {
    /* SCC as an operand is currently not accepted by the LLVM assembler, so
       we emit bytes directly as a workaround.  */
    switch (which_alternative) {
    case 0:
      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
	return "; s_mov_b32\t%0,%1 is not supported by the assembler.\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x0\;"
	       ".byte\t0x80|%R0\;"
	       ".byte\t0xbe";
      else
	return "s_mov_b32\t%0, %1";
    case 1:
      if (REG_P (operands[1]) && REGNO (operands[1]) == SCC_REG)
	return "; v_mov_b32\t%0, %1\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x2\;"
	       ".byte\t((%V0<<1)&0xff)\;"
	       ".byte\t0x7e|(%V0>>7)";
      else
	return "v_mov_b32\t%0, %1";
    case 2:
      /* VGPR -> SGPR goes through lane 0.  */
      return "v_readlane_b32\t%0, %1, 0";
    case 3:
      /* Materialize SCC from an SGPR by comparing against zero.  */
      return "s_cmpk_lg_u32\t%1, 0";
    case 4:
      /* Materialize VCC from a VGPR by comparing against zero.  */
      return "v_cmp_ne_u32\tvcc, 0, %1";
    case 5:
      if (REGNO (operands[1]) == SCC_REG)
	return "; s_mov_b32\t%0, %1 is not supported by the assembler.\;"
	       ".byte\t0xfd\;"
	       ".byte\t0x0\;"
	       ".byte\t0xea\;"
	       ".byte\t0xbe\;"
	       "s_mov_b32\tvcc_hi, 0";
      else
	return "s_mov_b32\tvcc_lo, %1\;"
	       "s_mov_b32\tvcc_hi, 0";
    case 6:
      return "s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)";
    case 7:
      return "s_store_dword\t%1, %A0";
    case 8:
      return "flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0";
    case 9:
      return "flat_store_dword\t%A0, %1%O0%g0";
    case 10:
      return "global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)";
    case 11:
      return "global_store_dword\t%A0, %1%O0%g0";
    default:
      gcc_unreachable ();
    }
  }
  [(set_attr "type" "sop1,vop1,vop3a,sopk,vopc,mult,smem,smem,flat,flat,
		     flat,flat")
   (set_attr "exec" "*,*,none,*,*,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,4,4,4,8,12,12,12,12,12,12")])
534
535; 32bit move pattern
536
; 32-bit (SI/SF) moves: scalar/vector register moves and constants,
; buffer/scalar/flat/global/LDS loads and stores, plus the
; v_readlane/v_writelane alternatives for VGPR<->SGPR transfers (which
; run with exec "none" since lane selection is explicit).
(define_insn "*mov<mode>_insn"
  [(set (match_operand:SISF 0 "nonimmediate_operand"
		  "=SD,SD,SD,SD,RB,Sm,RS,v,Sg, v, v,RF,v,RLRG,   v,SD, v,RM")
	(match_operand:SISF 1 "gcn_load_operand"
		  "SSA, J, B,RB,Sm,RS,Sm,v, v,Sv,RF, v,B,   v,RLRG, Y,RM, v"))]
  ""
  "@
  s_mov_b32\t%0, %1
  s_movk_i32\t%0, %1
  s_mov_b32\t%0, %1
  s_buffer_load%s0\t%0, s[0:3], %1\;s_waitcnt\tlgkmcnt(0)
  s_buffer_store%s1\t%1, s[0:3], %0
  s_load_dword\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  s_store_dword\t%1, %A0
  v_mov_b32\t%0, %1
  v_readlane_b32\t%0, %1, 0
  v_writelane_b32\t%0, %1, 0
  flat_load_dword\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store_dword\t%A0, %1%O0%g0
  v_mov_b32\t%0, %1
  ds_write_b32\t%A0, %1%O0
  ds_read_b32\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  s_mov_b32\t%0, %1
  global_load_dword\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store_dword\t%A0, %1%O0%g0"
  [(set_attr "type" "sop1,sopk,sop1,smem,smem,smem,smem,vop1,vop3a,vop3a,flat,
		     flat,vop1,ds,ds,sop1,flat,flat")
   (set_attr "exec" "*,*,*,*,*,*,*,*,none,none,*,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,8,12,12,12,12,4,8,8,12,12,8,12,12,8,12,12")])
566
567; 8/16bit move pattern
568
; 8/16-bit (QI/HI) moves.  Register-to-register moves still use the
; 32-bit mov/readlane/writelane instructions; only the memory accesses
; use sub-word load/store forms (%o1/%s0/%b0/%u1 presumably select the
; width suffix -- confirm against gcn_print_operand).  Unlike the other
; move patterns, the insn condition re-checks gcn_valid_move_p.
(define_insn "*mov<mode>_insn"
  [(set (match_operand:QIHI 0 "nonimmediate_operand"
				 "=SD,SD,SD,v,Sg, v, v,RF,v,RLRG,   v, v,RM")
	(match_operand:QIHI 1 "gcn_load_operand"
				 "SSA, J, B,v, v,Sv,RF, v,B,   v,RLRG,RM, v"))]
  "gcn_valid_move_p (<MODE>mode, operands[0], operands[1])"
  "@
  s_mov_b32\t%0, %1
  s_movk_i32\t%0, %1
  s_mov_b32\t%0, %1
  v_mov_b32\t%0, %1
  v_readlane_b32\t%0, %1, 0
  v_writelane_b32\t%0, %1, 0
  flat_load%o1\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store%s0\t%A0, %1%O0%g0
  v_mov_b32\t%0, %1
  ds_write%b0\t%A0, %1%O0
  ds_read%u1\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  global_load%o1\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store%s0\t%A0, %1%O0%g0"
  [(set_attr "type"
	     "sop1,sopk,sop1,vop1,vop3a,vop3a,flat,flat,vop1,ds,ds,flat,flat")
   (set_attr "exec" "*,*,*,*,none,none,*,*,*,*,*,*,*")
   (set_attr "length" "4,4,8,4,4,4,12,12,8,12,12,12,12")])
593
594; 64bit move pattern
595
; 64-bit (DI/DF) moves.  Symbols are excluded (the mov<mode> expander
; routes them to movdi_symbol*).  The "#" alternatives have no single
; hardware instruction and are split after reload into two SImode moves;
; the split orders the halves so an overlapping destination never
; clobbers a source half before it is read.
(define_insn_and_split "*mov<mode>_insn"
  [(set (match_operand:DIDF 0 "nonimmediate_operand"
			  "=SD,SD,SD,RS,Sm,v, v,Sg, v, v,RF,RLRG,   v, v,RM")
	(match_operand:DIDF 1 "general_operand"
			  "SSA, C,DB,Sm,RS,v,DB, v,Sv,RF, v,   v,RLRG,RM, v"))]
  "GET_CODE(operands[1]) != SYMBOL_REF"
  "@
  s_mov_b64\t%0, %1
  s_mov_b64\t%0, %1
  #
  s_store_dwordx2\t%1, %A0
  s_load_dwordx2\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  #
  #
  #
  #
  flat_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\t0
  flat_store_dwordx2\t%A0, %1%O0%g0
  ds_write_b64\t%A0, %1%O0
  ds_read_b64\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)
  global_load_dwordx2\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  global_store_dwordx2\t%A0, %1%O0%g0"
  "reload_completed
   && ((!MEM_P (operands[0]) && !MEM_P (operands[1])
        && !gcn_sgpr_move_p (operands[0], operands[1]))
       || (GET_CODE (operands[1]) == CONST_INT
	   && !gcn_constant64_p (operands[1])))"
  [(set (match_dup 0) (match_dup 1))
   (set (match_dup 2) (match_dup 3))]
  {
    rtx inlo = gen_lowpart (SImode, operands[1]);
    rtx inhi = gen_highpart_mode (SImode, <MODE>mode, operands[1]);
    rtx outlo = gen_lowpart (SImode, operands[0]);
    rtx outhi = gen_highpart_mode (SImode, <MODE>mode, operands[0]);

    /* Ensure that overlapping registers aren't corrupted.  */
    if (reg_overlap_mentioned_p (outlo, inhi))
      {
	operands[0] = outhi;
	operands[1] = inhi;
	operands[2] = outlo;
	operands[3] = inlo;
      }
    else
      {
	operands[0] = outlo;
	operands[1] = inlo;
	operands[2] = outhi;
	operands[3] = inhi;
      }
  }
  [(set_attr "type" "sop1,sop1,mult,smem,smem,vmult,vmult,vmult,vmult,flat,
		     flat,ds,ds,flat,flat")
   (set_attr "length" "4,8,*,12,12,*,*,*,*,12,12,12,12,12,12")])
650
651; 128-bit move.
652
; 128-bit (TI) moves: dwordx4 memory forms plus "#" register/constant
; alternatives that are split after reload into four SImode part-moves.
; The assert requires that source and destination either coincide or do
; not overlap at all, so the fixed part order is always safe.  The
; delayeduse markings cover the dwordx4 load alternatives.
(define_insn_and_split "*movti_insn"
  [(set (match_operand:TI 0 "nonimmediate_operand"
				      "=SD,RS,Sm,RF, v,v, v,SD,RM, v,RL, v")
	(match_operand:TI 1 "general_operand"  
				      "SSB,Sm,RS, v,RF,v,Sv, v, v,RM, v,RL"))]
  ""
  "@
  #
  s_store_dwordx4\t%1, %A0
  s_load_dwordx4\t%0, %A1\;s_waitcnt\tlgkmcnt(0)
  flat_store_dwordx4\t%A0, %1%O0%g0
  flat_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\t0
  #
  #
  #
  global_store_dwordx4\t%A0, %1%O0%g0
  global_load_dwordx4\t%0, %A1%O1%g1\;s_waitcnt\tvmcnt(0)
  ds_write_b128\t%A0, %1%O0
  ds_read_b128\t%0, %A1%O1\;s_waitcnt\tlgkmcnt(0)"
  "reload_completed
   && REG_P (operands[0])
   && (REG_P (operands[1]) || GET_CODE (operands[1]) == CONST_INT)"
  [(set (match_dup 0) (match_dup 1))
   (set (match_dup 2) (match_dup 3))
   (set (match_dup 4) (match_dup 5))
   (set (match_dup 6) (match_dup 7))]
  {
    gcc_assert (rtx_equal_p (operands[0], operands[1])
		|| !reg_overlap_mentioned_p (operands[0], operands[1]));
    /* Fill the part operands highest-first: operands[0]/[1] must be
       overwritten last since the loop reads them as the TImode whole.  */
    operands[6] = gcn_operand_part (TImode, operands[0], 3);
    operands[7] = gcn_operand_part (TImode, operands[1], 3);
    operands[4] = gcn_operand_part (TImode, operands[0], 2);
    operands[5] = gcn_operand_part (TImode, operands[1], 2);
    operands[2] = gcn_operand_part (TImode, operands[0], 1);
    operands[3] = gcn_operand_part (TImode, operands[1], 1);
    operands[0] = gcn_operand_part (TImode, operands[0], 0);
    operands[1] = gcn_operand_part (TImode, operands[1], 0);
  }
  [(set_attr "type" "mult,smem,smem,flat,flat,vmult,vmult,vmult,flat,flat,\
		     ds,ds")
   (set_attr "delayeduse" "*,*,yes,*,*,*,*,*,yes,*,*,*")
   (set_attr "length" "*,12,12,12,12,*,*,*,12,12,12,12")])
695
696;; }}}
697;; {{{ Prologue/Epilogue
698
; Zero-length marker insn keeping its operand live through the prologue
; so the optimizers cannot delete the setup code that produced it.
(define_insn "prologue_use"
  [(unspec_volatile [(match_operand 0)] UNSPECV_PROLOGUE_USE)]
  ""
  ""
  [(set_attr "length" "0")])
704
; Standard prologue expander; all work is done in the C backend.
(define_expand "prologue"
  [(const_int 0)]
  ""
  {
    gcn_expand_prologue ();
    DONE;
  })
712
; Standard epilogue expander; all work is done in the C backend.
(define_expand "epilogue"
  [(const_int 0)]
  ""
  {
    gcn_expand_epilogue ();
    DONE;
  })
720
721;; }}}
722;; {{{ Control flow
723
724; This pattern must satisfy simplejump_p, which means it cannot be a parallel
725; that clobbers SCC.  Thus, we must preserve SCC if we're generating a long
726; branch sequence.
727
; Unconditional jump: a 4-byte s_branch when the target is within the
; 16-bit signed word offset (+-128K bytes), otherwise a 32-byte indirect
; sequence that computes the target PC-relative in s[20:21].  Since this
; pattern may not clobber SCC (see the comment above), the long form
; stashes SCC in s22 first and restores it with s_cmpk before jumping.
(define_insn "jump"
  [(set (pc)
	(label_ref (match_operand 0)))]
  ""
  {
    if (get_attr_length (insn) == 4)
      return "s_branch\t%0";
    else
      /* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG.  */
      return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	     ".long\t0xbe9600fd\;"
	     "s_getpc_b64\ts[20:21]\;"
	     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
	     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
	     "s_cmpk_lg_u32\ts22, 0\;"
	     "s_setpc_b64\ts[20:21]";
  }
  [(set_attr "type" "sopp")
   (set (attr "length")
	(if_then_else (and (ge (minus (match_dup 0) (pc))
			       (const_int -131072))
			   (lt (minus (match_dup 0) (pc))
			       (const_int 131072)))
		      (const_int 4)
		      (const_int 32)))])
753
; Jump to an address held in an SGPR pair.
(define_insn "indirect_jump"
  [(set (pc)
	(match_operand:DI 0 "register_operand" "Sg"))]
  ""
  "s_setpc_b64\t%0"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
761
; Conditional branch on SCC ("ca") or VCC ("cV"): a 4-byte s_cbranch
; when in range, otherwise an inverted skip (%c1) around a long
; PC-relative jump through s[20:21].  When the condition register is
; SCC itself, SCC is re-materialized after the address arithmetic with
; an s_cmp of known outcome (lg 0,0 = false for EQ, eq 0,0 = true
; otherwise); for VCC conditions, SCC is saved in s22 and restored.
(define_insn "cjump"
  [(set (pc)
	(if_then_else
	  (match_operator:BI 1 "gcn_conditional_operator"
	    [(match_operand:BI 2 "gcn_conditional_register_operand" "ca,cV")
	     (const_int 0)])
	  (label_ref (match_operand 0))
	  (pc)))]
  ""
  {
    if (get_attr_length (insn) == 4)
      return "s_cbranch%C1\t%0";
    else
      {
	/* !!! This sequence clobbers EXEC_SAVE_REG and CC_SAVE_REG but
	       restores SCC.  */
	if (REGNO (operands[2]) == SCC_REG)
	  {
	    if (GET_CODE (operands[1]) == EQ)
	      return "s_cbranch%c1\t.Lskip%=\;"
		     "s_getpc_b64\ts[20:21]\;"
		     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		     "s_cmp_lg_u32\t0, 0\;"
		     "s_setpc_b64\ts[20:21]\n"
		     ".Lskip%=:";
	    else
	      return "s_cbranch%c1\t.Lskip%=\;"
		     "s_getpc_b64\ts[20:21]\;"
		     "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		     "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		     "s_cmp_eq_u32\t0, 0\;"
		     "s_setpc_b64\ts[20:21]\n"
		     ".Lskip%=:";
	  }
	else
	  return "s_cbranch%c1\t.Lskip%=\;"
		 "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
		 ".byte\t0xfd\;"
		 ".byte\t0x0\;"
		 ".byte\t0x80|22\;"
		 ".byte\t0xbe\;"
		 "s_getpc_b64\ts[20:21]\;"
		 "s_add_u32\ts20, s20, %0@rel32@lo+4\;"
		 "s_addc_u32\ts21, s21, %0@rel32@hi+4\;"
		 "s_cmpk_lg_u32\ts22, 0\;"
		 "s_setpc_b64\ts[20:21]\n"
		 ".Lskip%=:";
      }
  }
  [(set_attr "type" "sopp")
   (set (attr "length")
	(if_then_else (and (ge (minus (match_dup 0) (pc))
			       (const_int -131072))
			   (lt (minus (match_dup 0) (pc))
			       (const_int 131072)))
		      (const_int 4)
		      (const_int 36)))])
820
821; Returning from a normal function is different to returning from a
822; kernel function.
823
; Function return: normal functions return through the link register
; s[18:19]; kernel entry points instead drain outstanding memory ops,
; write back the scalar data cache, and end the program.
(define_insn "gcn_return"
  [(return)]
  ""
  {
    if (cfun && cfun->machine && cfun->machine->normal_function)
      return "s_setpc_b64\ts[18:19]";
    else
      return "s_waitcnt\tlgkmcnt(0)\;s_dcache_wb\;s_endpgm";
  }
  [(set_attr "type" "sop1")
   (set_attr "length" "12")])
835
; Standard "call" expander; the parallel's clobbers (LR and a DI
; scratch) let it match the gcn_simple_call/gcn_indirect_call insns.
(define_expand "call"
  [(parallel [(call (match_operand 0 "")
		    (match_operand 1 ""))
	      (clobber (reg:DI LR_REGNUM))
	      (clobber (match_scratch:DI 2))])]
  ""
  {})
843
; Direct call.  Alternative 0 ("Y", symbol): build the callee address
; PC-relative in the scratch pair, then s_swappc saves the return
; address in s[18:19].  Alternative 1 ("B"): swappc with an immediate.
(define_insn "gcn_simple_call"
  [(call (mem (match_operand 0 "immediate_operand" "Y,B"))
	 (match_operand 1 "const_int_operand"))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 2 "=&Sg,X"))]
  ""
  "@
  s_getpc_b64\t%2\;s_add_u32\t%L2, %L2, %0@rel32@lo+4\;s_addc_u32\t%H2, %H2, %0@rel32@hi+4\;s_swappc_b64\ts[18:19], %2
  s_swappc_b64\ts[18:19], %0"
  [(set_attr "type" "mult,sop1")
   (set_attr "length" "24,4")])
855
; Load a symbol or label address into an SGPR pair, PC-relative.
; Weak symbols may be unresolved, so their address is loaded through
; the GOT instead of computed directly.  The adds clobber SCC (declared
; as a clobber here); see movdi_symbol_save_scc for the LRA-safe form.
(define_insn "movdi_symbol"
 [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
       (match_operand:DI 1 "general_operand" "Y"))
  (clobber (reg:BI SCC_REG))]
 "GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF"
  {
    if (SYMBOL_REF_P (operands[1])
	&& SYMBOL_REF_WEAK (operands[1]))
	return "s_getpc_b64\t%0\;"
	       "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
	       "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
	       "s_load_dwordx2\t%0, %0\;"
	       "s_waitcnt\tlgkmcnt(0)";

    return "s_getpc_b64\t%0\;"
	   "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
	   "s_addc_u32\t%H0, %H0, %1@rel32@hi+4";
  }
 [(set_attr "type" "mult")
  (set_attr "length" "32")])
876
; As movdi_symbol, but preserves SCC across the address computation by
; stashing it in CC_SAVE_REG (s22) -- the .long 0xbe9600fd word is the
; hand-assembled "s_mov_b32 s22, scc" the assembler rejects -- and
; restoring it with s_cmpk_lg_u32.  Used while LRA is in progress, when
; an SCC clobber cannot be tolerated.
(define_insn "movdi_symbol_save_scc"
 [(set (match_operand:DI 0 "nonimmediate_operand" "=Sg")
       (match_operand:DI 1 "general_operand" "Y"))
  (clobber (reg:BI CC_SAVE_REG))]
 "(GET_CODE (operands[1]) == SYMBOL_REF || GET_CODE (operands[1]) == LABEL_REF)
  && (lra_in_progress || reload_completed)"
  {
    /* !!! These sequences clobber CC_SAVE_REG.  */

    if (SYMBOL_REF_P (operands[1])
	&& SYMBOL_REF_WEAK (operands[1]))
	return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	       ".long\t0xbe9600fd\;"
	       "s_getpc_b64\t%0\;"
	       "s_add_u32\t%L0, %L0, %1@gotpcrel32@lo+4\;"
	       "s_addc_u32\t%H0, %H0, %1@gotpcrel32@hi+4\;"
	       "s_load_dwordx2\t%0, %0\;"
	       "s_cmpk_lg_u32\ts22, 0\;"
	       "s_waitcnt\tlgkmcnt(0)";

    return "; s_mov_b32\ts22, scc is not supported by the assembler.\;"
	   ".long\t0xbe9600fd\;"
	   "s_getpc_b64\t%0\;"
	   "s_add_u32\t%L0, %L0, %1@rel32@lo+4\;"
	   "s_addc_u32\t%H0, %H0, %1@rel32@hi+4\;"
	   "s_cmpk_lg_u32\ts22, 0";
  }
 [(set_attr "type" "mult")
  (set_attr "length" "40")])
906
907
; Indirect call through an SGPR pair; return address saved in s[18:19].
(define_insn "gcn_indirect_call"
  [(call (mem (match_operand:DI 0 "register_operand" "Sg"))
	 (match_operand 1 "" ""))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 2 "=X"))]
  ""
  "s_swappc_b64\ts[18:19], %0"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
917
; Standard "call_value" expander; mirrors "call" with a result operand.
(define_expand "call_value"
  [(parallel [(set (match_operand 0 "")
		   (call (match_operand 1 "")
			 (match_operand 2 "")))
	      (clobber (reg:DI LR_REGNUM))
	      (clobber (match_scratch:DI 3))])]
  ""
  {})
926
; Value-returning direct call; the call sequences are identical to
; gcn_simple_call above.  Alternative 0 ("Y") is a four-instruction
; PC-relative sequence (a "mult", 24 bytes); alternative 1 ("B") is a
; single s_swappc (a "sop1", 4 bytes).  The attributes are given
; per-alternative to match gcn_simple_call -- the previous single
; "sop1"/"24" values mis-described both alternatives.
(define_insn "gcn_call_value"
  [(set (match_operand 0 "register_operand" "=Sg,Sg")
	(call (mem (match_operand 1 "immediate_operand" "Y,B"))
	      (match_operand 2 "const_int_operand")))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 3 "=&Sg,X"))]
  ""
  "@
  s_getpc_b64\t%3\;s_add_u32\t%L3, %L3, %1@rel32@lo+4\;s_addc_u32\t%H3, %H3, %1@rel32@hi+4\;s_swappc_b64\ts[18:19], %3
  s_swappc_b64\ts[18:19], %1"
  [(set_attr "type" "mult,sop1")
   (set_attr "length" "24,4")])
939
; Value-returning indirect call through an SGPR pair.
(define_insn "gcn_call_value_indirect"
  [(set (match_operand 0 "register_operand" "=Sg")
	(call (mem (match_operand:DI 1 "register_operand" "Sg"))
	      (match_operand 2 "" "")))
   (clobber (reg:DI LR_REGNUM))
   (clobber (match_scratch:DI 3 "=X"))]
  ""
  "s_swappc_b64\ts[18:19], %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4")])
950
951; GCN does not have an instruction to clear only part of the instruction
952; cache, so the operands are ignored.
953
; Invalidate the entire instruction cache (the begin/end operands are
; ignored; see the comment above).
(define_insn "clear_icache"
  [(unspec_volatile
    [(match_operand 0 "") (match_operand 1 "")]
    UNSPECV_ICACHE_INV)]
  ""
  "s_icache_inv"
  [(set_attr "type" "sopp")
   (set_attr "length" "4")])
962
963;; }}}
964;; {{{ Conditionals
965
966; 32-bit compare, scalar unit only
967
; 32-bit scalar compare setting SCC ("cs").  Alternative 1 uses the
; short s_cmpk form for SSL operands; alternatives 2 and 3 allow one
; side to be a wider constant at the cost of a literal dword (length 8).
; %D1 presumably prints the comparison-code suffix -- confirm against
; gcn_print_operand.
(define_insn "cstoresi4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand"
							 "=cs, cs, cs, cs")
	(match_operator:BI 1 "gcn_compare_operator"
	  [(match_operand:SI 2 "gcn_alu_operand"	 "SSA,SSA,SSB, SS")
	   (match_operand:SI 3 "gcn_alu_operand"	 "SSA,SSL, SS,SSB")]))]
  ""
  "@
   s_cmp%D1\t%2, %3
   s_cmpk%D1\t%2, %3
   s_cmp%D1\t%2, %3
   s_cmp%D1\t%2, %3"
  [(set_attr "type" "sopc,sopk,sopk,sopk")
   (set_attr "length" "4,4,8,8")])
982
; Expand a 32-bit conditional branch as a compare into a fresh BImode
; condition register (cstoresi4) followed by a jump on that register
; being non-zero (cjump).
(define_expand "cbranchsi4"
  [(match_operator 0 "gcn_compare_operator"
     [(match_operand:SI 1 "gcn_alu_operand")
      (match_operand:SI 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstoresi4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
996
997; 64-bit compare; either unit, but scalar allows limited operators
998
; 64-bit compare expander; lets recog pick one of the insns below
; (scalar-capable subset or vector-only) based on the operator.
(define_expand "cstoredi4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand")
	(match_operator:BI 1 "gcn_compare_operator"
			   [(match_operand:DI 2 "gcn_alu_operand")
			    (match_operand:DI 3 "gcn_alu_operand")]))]
  ""
  {})
1006
; 64-bit compare for the operator subset the scalar unit supports
; (gcn_compare_64bit_operator).  Alternative 0 runs on the scalar unit
; and sets "cs"; alternative 1 runs on the vector unit and sets "cV"
; (the VCC register pair, written explicitly in the template).
(define_insn "cstoredi4_vec_and_scalar"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cs,  cV")
	(match_operator:BI 1 "gcn_compare_64bit_operator"
	  [(match_operand:DI 2 "gcn_alu_operand"	       "%SSA,vSvC")
	   (match_operand:DI 3 "gcn_alu_operand"	       " SSC,   v")]))]
  ""
  "@
   s_cmp%D1\t%2, %3
   v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "sopc,vopc")
   (set_attr "length" "8")])
1018
; 64-bit compare, vector unit only: accepts the full
; gcn_compare_operator set but always targets VCC ("cV").
(define_insn "cstoredi4_vector"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "= cV")
	(match_operator:BI 1 "gcn_compare_operator"
          [(match_operand:DI 2 "gcn_alu_operand"	       "vSvB")
	   (match_operand:DI 3 "gcn_alu_operand"	       "   v")]))]
  ""
  "v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "vopc")
   (set_attr "length" "8")])
1028
; 64-bit conditional branch: compare into a BImode register via
; cstoredi4, then cjump on non-zero (same scheme as cbranchsi4).
(define_expand "cbranchdi4"
  [(match_operator 0 "gcn_compare_operator"
     [(match_operand:DI 1 "gcn_alu_operand")
      (match_operand:DI 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstoredi4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
1042
1043; FP compare; vector unit only
1044
; Floating-point (SF/DF) compare; only the vector unit has FP compares,
; so the result always lands in VCC ("cV").
(define_insn "cstore<mode>4"
  [(set (match_operand:BI 0 "gcn_conditional_register_operand" "=cV")
	(match_operator:BI 1 "gcn_fp_compare_operator"
	  [(match_operand:SFDF 2 "gcn_alu_operand"		"vB")
	   (match_operand:SFDF 3 "gcn_alu_operand"		 "v")]))]
  ""
  "v_cmp%E1\tvcc, %2, %3"
  [(set_attr "type" "vopc")
   (set_attr "length" "8")])
1054
; FP conditional branch: FP compare into a BImode register, then cjump
; on non-zero (same scheme as the integer cbranch expanders above).
(define_expand "cbranch<mode>4"
  [(match_operator 0 "gcn_fp_compare_operator"
     [(match_operand:SFDF 1 "gcn_alu_operand")
      (match_operand:SFDF 2 "gcn_alu_operand")])
   (match_operand 3)]
  ""
  {
    rtx cc = gen_reg_rtx (BImode);
    emit_insn (gen_cstore<mode>4 (cc, operands[0], operands[1], operands[2]));
    emit_jump_insn (gen_cjump (operands[3],
			       gen_rtx_NE (BImode, cc, const0_rtx), cc));
    DONE;
  })
1068
1069;; }}}
1070;; {{{ ALU special cases: Plus
1071
; 32-bit add.  Alternatives 0-2 use the scalar unit and clobber the
; scalar condition register (BI scratch "cs"); alternative 3 uses the
; vector unit and clobbers VCC (DI scratch "cV").  Alternative 1 is the
; compact s_addk form (tied operand 0/1, SgJ immediate); alternative 2
; takes a 32-bit literal ("B", length 8).
(define_insn "addsi3"
  [(set (match_operand:SI 0 "register_operand"         "= Sg, Sg, Sg,   v")
        (plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA,  0,SgA,   v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgA,SgJ,  B,vBSv")))
   (clobber (match_scratch:BI 3			       "= cs, cs, cs,   X"))
   (clobber (match_scratch:DI 4			       "=  X,  X,  X,  cV"))]
  ""
  "@
   s_add_i32\t%0, %1, %2
   s_addk_i32\t%0, %2
   s_add_i32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,sopk,sop2,vop2")
   (set_attr "length" "4,4,8,8")])
1086
; Named expander for a 32-bit add whose condition-register clobber is
; pinned to SCC (rather than a scratch), for callers that must stay on
; the scalar unit.
(define_expand "addsi3_scc"
  [(parallel [(set (match_operand:SI 0 "register_operand")
		   (plus:SI (match_operand:SI 1 "gcn_alu_operand")
			    (match_operand:SI 2 "gcn_alu_operand")))
	      (clobber (reg:BI SCC_REG))
	      (clobber (scratch:DI))])]
  ""
  {})
1095
1096; Having this as an insn_and_split allows us to keep together DImode adds
1097; through some RTL optimisation passes, and means the CC reg we set isn't
1098; dependent on the constraint alternative (which doesn't seem to work well).
1099
1100; If v_addc_u32 is used to add with carry, a 32-bit literal constant cannot be
1101; used as an operand due to the read of VCC, so we restrict constants to the
1102; inlinable range for that alternative.
1103
; 64-bit add, kept whole until after reload (see comments above), then
; split into a low-part add that sets the carry (addsi3_scalar_carry)
; and a high-part add that consumes it (addcsi3_scalar[_zero]).  The
; carry register is VCC when operand 1 landed in VGPRs, SCC otherwise.
(define_insn_and_split "adddi3"
  [(set (match_operand:DI 0 "register_operand"		 "=Sg, v")
	(plus:DI (match_operand:DI 1 "register_operand"  " Sg, v")
		 (match_operand:DI 2 "nonmemory_operand" "SgB,vA")))
   (clobber (match_scratch:BI 3				 "=cs, X"))
   (clobber (match_scratch:DI 4				 "= X,cV"))]
  ""
  "#"
  "&& reload_completed"
  [(const_int 0)]
  {
    /* Pick the carry register matching the register file chosen by RA.  */
    rtx cc = gen_rtx_REG (BImode, gcn_vgpr_register_operand (operands[1],
							     DImode)
			  ? VCC_REG : SCC_REG);

    emit_insn (gen_addsi3_scalar_carry
	       (gcn_operand_part (DImode, operands[0], 0),
		gcn_operand_part (DImode, operands[1], 0),
		gcn_operand_part (DImode, operands[2], 0),
		cc));
    rtx val = gcn_operand_part (DImode, operands[2], 1);
    if (val != const0_rtx)
      emit_insn (gen_addcsi3_scalar
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  gcn_operand_part (DImode, operands[2], 1),
		  cc, cc));
    else
      /* High part of the constant is zero: add only the carry.  */
      emit_insn (gen_addcsi3_scalar_zero
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  cc));
    DONE;
  }
  [(set_attr "type" "mult,vmult")
   (set_attr "length" "8")])
1140
; Named expander for a 64-bit add with the condition clobber pinned to
; SCC (scalar-unit form of adddi3); mirrors addsi3_scc.
(define_expand "adddi3_scc"
  [(parallel [(set (match_operand:DI 0 "register_operand")
		   (plus:DI (match_operand:DI 1 "register_operand")
			    (match_operand:DI 2 "nonmemory_operand")))
	      (clobber (reg:BI SCC_REG))
	      (clobber (scratch:DI))])]
  ""
  {})
1149
1150;; Add with carry.
1151
; 32-bit add that also publishes the carry-out: the second set models
; carry as (a + b) <u a, written to the condition register (SCC for the
; scalar alternative, VCC for the vector one).
(define_insn "addsi3_scalar_carry"
  [(set (match_operand:SI 0 "register_operand"	       "= Sg, v")
	(plus:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgB,vB")))
   (set (match_operand:BI 3 "register_operand"	       "= cs,cV")
	(ltu:BI (plus:SI (match_dup 1)
			 (match_dup 2))
		(match_dup 1)))]
  ""
  "@
   s_add_u32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8,8")])
1166
; Constant-addend variant of addsi3_scalar_carry.  With op2 == -op3
; (enforced by the insn condition), carry-out of a + op2 is expressed
; as (a + op2) >=u -op2, which matches how combine canonicalizes it.
(define_insn "addsi3_scalar_carry_cst"
  [(set (match_operand:SI 0 "register_operand"           "=Sg, v")
        (plus:SI (match_operand:SI 1 "gcn_alu_operand"   "SgA, v")
		 (match_operand:SI 2 "const_int_operand" "  n, n")))
   (set (match_operand:BI 4 "register_operand"           "=cs,cV")
	(geu:BI (plus:SI (match_dup 1)
			 (match_dup 2))
		(match_operand:SI 3 "const_int_operand"  "  n, n")))]
  "INTVAL (operands[2]) == -INTVAL (operands[3])"
  "@
   s_add_u32\t%0, %1, %2
   v_add%^_u32\t%0, vcc, %2, %1"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "4")])
1181
; Add-with-carry-in: result = carry-in (operand 3, zero-extended) + op1
; + op2.  Operand 4 (tied to operand 3 via the "3" matching constraint)
; receives the new carry, modelled as the IOR of the two possible
; unsigned overflows of the three-way sum.
; NOTE(review): operand 3 is read here yet constrained "="; confirm
; whether "+" (or no modifier, given the tie through operand 4) was
; intended.
(define_insn "addcsi3_scalar"
  [(set (match_operand:SI 0 "register_operand"			   "= Sg, v")
	(plus:SI (plus:SI (zero_extend:SI
			    (match_operand:BI 3 "register_operand" "= cs,cV"))
			  (match_operand:SI 1 "gcn_alu_operand"    "%SgA, v"))
		 (match_operand:SI 2 "gcn_alu_operand"		   " SgB,vA")))
   (set (match_operand:BI 4 "register_operand"			   "=  3, 3")
	(ior:BI (ltu:BI (plus:SI
			  (plus:SI
			    (zero_extend:SI (match_dup 3))
			    (match_dup 1))
			  (match_dup 2))
			(match_dup 2))
		(ltu:BI (plus:SI (zero_extend:SI (match_dup 3)) (match_dup 1))
			(match_dup 1))))]
  ""
  "@
   s_addc_u32\t%0, %1, %2
   v_addc%^_u32\t%0, vcc, %2, %1, vcc"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8,4")])
1203
; Add-with-carry-in where the addend's high part is zero: emits
; s_addc/v_addc with literal 0, adding only the carry to operand 1.
; The carry register (operand 2) is both consumed and rewritten with
; the new carry-out.
; NOTE(review): operand 2 is read and set yet constrained "="; confirm
; whether "+" was intended.
(define_insn "addcsi3_scalar_zero"
  [(set (match_operand:SI 0 "register_operand"		  "=Sg, v")
        (plus:SI (zero_extend:SI
		   (match_operand:BI 2 "register_operand" "=cs,cV"))
		 (match_operand:SI 1 "gcn_alu_operand"    "SgA, v")))
   (set (match_dup 2)
	(ltu:BI (plus:SI (zero_extend:SI (match_dup 2))
			 (match_dup 1))
		(match_dup 1)))]
  ""
  "@
   s_addc_u32\t%0, %1, 0
   v_addc%^_u32\t%0, vcc, 0, %1, vcc"
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "4")])
1219
1220; "addptr" is the same as "add" except that it must not write to VCC or SCC
1221; as a side-effect.  Unfortunately GCN does not have a suitable instruction
1222; for this, so we use a custom VOP3 add with CC_SAVE_REG as a temp.
1223; Note that it is not safe to save/clobber/restore SCC because doing so will
1224; break data-flow analysis, so this must use vector registers.
1225;
1226; The "v0" should be just "v", but somehow the "0" helps LRA not loop forever
1227; on testcase pr54713-2.c with -O0. It's only an optimization hint anyway.
1228
; Flag-preserving 64-bit pointer add (see the comment block above):
; emits a VOP3 add/addc pair with CC_SAVE_REG as the carry temporary so
; neither VCC nor SCC is visibly clobbered.
(define_insn "addptrdi3"
  [(set (match_operand:DI 0 "register_operand"		 "= v")
	(plus:DI (match_operand:DI 1 "register_operand"	 " v0")
		 (match_operand:DI 2 "nonmemory_operand" "vDA")))]
  ""
  {
    /* operands[3] = the carry temporary (CC_SAVE_REG pair).  */
    rtx new_operands[4] = { operands[0], operands[1], operands[2],
			    gen_rtx_REG (DImode, CC_SAVE_REG) };

    output_asm_insn ("v_add%^_u32 %L0, %3, %L2, %L1", new_operands);
    output_asm_insn ("v_addc%^_u32 %H0, %3, %H2, %H1, %3", new_operands);

    return "";
  }
  [(set_attr "type" "vmult")
   (set_attr "length" "16")])
1245
1246;; }}}
1247;; {{{ ALU special cases: Minus
1248
; 32-bit subtract.  Scalar alternatives 0/1 clobber SCC ("cs"); vector
; alternatives 2/3 clobber VCC ("cV").  Alternative 2 uses v_subrev
; (operands swapped in the template) so the immediate/scalar source can
; sit in the src0 slot.
(define_insn "subsi3"
  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg,    v,   v")
	(minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgA,    v,vBSv")
		  (match_operand:SI 2 "gcn_alu_operand" "SgA,  B, vBSv,   v")))
   (clobber (match_scratch:BI 3				"=cs, cs,    X,   X"))
   (clobber (match_scratch:DI 4				"= X,  X,   cV,  cV"))]
  ""
  "@
   s_sub_i32\t%0, %1, %2
   s_sub_i32\t%0, %1, %2
   v_subrev%^_u32\t%0, vcc, %2, %1
   v_sub%^_u32\t%0, vcc, %1, %2"
  [(set_attr "type" "sop2,sop2,vop2,vop2")
   (set_attr "length" "4,8,8,8")])
1263
; 64-bit subtract, scalar unit only.  Split after reload into a
; low-part subtract that sets the borrow in SCC (subsi3_scalar_carry)
; and a high-part subtract-with-borrow (subcsi3_scalar[_zero]);
; mirrors the structure of adddi3 above.
(define_insn_and_split "subdi3"
  [(set (match_operand:DI 0 "register_operand"        "=Sg, Sg")
	(minus:DI
		(match_operand:DI 1 "gcn_alu_operand" "SgA,SgB")
		(match_operand:DI 2 "gcn_alu_operand" "SgB,SgA")))
   (clobber (reg:BI SCC_REG))]
  ""
  "#"
  "reload_completed"
  [(const_int 0)]
  {
    emit_insn (gen_subsi3_scalar_carry
	       (gcn_operand_part (DImode, operands[0], 0),
		gcn_operand_part (DImode, operands[1], 0),
		gcn_operand_part (DImode, operands[2], 0)));
    rtx val = gcn_operand_part (DImode, operands[2], 1);
    if (val != const0_rtx)
      emit_insn (gen_subcsi3_scalar
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1),
		  gcn_operand_part (DImode, operands[2], 1)));
    else
      /* High part of the subtrahend is zero: subtract only the borrow.  */
      emit_insn (gen_subcsi3_scalar_zero
		 (gcn_operand_part (DImode, operands[0], 1),
		  gcn_operand_part (DImode, operands[1], 1)));
    DONE;
  }
  [(set_attr "length" "8")])
1292
; 32-bit subtract that publishes the borrow-out in SCC, modelled as
; (a - b) >u a (unsigned wrap-around implies a borrow).
(define_insn "subsi3_scalar_carry"
  [(set (match_operand:SI 0 "register_operand"          "=Sg, Sg")
        (minus:SI (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB")
		  (match_operand:SI 2 "gcn_alu_operand" "SgB,SgA")))
   (set (reg:BI SCC_REG)
	(gtu:BI (minus:SI (match_dup 1)
			  (match_dup 2))
		(match_dup 1)))]
  ""
  "s_sub_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "8")])
1305
; Constant-subtrahend variant of subsi3_scalar_carry.  With op2 == -op3
; (enforced by the insn condition), the borrow is written as
; (a - op2) <=u -op2, matching combine's canonical form.
(define_insn "subsi3_scalar_carry_cst"
  [(set (match_operand:SI 0 "register_operand"           "=Sg")
        (minus:SI (match_operand:SI 1 "gcn_alu_operand"  "SgA")
		 (match_operand:SI 2 "const_int_operand" "  n")))
   (set (reg:BI SCC_REG)
	(leu:BI (minus:SI (match_dup 1)
			 (match_dup 2))
		(match_operand:SI 3 "const_int_operand"  "  n")))]
  "INTVAL (operands[2]) == -INTVAL (operands[3])"
  "s_sub_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "4")])
1318
; Subtract-with-borrow-in: result = borrow (SCC, zero-extended) - op1
; - op2 as written; SCC is rewritten with the new borrow, modelled as
; the IOR of the two possible unsigned wrap-arounds.
(define_insn "subcsi3_scalar"
  [(set (match_operand:SI 0 "register_operand"                    "=Sg, Sg")
        (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
			    (match_operand:SI 1 "gcn_alu_operand" "SgA,SgB"))
		 (match_operand:SI 2 "gcn_alu_operand"            "SgB,SgA")))
   (set (reg:BI SCC_REG)
	(ior:BI (gtu:BI (minus:SI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
					    (match_dup 1))
				 (match_dup 2))
			(match_dup 1))
		(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG))
				  (match_dup 1))
			(match_dup 1))))]
  ""
  "s_subb_u32\t%0, %1, %2"
  [(set_attr "type" "sop2")
   (set_attr "length" "8")])
1336
; Subtract-with-borrow where the subtrahend's high part is zero:
; s_subb with literal 0 applies only the incoming SCC borrow, and SCC
; receives the new borrow.
(define_insn "subcsi3_scalar_zero"
  [(set (match_operand:SI 0 "register_operand"		"=Sg")
        (minus:SI (zero_extend:SI (reg:BI SCC_REG))
		  (match_operand:SI 1 "gcn_alu_operand" "SgA")))
   (set (reg:BI SCC_REG)
	(gtu:BI (minus:SI (zero_extend:SI (reg:BI SCC_REG)) (match_dup 1))
		(match_dup 1)))]
  ""
  "s_subb_u32\t%0, %1, 0"
  [(set_attr "type" "sop2")
   (set_attr "length" "4")])
1348
1349;; }}}
1350;; {{{ ALU: mult
1351
1352; Vector multiply has vop3a encoding, but no corresponding vop2a, so no long
1353; immediate.
; 32-bit multiply (low half).  Scalar forms: plain, compact s_mulk
; (tied operand, "J" immediate), and 32-bit-literal; vector form uses
; v_mul_lo_i32 (VOP3A — no long immediate, per the comment above).
(define_insn "mulsi3"
  [(set (match_operand:SI 0 "register_operand"	       "= Sg,Sg, Sg,   v")
        (mult:SI (match_operand:SI 1 "gcn_alu_operand" "%SgA, 0,SgA,   v")
		 (match_operand:SI 2 "gcn_alu_operand" " SgA, J,  B,vASv")))]
  ""
  "@
   s_mul_i32\t%0, %1, %2
   s_mulk_i32\t%0, %2
   s_mul_i32\t%0, %1, %2
   v_mul_lo_i32\t%0, %1, %2"
  [(set_attr "type" "sop2,sopk,sop2,vop3a")
   (set_attr "length" "4,4,8,4")])
1366
; Signed/unsigned iterator and the mnemonic fragments each variant
; substitutes into the multiply patterns below.
(define_code_iterator any_extend [sign_extend zero_extend])
(define_code_attr sgnsuffix [(sign_extend "%i") (zero_extend "%u")])
(define_code_attr su [(sign_extend "s") (zero_extend "u")])
(define_code_attr u [(sign_extend "") (zero_extend "u")])
(define_code_attr iu [(sign_extend "i") (zero_extend "u")])
(define_code_attr e [(sign_extend "e") (zero_extend "")])
1373
; High 32 bits of a 32x32->64 multiply (signed or unsigned via
; any_extend), vector unit only: v_mul_hi_{i32,u32}.
(define_insn "<su>mulsi3_highpart"
  [(set (match_operand:SI 0 "register_operand"	       "= v")
	(truncate:SI
	  (lshiftrt:DI
	    (mult:DI
	      (any_extend:DI
		(match_operand:SI 1 "register_operand" "% v"))
	      (any_extend:DI
		(match_operand:SI 2 "register_operand" "vSv")))
	    (const_int 32))))]
  ""
  "v_mul_hi<sgnsuffix>0\t%0, %2, %1"
  [(set_attr "type" "vop3a")
   (set_attr "length" "8")])
1388
; Widening 16x16->32 multiply using the 24-bit multiplier with SDWA
; word selects to restrict the sources to their low 16 bits.
(define_insn "<u>mulhisi3"
  [(set (match_operand:SI 0 "register_operand"			"=v")
	(mult:SI
	  (any_extend:SI (match_operand:HI 1 "register_operand" "%v"))
	  (any_extend:SI (match_operand:HI 2 "register_operand" " v"))))]
  ""
  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:WORD_0 src1_sel:WORD_0"
  [(set_attr "type" "vop_sdwa")
   (set_attr "length" "8")])
1398
; Widening 8x8->16 multiply; same 24-bit multiplier, with SDWA byte
; selects restricting the sources to their low 8 bits.
(define_insn "<u>mulqihi3_scalar"
  [(set (match_operand:HI 0 "register_operand"			"=v")
	(mult:HI
	  (any_extend:HI (match_operand:QI 1 "register_operand" "%v"))
	  (any_extend:HI (match_operand:QI 2 "register_operand" " v"))))]
  ""
  "v_mul_<iu>32_<iu>24_sdwa\t%0, %<e>1, %<e>2 src0_sel:BYTE_0 src1_sel:BYTE_0"
  [(set_attr "type" "vop_sdwa")
   (set_attr "length" "8")])
1408
1409;; }}}
1410;; {{{ ALU: generic 32-bit unop
1411
; Unary bit operations sharing one pattern; popcount's vector mnemonic
; (v_bcnt_u32_b32) needs a trailing ", 0" accumulator operand.
(define_code_iterator bitunop [not popcount])
(define_code_attr popcount_extra_op [(not "") (popcount ", 0")])
1414
; 32-bit NOT/POPCOUNT: scalar alternative clobbers SCC ("cs"); vector
; alternative has no condition clobber.
(define_insn "<expander>si2"
  [(set (match_operand:SI 0 "register_operand"  "=Sg,   v")
        (bitunop:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgB,vSvB")))
   (clobber (match_scratch:BI 2			"=cs,   X"))]
  ""
  "@
   s_<s_mnemonic>0\t%0, %1
   v_<mnemonic>0\t%0, %1<popcount_extra_op>"
  [(set_attr "type" "sop1,vop1")
   (set_attr "length" "8")])
1426
; Count-leading/trailing-zeros, scalar unit only; the second
; alternative accepts a 32-bit literal (length 8).
(define_code_iterator countzeros [clz ctz])

(define_insn "<expander>si2"
  [(set (match_operand:SI 0 "register_operand"  "=Sg,Sg")
        (countzeros:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgA, B")))]
  ""
  "s_<s_mnemonic>1\t%0, %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4,8")])
1437
1438; The truncate ensures that a constant passed to operand 1 is treated as DImode
; 64-bit clz/ctz producing an SImode count; the truncate wrapper makes
; constant operands DImode, as the comment above explains.
(define_insn "<expander>di2"
  [(set (match_operand:SI 0 "register_operand"    "=Sg,Sg")
	(truncate:SI
	  (countzeros:DI
	    (match_operand:DI 1 "gcn_alu_operand" "SgA, B"))))]
  ""
  "s_<s_mnemonic>1\t%0, %1"
  [(set_attr "type" "sop1")
   (set_attr "length" "4,8")])
1448
1449;; }}}
1450;; {{{ ALU: generic 32-bit binop
1451
1452; No plus and mult - they have variant with 16bit immediate
1453; and thus are defined later.
; Binary-op iterators: "binop" is the full set (for the _scc expander);
; the commutative subset and the non-commutative shifts each get their
; own insn pattern below.
(define_code_iterator binop [and ior xor smin smax umin umax
				 ashift lshiftrt ashiftrt])
(define_code_iterator vec_and_scalar_com [and ior xor smin smax umin umax])
(define_code_iterator vec_and_scalar_nocom [ashift lshiftrt ashiftrt])
1458
; Commutative 32-bit binops: scalar (clobbers SCC), vector, and an LDS
; read-modify-write alternative (RD destination tied to operand 1,
; ds_<op> addressing via %A0/%O0).
(define_insn "<expander>si3"
  [(set (match_operand:SI 0 "gcn_valu_dst_operand"    "= Sg,   v,RD")
        (vec_and_scalar_com:SI
	  (match_operand:SI 1 "gcn_valu_src0_operand" "%SgA,vSvB, 0")
	  (match_operand:SI 2 "gcn_alu_operand"	      " SgB,   v, v")))
   (clobber (match_scratch:BI 3			      "= cs,   X, X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   v_<mnemonic>0\t%0, %1, %2
   ds_<mnemonic>0\t%A0, %2%O0"
  [(set_attr "type" "sop2,vop2,ds")
   (set_attr "length" "8")])
1472
; Non-commutative 32-bit binops (shifts).  The vector alternative uses
; the operand-reversed mnemonic so the shift amount lands in src0.
(define_insn "<expander>si3"
  [(set (match_operand:SI 0 "register_operand"  "=Sg, Sg,   v")
        (vec_and_scalar_nocom:SI
	  (match_operand:SI 1 "gcn_alu_operand" "SgB,SgA,   v")
	  (match_operand:SI 2 "gcn_alu_operand" "SgA,SgB,vSvB")))
   (clobber (match_scratch:BI 3			"=cs, cs,   X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   s_<mnemonic>0\t%0, %1, %2
   v_<revmnemonic>0\t%0, %2, %1"
  [(set_attr "type" "sop2,sop2,vop2")
   (set_attr "length" "8")])
1486
; Named expander pinning the condition clobber to SCC for any 32-bit
; binop (scalar-unit form), mirroring addsi3_scc.
(define_expand "<expander>si3_scc"
  [(parallel [(set (match_operand:SI 0 "gcn_valu_dst_operand")
		   (binop:SI
		     (match_operand:SI 1 "gcn_valu_src0_operand")
		     (match_operand:SI 2 "gcn_alu_operand")))
	      (clobber (reg:BI SCC_REG))])]
  ""
  {})
1495
1496;; }}}
1497;; {{{ ALU: generic 64-bit
1498
; 64-bit commutative bit operations.  The scalar alternative is a
; single s_<op>_b64; the vector alternative has no 64-bit form and is
; split after reload into independent low-part and high-part 32-bit
; operations.
(define_code_iterator vec_and_scalar64_com [and ior xor])

(define_insn_and_split "<expander>di3"
   [(set (match_operand:DI 0 "register_operand"  "= Sg,    v")
	 (vec_and_scalar64_com:DI
	  (match_operand:DI 1 "gcn_alu_operand"  "%SgA,vSvDB")
	   (match_operand:DI 2 "gcn_alu_operand" " SgC,    v")))
   (clobber (match_scratch:BI 3			 "= cs,    X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   #"
  "reload_completed && gcn_vgpr_register_operand (operands[0], DImode)"
  [(parallel [(set (match_dup 4)
		   (vec_and_scalar64_com:SI (match_dup 5) (match_dup 6)))
	      (clobber (match_dup 3))])
   (parallel [(set (match_dup 7)
		   (vec_and_scalar64_com:SI (match_dup 8) (match_dup 9)))
	      (clobber (match_dup 3))])]
  {
    /* Low halves (4 = dest, 5/6 = sources), then high halves (7/8/9).  */
    operands[4] = gcn_operand_part (DImode, operands[0], 0);
    operands[5] = gcn_operand_part (DImode, operands[1], 0);
    operands[6] = gcn_operand_part (DImode, operands[2], 0);
    operands[7] = gcn_operand_part (DImode, operands[0], 1);
    operands[8] = gcn_operand_part (DImode, operands[1], 1);
    operands[9] = gcn_operand_part (DImode, operands[2], 1);
  }
  [(set_attr "type" "sop2,vop2")
   (set_attr "length" "8")])
1528
; 64-bit shifts; note the shift amount (operand 2) is SImode.  The
; vector alternative uses the reversed mnemonic, as in the SImode
; shift pattern.
(define_insn "<expander>di3"
  [(set (match_operand:DI 0 "register_operand"   "=Sg, Sg,   v")
	(vec_and_scalar_nocom:DI
	  (match_operand:DI 1 "gcn_alu_operand"  "SgC,SgA,   v")
	  (match_operand:SI 2 "gcn_alu_operand"  "SgA,SgC,vSvC")))
   (clobber (match_scratch:BI 3			 "=cs, cs,   X"))]
  ""
  "@
   s_<mnemonic>0\t%0, %1, %2
   s_<mnemonic>0\t%0, %1, %2
   v_<revmnemonic>0\t%0, %2, %1"
  [(set_attr "type" "sop2,sop2,vop2")
   (set_attr "length" "8")])
1542
1543;; }}}
1544;; {{{ Atomics
1545
; Each compute unit has its own L1 cache. The L2 cache is shared between
1547; all the compute units.  Any load or store instruction can skip L1 and
1548; access L2 directly using the "glc" flag.  Atomic instructions also skip
1549; L1.  The L1 cache can be flushed and invalidated using instructions.
1550;
1551; Therefore, in order for "acquire" and "release" atomic modes to work
1552; correctly across compute units we must flush before each "release"
1553; and invalidate the cache after each "acquire".  It might seem like
1554; invalidation could be safely done before an "acquire", but since each
1555; compute unit can run up to 40 threads simultaneously, all reading values
1556; into the L1 cache, this is not actually safe.
1557;
1558; Additionally, scalar flat instructions access L2 via a different cache
; (the "constant cache"), so they have separate control instructions.  We
1560; do not attempt to invalidate both caches at once; instead, atomics
1561; operating on scalar flat pointers will flush the constant cache, and
1562; atomics operating on flat or global pointers will flush L1.  It is up to
1563; the programmer to get this right.
1564
; RMW operations the atomic patterns support, and the "_X2" mnemonic
; suffix used for 64-bit (DImode) atomics.
(define_code_iterator atomicops [plus minus and ior xor])
(define_mode_attr X [(SI "") (DI "_X2")])
1567
1568;; TODO compare_and_swap test_and_set inc dec
1569;; Hardware also supports min and max, but GCC does not.
1570
; Standard memory_barrier expander: build a volatile scratch BLKmode
; MEM for the *memory_barrier insn below to match.
(define_expand "memory_barrier"
  [(set (match_dup 0)
	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
  ""
  {
    operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
    MEM_VOLATILE_P (operands[0]) = 1;
  })
1579
; Memory barrier implementation: write back and invalidate the
; volatile L1 cache (buffer_wbinvl1_vol); see the cache discussion
; above.
(define_insn "*memory_barrier"
  [(set (match_operand:BLK 0)
	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
  ""
  "buffer_wbinvl1_vol"
  [(set_attr "type" "mubuf")
   (set_attr "length" "4")])
1587
1588; FIXME: These patterns have been disabled as they do not seem to work
1589; reliably - they can cause hangs or incorrect results.
1590; TODO: flush caches according to memory model
; Atomic fetch-and-op returning the OLD value ("glc" flag), for scalar
; (RS), flat (RF), and global (RM) address spaces.  Disabled via the
; "0" insn condition — see the FIXME above.  Operand 3 is the memory
; model, currently unused by the templates.
(define_insn "atomic_fetch_<bare_mnemonic><mode>"
  [(set (match_operand:SIDI 0 "register_operand"     "=Sm, v, v")
	(match_operand:SIDI 1 "memory_operand"	     "+RS,RF,RM"))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(atomicops:SIDI
	    (match_dup 1)
	    (match_operand:SIDI 2 "register_operand" " Sm, v, v"))]
	   UNSPECV_ATOMIC))
   (use (match_operand 3 "const_int_operand"))]
  "0 /* Disabled.  */"
  "@
   s_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_<bare_mnemonic><X>\t%0, %1, %2 glc\;s_waitcnt\t0
   global_atomic_<bare_mnemonic><X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1609
1610; FIXME: These patterns are disabled because the instructions don't
1611; seem to work as advertised.  Specifically, OMP "team distribute"
1612; reductions apparently "lose" some of the writes, similar to what
1613; you might expect from a concurrent non-atomic read-modify-write.
1614; TODO: flush caches according to memory model
; Atomic op with no returned value (no "glc").  Disabled via the "0"
; insn condition — see the FIXME above about lost writes.  Operand 2
; is the memory model, currently unused by the templates.
(define_insn "atomic_<bare_mnemonic><mode>"
  [(set (match_operand:SIDI 0 "memory_operand"       "+RS,RF,RM")
	(unspec_volatile:SIDI
	  [(atomicops:SIDI
	    (match_dup 0)
	    (match_operand:SIDI 1 "register_operand" " Sm, v, v"))]
	  UNSPECV_ATOMIC))
   (use (match_operand 2 "const_int_operand"))]
  "0 /* Disabled.  */"
  "@
   s_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_<bare_mnemonic><X>\t%0, %1\;s_waitcnt\t0
   global_atomic_<bare_mnemonic><X>\t%A0, %1%O0\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1631
; Helper mode attributes for the CAS patterns: double-width mode,
; byte size, and bit size of SI/DI.
(define_mode_attr x2 [(SI "DI") (DI "TI")])
(define_mode_attr size [(SI "4") (DI "8")])
(define_mode_attr bitsize [(SI "32") (DI "64")])
1635
; Compare-and-swap expander.  LDS memory gets its own ds_cmpst-based
; insn; otherwise the swap value (operand 2) and expected value
; (operand 3) are packed into a double-width register pair — value in
; the low half, comparand in the high half — as the cmpswap
; instructions require.
(define_expand "sync_compare_and_swap<mode>"
  [(match_operand:SIDI 0 "register_operand")
   (match_operand:SIDI 1 "memory_operand")
   (match_operand:SIDI 2 "register_operand")
   (match_operand:SIDI 3 "register_operand")]
  ""
  {
    if (MEM_ADDR_SPACE (operands[1]) == ADDR_SPACE_LDS)
      {
	emit_insn (gen_sync_compare_and_swap<mode>_lds_insn (operands[0],
							     operands[1],
							     operands[2],
							     operands[3]));
	DONE;
      }

    /* Operands 2 and 3 must be placed in consecutive registers, and passed
       as a combined value.  */
    rtx src_cmp = gen_reg_rtx (<x2>mode);
    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, 0), operands[3]);
    emit_move_insn (gen_rtx_SUBREG (<MODE>mode, src_cmp, <size>), operands[2]);
    emit_insn (gen_sync_compare_and_swap<mode>_insn (operands[0],
						     operands[1],
						     src_cmp));
    DONE;
  })
1662
; CAS instruction proper; operand 2 is the packed value/comparand pair
; (mode <x2>) built by the expander above.  "glc" returns the old
; value into operand 0; "delayeduse" keeps the flat/global result
; register live across the latency window.
(define_insn "sync_compare_and_swap<mode>_insn"
  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
	(match_operand:SIDI 1 "memory_operand"      "+RS,RF,RM"))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(match_operand:<x2> 2 "register_operand" " Sm, v, v")]
	  UNSPECV_ATOMIC))]
  ""
  "@
   s_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)
   flat_atomic_cmpswap<X>\t%0, %1, %2 glc\;s_waitcnt\t0
   global_atomic_cmpswap<X>\t%0, %A1, %2%O1 glc\;s_waitcnt\tvmcnt(0)"
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "12")
   (set_attr "gcn_version" "gcn5,*,gcn5")
   (set_attr "delayeduse" "*,yes,yes")])
1679
; CAS on LDS memory: ds_cmpst_rtn takes the swap value (operand 2) and
; comparand (operand 3) separately, so no register-pair packing is
; needed; the old value is returned in operand 0.
(define_insn "sync_compare_and_swap<mode>_lds_insn"
  [(set (match_operand:SIDI 0 "register_operand"    "= v")
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 1 "memory_operand"   "+RL")]
	  UNSPECV_ATOMIC))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 2 "register_operand" "  v")
	   (match_operand:SIDI 3 "register_operand" "  v")]
	  UNSPECV_ATOMIC))]
  ""
  "ds_cmpst_rtn_b<bitsize> %0, %1, %2, %3\;s_waitcnt\tlgkmcnt(0)"
  [(set_attr "type" "ds")
   (set_attr "length" "12")])
1694
; Atomic load for scalar (RS), flat (RF), and global (RM) address
; spaces.  Operand 2 is the memory model: relaxed is a bare glc load +
; wait; acquire additionally invalidates the appropriate cache after
; the load; seq-cst/acq-rel also write back/invalidate before it (see
; the cache-coherence comments at the top of this section).
(define_insn "atomic_load<mode>"
  [(set (match_operand:SIDI 0 "register_operand"  "=Sm, v, v")
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 1 "memory_operand" " RS,RF,RM")]
	  UNSPECV_ATOMIC))
   (use (match_operand:SIDI 2 "immediate_operand" "  i, i, i"))]
  ""
  {
    switch (INTVAL (operands[2]))
      {
      case MEMMODEL_RELAXED:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)";
	  case 1:
	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0";
	  case 2:
	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)";
	  }
	break;
      case MEMMODEL_CONSUME:
      case MEMMODEL_ACQUIRE:
      case MEMMODEL_SYNC_ACQUIRE:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_load%o0\t%0, %A1 glc\;s_waitcnt\tlgkmcnt(0)\;"
		   "s_dcache_wb_vol";
	  case 1:
	    return "flat_load%o0\t%0, %A1%O1 glc\;s_waitcnt\t0\;"
		   "buffer_wbinvl1_vol";
	  case 2:
	    return "global_load%o0\t%0, %A1%O1 glc\;s_waitcnt\tvmcnt(0)\;"
		   "buffer_wbinvl1_vol";
	  }
	break;
      case MEMMODEL_ACQ_REL:
      case MEMMODEL_SEQ_CST:
      case MEMMODEL_SYNC_SEQ_CST:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_dcache_wb_vol\;s_load%o0\t%0, %A1 glc\;"
		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
	  case 1:
	    return "buffer_wbinvl1_vol\;flat_load%o0\t%0, %A1%O1 glc\;"
		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
	  case 2:
	    return "buffer_wbinvl1_vol\;global_load%o0\t%0, %A1%O1 glc\;"
		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
	  }
	break;
      }
    gcc_unreachable ();
  }
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "20")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1754
; Atomic store; structure parallels atomic_load above.  Operand 2 is
; the memory model: relaxed is a bare glc store + wait; release writes
; back/invalidates the cache before the store; seq-cst/acq-rel also
; waits and invalidates afterwards.
(define_insn "atomic_store<mode>"
  [(set (match_operand:SIDI 0 "memory_operand"      "=RS,RF,RM")
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 1 "register_operand" " Sm, v, v")]
	  UNSPECV_ATOMIC))
  (use (match_operand:SIDI 2 "immediate_operand"    "  i, i, i"))]
  ""
  {
    switch (INTVAL (operands[2]))
      {
      case MEMMODEL_RELAXED:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_store%o1\t%1, %A0 glc\;s_waitcnt\tlgkmcnt(0)";
	  case 1:
	    return "flat_store%o1\t%A0, %1%O0 glc\;s_waitcnt\t0";
	  case 2:
	    return "global_store%o1\t%A0, %1%O0 glc\;s_waitcnt\tvmcnt(0)";
	  }
	break;
      case MEMMODEL_RELEASE:
      case MEMMODEL_SYNC_RELEASE:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc";
	  case 1:
	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc";
	  case 2:
	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc";
	  }
	break;
      case MEMMODEL_ACQ_REL:
      case MEMMODEL_SEQ_CST:
      case MEMMODEL_SYNC_SEQ_CST:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_dcache_wb_vol\;s_store%o1\t%1, %A0 glc\;"
		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
	  case 1:
	    return "buffer_wbinvl1_vol\;flat_store%o1\t%A0, %1%O0 glc\;"
		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
	  case 2:
	    return "buffer_wbinvl1_vol\;global_store%o1\t%A0, %1%O0 glc\;"
		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
	  }
	break;
      }
    gcc_unreachable ();
  }
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "20")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1810
; Atomic exchange: swap operand 2 into memory, returning the old value
; in operand 0 ("glc").  Operand 3 is the memory model; cache
; write-back/invalidate sequences are placed before and/or after the
; swap per the acquire/release discussion at the top of this section.
(define_insn "atomic_exchange<mode>"
  [(set (match_operand:SIDI 0 "register_operand"    "=Sm, v, v")
        (match_operand:SIDI 1 "memory_operand"	    "+RS,RF,RM"))
   (set (match_dup 1)
	(unspec_volatile:SIDI
	  [(match_operand:SIDI 2 "register_operand" " Sm, v, v")]
	  UNSPECV_ATOMIC))
   (use (match_operand 3 "immediate_operand"))]
  ""
  {
    switch (INTVAL (operands[3]))
      {
      case MEMMODEL_RELAXED:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)";
	  case 1:
	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0";
	  case 2:
	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
		   "s_waitcnt\tvmcnt(0)";
	  }
	break;
      case MEMMODEL_CONSUME:
      case MEMMODEL_ACQUIRE:
      case MEMMODEL_SYNC_ACQUIRE:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\tlgkmcnt(0)\;"
		   "s_dcache_wb_vol\;s_dcache_inv_vol";
	  case 1:
	    return "flat_atomic_swap<X>\t%0, %1, %2 glc\;s_waitcnt\t0\;"
		   "buffer_wbinvl1_vol";
	  case 2:
	    return "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
	  }
	break;
      case MEMMODEL_RELEASE:
      case MEMMODEL_SYNC_RELEASE:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
		   "s_waitcnt\tlgkmcnt(0)";
	  case 1:
	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
		   "s_waitcnt\t0";
	  case 2:
	    return "buffer_wbinvl1_vol\;"
		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
		   "s_waitcnt\tvmcnt(0)";
	  }
	break;
      case MEMMODEL_ACQ_REL:
      case MEMMODEL_SEQ_CST:
      case MEMMODEL_SYNC_SEQ_CST:
	switch (which_alternative)
	  {
	  case 0:
	    return "s_dcache_wb_vol\;s_atomic_swap<X>\t%0, %1, %2 glc\;"
		   "s_waitcnt\tlgkmcnt(0)\;s_dcache_inv_vol";
	  case 1:
	    return "buffer_wbinvl1_vol\;flat_atomic_swap<X>\t%0, %1, %2 glc\;"
		   "s_waitcnt\t0\;buffer_wbinvl1_vol";
	  case 2:
	    return "buffer_wbinvl1_vol\;"
		   "global_atomic_swap<X>\t%0, %A1, %2%O1 glc\;"
		   "s_waitcnt\tvmcnt(0)\;buffer_wbinvl1_vol";
	  }
	break;
      }
    gcc_unreachable ();
  }
  [(set_attr "type" "smem,flat,flat")
   (set_attr "length" "20")
   (set_attr "gcn_version" "gcn5,*,gcn5")])
1890
1891;; }}}
1892;; {{{ OpenACC / OpenMP
1893
;; Load the OpenACC partitioning size for the compile-time dimension
;; given by const operand 1 into SI register operand 0.
(define_expand "oacc_dim_size"
  [(match_operand:SI 0 "register_operand")
   (match_operand:SI 1 "const_int_operand")]
  ""
  {
    rtx dim_size = gcn_oacc_dim_size (INTVAL (operands[1]));
    /* The helper's result may be wider than SImode; copy the low part.  */
    rtx lowpart = gen_lowpart (SImode, dim_size);
    emit_move_insn (operands[0], lowpart);
    DONE;
  })
1903
;; Load this element's position within the OpenACC dimension given by
;; const operand 1 into SI register operand 0.
(define_expand "oacc_dim_pos"
  [(match_operand:SI 0 "register_operand")
   (match_operand:SI 1 "const_int_operand")]
  ""
  {
    rtx pos = gcn_oacc_dim_pos (INTVAL (operands[1]));
    emit_move_insn (operands[0], pos);
    DONE;
  })
1912
;; Expand a wavefront barrier.  The volatile BLKmode scratch MEM acts
;; as a full memory clobber so no memory access is moved across the
;; barrier by the optimizers.
(define_expand "gcn_wavefront_barrier"
  [(set (match_dup 0)
	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
  ""
  {
    rtx blk_mem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
    MEM_VOLATILE_P (blk_mem) = 1;
    operands[0] = blk_mem;
  })
1921
;; The insn matched by the gcn_wavefront_barrier expander above: emits
;; a single s_barrier instruction.
(define_insn "*gcn_wavefront_barrier"
  [(set (match_operand:BLK 0 "")
	(unspec_volatile:BLK [(match_dup 0)] UNSPECV_BARRIER))]
  ""
  "s_barrier"
  [(set_attr "type" "sopp")])
1928
;; Placeholder: the middle end requires oacc_fork and oacc_join to be
;; defined as a pair, but on GCN only the join side expands to code.
(define_expand "oacc_fork"
  [(set (match_operand:SI 0 "")
	(match_operand:SI 1 ""))
   (use (match_operand:SI 2 ""))]
  ""
  {
    /* We need to have oacc_fork/oacc_join named patterns as a pair,
       but the fork isn't actually used.  */
    gcc_unreachable ();
  })
1939
;; OpenACC join point; operands 0-2 are ignored and the join expands
;; directly to a wavefront barrier.
(define_expand "oacc_join"
  [(set (match_operand:SI 0 "")
	(match_operand:SI 1 ""))
   (use (match_operand:SI 2 ""))]
  ""
  {
    rtx barrier = gen_gcn_wavefront_barrier ();
    emit_insn (barrier);
    DONE;
  })
1949
1950;; }}}
1951
1952(include "gcn-valu.md")
1953