1dnl  AMD64 mpn_add_n, mpn_sub_n optimised for bobcat.
2
3dnl  Copyright 2003-2005, 2007, 2008, 2010-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	    cycles/limb
34C AMD K8,K9	 1.77
35C AMD K10	 1.76\1.82
36C AMD bd1	 1.67\2.12
37C AMD bd2	 1.62\1.82
38C AMD bd3
39C AMD bd4	 1.55\2.2
40C AMD zen
41C AMD bt1	 2.54
42C AMD bt2	 2
43C Intel P4	11
44C Intel PNR	 4.76
45C Intel NHM	 5.27
46C Intel SBR	 2
47C Intel IBR	 1.94
48C Intel HWL	 1.63
49C Intel BWL	 1.51
50C Intel SKL	 1.51
51C Intel atom	 3.56
52C Intel SLM	 4
53C VIA nano
54
55C The loop of this code is the result of running a code generation and
56C optimization tool suite written by David Harvey and Torbjorn Granlund.
57
58C INPUT PARAMETERS
C Symbolic names for the argument registers used by the code below.  The
C register in each define is the one the code actually uses; the trailing
C comment appears to give the matching argument location under the DOS64
C (Microsoft x64) convention -- rcx, rdx, r8, r9, then stack.
C cy is only an input for the _nc entry point; the plain entry point
C zeroes it before use.
59define(`rp',	`%rdi')	C rcx
60define(`up',	`%rsi')	C rdx
61define(`vp',	`%rdx')	C r8
62define(`n',	`%rcx')	C r9
63define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
64
C Select the operation this build of the file provides.  Exactly one of
C OPERATION_add_n / OPERATION_sub_n is expected to be defined from outside
C this file (presumably by the build system -- TODO confirm).  It fixes:
C   ADCSBB   the carry-propagating instruction used for every limb
C   func     name of the entry point without carry-in
C   func_nc  name of the entry point with carry-in
65ifdef(`OPERATION_add_n', `
66	define(ADCSBB,	      adc)
67	define(func,	      mpn_add_n)
68	define(func_nc,	      mpn_add_nc)')
69ifdef(`OPERATION_sub_n', `
70	define(ADCSBB,	      sbb)
71	define(func,	      mpn_sub_n)
72	define(func_nc,	      mpn_sub_nc)')
73
C Lists the four entry points this one source file can produce, depending
C on which OPERATION_* symbol is defined.  MULFUNC_PROLOGUE is defined
C outside this file; its exact effect is not visible here.
74MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
75
C Declares that the code is written to work under both ABIs.  The DOS64
C specific parts are the FUNC_ENTRY/FUNC_EXIT macros and the IFDOS load in
C func_nc (all defined outside this file).
76ABI_SUPPORT(DOS64)
77ABI_SUPPORT(STD64)
78
C func: mpn_add_n or mpn_sub_n, depending on the selected operation.
C
C Computes rp[i] = up[i] ADCSBB vp[i] for i = 0 .. n-1, propagating the
C carry (add) or borrow (sub) through the CPU carry flag, and returns the
C final carry/borrow (0 or 1) in eax.
C
C Register use once the feed-in code has run:
C   rax      limb index into up, vp and rp (scaled by 8 in every address)
C   n (rcx)  remaining passes over the 4-limb loop
C   r8-r11   up limbs currently in flight
C
C CF is live from the neg in the feed-in code until the final adc, so the
C instructions in between are restricted to ones that leave CF untouched:
C mov, lea, dec, jrcxz and the conditional jumps.
C
C n = 0 is not handled: it takes the b00 path with a pass count of 0 and
C the dec at lo3 wraps instead of reaching zero.  Presumably callers
C guarantee n >= 1 -- TODO confirm.
79ASM_START()
80	TEXT
81	ALIGN(16)
82PROLOGUE(func)
C FUNC_ENTRY/FUNC_EXIT are defined outside this file; presumably they are
C the DOS64 argument-register glue and expand to nothing on STD64.
83	FUNC_ENTRY(4)
C Entry without carry-in: clear cy so the neg below produces CF = 0.
84	xor	%r8, %r8
C Shared entry point; func_nc jumps here with the carry-in in r8.
C Dispatch on n mod 4 (bit 0 first, then bit 1) to one of four feed-in
C sequences.  Each one sets the pass count, moves cy into CF, preloads the
C first up limb(s) and enters the loop (or the tail) at the matching point.
85L(ent):	test	$1, R8(n)
86	jnz	L(bx1)
87
88L(bx0):	test	$2, R8(n)
89	jnz	L(b10)
90
C n = 4k.  Enter at lo0: two limbs are handled before the first dec, the
C loop then runs k-1 full passes, and the tail handles the last two limbs.
91L(b00):	shr	$2, n
C neg sets CF exactly when cy is nonzero.  It has to follow the shr, since
C shr also writes CF.
92	neg	%r8
93	mov	$3, R32(%rax)
94	mov	(up), %r10
95	mov	8(up), %r11
96	jmp	L(lo0)
97
C n = 4k+2.  k full passes from top, then the two-limb tail.  r8 is reused
C as a data register once the neg has consumed the carry-in.
98L(b10):	shr	$2, n
99	neg	%r8
100	mov	$1, R32(%rax)
101	mov	(up), %r8
102	mov	8(up), %r9
C k = 0 (n = 2): nothing for the loop to do, go straight to the tail.
C jrcxz tests rcx (= n) without touching the flags.
103	jrcxz	L(cj2)
104	jmp	L(top)
105
106L(bx1):	test	$2, R8(n)
107	jnz	L(b11)
108
C n = 4k+1.  Enter at lo1: three limbs before the first dec, k-1 full
C passes, then the two-limb tail.
109L(b01):	shr	$2, n
110	neg	%r8
C mov rather than xor to zero the index: xor would clear CF.
111	mov	$0, R32(%rax)
112	mov	(up), %r9
C k = 0 (n = 1): only the single-limb tail at cj1 is needed.
113	jrcxz	L(cj1)
114	mov	8(up), %r10
115	jmp	L(lo1)
116
117	ALIGN(8)
C n = 4k+3.  Entering at lo3 handles one limb and then runs the dec that
C normally closes a pass, so the count is k+1 (hence the inc before the
C shift).  k full passes and the two-limb tail follow.
118L(b11):	inc	n
119	shr	$2, n
120	neg	%r8
121	mov	$2, R32(%rax)
122	mov	(up), %r11
123	jmp	L(lo3)
124
125	ALIGN(4)
C Main loop, four limbs per pass.  Loads of upcoming up limbs are
C interleaved with the ADCSBB/store pairs for limbs already loaded.
126L(top):	mov	8(up,%rax,8), %r10
127	ADCSBB	-8(vp,%rax,8), %r8
128	mov	%r8, -8(rp,%rax,8)
129L(lo1):	mov	16(up,%rax,8), %r11
130	ADCSBB	(vp,%rax,8), %r9
C Advance the index by four limbs with lea, which leaves CF alone.  The
C offsets in the rest of the pass are relative to the advanced index.
131	lea	4(%rax), %rax
132	mov	%r9, -32(rp,%rax,8)
133L(lo0):	ADCSBB	-24(vp,%rax,8), %r10
134	mov	%r10, -24(rp,%rax,8)
135L(lo3):	ADCSBB	-16(vp,%rax,8), %r11
C dec sets ZF for the jnz at the end of the pass and does not modify CF.
C The two mov instructions in between do not change any flags.
136	dec	n
137	mov	-8(up,%rax,8), %r8
138	mov	%r11, -16(rp,%rax,8)
C lo2 is not referenced from anywhere in this file.
139L(lo2):	mov	(up,%rax,8), %r9
140	jnz	L(top)
141
C Tail: the last two limbs (already loaded into r8 and r9), or only the
C last one when entered at cj1.
142L(cj2):	ADCSBB	-8(vp,%rax,8), %r8
143	mov	%r8, -8(rp,%rax,8)
144L(cj1):	ADCSBB	(vp,%rax,8), %r9
145	mov	%r9, (rp,%rax,8)
146
C Return the final carry/borrow.  mov (not xor) zeroes eax so that CF
C survives; adc then turns CF into 0 or 1.  adc is used for both the add
C and the sub variant, since after sbb CF holds the borrow.
147	mov	$0, R32(%rax)
148	adc	$0, R32(%rax)
149
150	FUNC_EXIT()
151	ret
152EPILOGUE()
153
C func_nc: mpn_add_nc or mpn_sub_nc, depending on the selected operation.
C
C Same operation as func, but with a carry/borrow input cy.  The only work
C done here is to get cy into r8; control then joins func at its shared
C entry label, where a neg of r8 turns any nonzero cy into CF = 1.
154	ALIGN(16)
155PROLOGUE(func_nc)
156	FUNC_ENTRY(4)
C On DOS64 the fifth argument lives on the stack and is loaded into r8
C here; otherwise cy is already in r8 (see the cy define).  The cy define
C quotes rsp+40 while this load uses 56(%rsp); the extra 16 bytes are
C presumably two registers pushed by FUNC_ENTRY(4) -- TODO confirm against
C the macro definition.  IFDOS is presumably empty on non-DOS64 builds.
157IFDOS(`	mov	56(%rsp), %r8	')
158	jmp	L(ent)
159EPILOGUE()
160