1dnl  Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
2
3dnl  Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	    cycles/limb
34C AMD K8,K9	 2
35C AMD K10	 1.93\2
36C AMD bull	 1.62\2.1
37C AMD pile	 1.6\1.7
38C AMD steam
39C AMD excavator
40C AMD bobcat	 2.79
41C AMD jaguar	 2.54
42C Intel P4	10
43C Intel core2	 2
44C Intel NHM	 2
45C Intel SBR	 2
46C Intel IBR	 1.95
47C Intel HWL	 1.72
48C Intel BWL	 1.54
49C Intel SKL	 1.52
50C Intel atom	 9
51C Intel SLM	 6.5
52C VIA nano	 3
53
54C INPUT PARAMETERS
C m4 names for the argument registers.  In the code that follows rp is only
C stored through, up and vp are only loaded from, and n scales every index
C by 8.  cy names the register holding the carry-in for the _nc entry point;
C the instructions in this file refer to that register directly as %r8.
55define(`rp',	`%rdi')
56define(`up',	`%rsi')
57define(`vp',	`%rdx')
58define(`n',	`%rcx')
59define(`cy',	`%r8')
60
C Operation selection.  Each ifdef defines ADCSBB (the carry-propagating
C instruction used on every limb) together with func and func_nc (the names
C of the two entry points).  If neither OPERATION_* symbol is defined these
C three names stay undefined.
C NOTE(review): presumably the build defines exactly one of the two symbols
C per object file -- confirm.
61ifdef(`OPERATION_add_n', `
62	define(ADCSBB,	      adc)
63	define(func,	      mpn_add_n)
64	define(func_nc,	      mpn_add_nc)')
65ifdef(`OPERATION_sub_n', `
66	define(ADCSBB,	      sbb)
67	define(func,	      mpn_sub_n)
68	define(func_nc,	      mpn_sub_nc)')
69
C NOTE(review): the macros used below are defined outside this file.  The
C first appears to list every entry point this source can provide (both the
C add and the sub pair, although one preprocessing run defines only one
C pair); the other two appear to declare the supported ABIs -- confirm in
C the build machinery before editing these lines.
70MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
71
72ABI_SUPPORT(DOS64)
73ABI_SUPPORT(STD64)
74
C -----------------------------------------------------------------------
C func(rp, up, vp, n)
C
C For i = 0 .. n-1 computes up[i] + vp[i] (ADCSBB = adc) or up[i] - vp[i]
C (ADCSBB = sbb), propagating the carry/borrow from limb to limb, and stores
C the result limb to rp[i].  The code from L(start) on is also the body of
C func_nc, which jumps in with its carry-in already in r8.
C
C In:	rp, up, vp, n in the registers given by the defines earlier in this
C	file.  Limbs are 8 bytes: every index is scaled by 8.
C Out:	eax = carry (or borrow) out of the last limb, 0 or 1.
C Modifies (as visible here): rax, rcx, rdx, rsi, rdi, r8-r11, flags.
C
C The first limb pair is loaded unconditionally and L(end) always stores a
C limb, so n = 0 is not handled by this code.
C NOTE(review): FUNC_ENTRY, FUNC_EXIT and R32 are defined outside this file;
C presumably FUNC_ENTRY/FUNC_EXIT adapt the DOS64 calling convention --
C confirm.
C
C Structure: a 4-way unrolled loop.  n mod 4 selects one of four entry stubs
C (b00, b01, b10, b11); each stub turns the carry-in into CF and jumps to the
C matching point inside the loop.  From the stub up to L(end) the only
C instruction that writes the flags is ADCSBB (mov, lea, jmp and jrcxz leave
C them alone), which is what lets CF carry from one limb to the next.
C -----------------------------------------------------------------------
75ASM_START()
76	TEXT
77	ALIGN(16)
78PROLOGUE(func)
79	FUNC_ENTRY(4)
80	xor	%r8, %r8		C plain entry: carry-in = 0
81L(start):				C func_nc joins here with its carry-in in r8
82	mov	(up), %r10		C r10 = up[0]
83	mov	(vp), %r11		C r11 = vp[0]
84
C Point up, vp and rp one past their last limb and negate n, so that limbs
C are addressed as (ptr,n,8) with a negative n that counts up towards zero.
85	lea	(up,n,8), up
86	lea	(vp,n,8), vp
87	lea	(rp,n,8), rp
88	mov	R32(n), R32(%rax)	C copy taken before n is negated
89	neg	n
90	and	$3, R32(%rax)		C eax = n mod 4; its ZF is what je tests
91	je	L(b00)			C n mod 4 = 0
92	add	%rax, n			C clear low rcx bits for jrcxz
93	cmp	$2, R32(%rax)
94	jl	L(b01)			C n mod 4 = 1
95	je	L(b10)			C n mod 4 = 2
96
C Fall through: n mod 4 = 3.  L(e11) expects the pending pair in r10/r11,
C which is where the first limbs already are.
97L(b11):	neg	%r8			C CF = (carry-in != 0)
98	jmp	L(e11)
99
C n mod 4 = 0.  L(e00) expects the pending pair in r8/r9, and entering there
C skips the n += 4 done at the top of the loop, so both are made up for here.
100L(b00):	neg	%r8			C CF = (carry-in != 0)
101	mov	%r10, %r8		C r8 no longer needed as carry-in; mov keeps CF
102	mov	%r11, %r9
103	lea	4(n), n
104	jmp	L(e00)
105
C NOTE(review): these three nops are never executed (they follow an
C unconditional jmp and carry no label); presumably padding that positions
C the code after them -- confirm before removing.
106	nop
107	nop
108	nop
C n mod 4 = 1.  L(top) expects the pending pair in r10/r11.
109L(b01):	neg	%r8			C CF = (carry-in != 0)
110	jmp	L(top)
111
C n mod 4 = 2.  L(e10) expects the pending pair in r8/r9.
112L(b10):	neg	%r8			C CF = (carry-in != 0)
113	mov	%r10, %r8
114	mov	%r11, %r9
115	jmp	L(e10)
116
C Exit, reached only from the jrcxz at L(top), i.e. with rcx = 0.  r10/r11
C hold the last limb pair, still to be combined.
117L(end):	ADCSBB	%r11, %r10
118	mov	%r10, -8(rp)		C rp is one past the end: this is the last limb
119	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
120	adc	R32(%rax), R32(%rax)	C eax = 0 + 0 + CF; plain adc in both variants
121	FUNC_EXIT()
122	ret
123
C Main loop, four limbs per pass.  On arrival at L(top): r10/r11 hold the
C pending limb pair (loaded but not yet combined), CF holds the running
C carry/borrow, and n is zero exactly when that pending pair is the last.
C Each step loads the next pair before combining and storing the current
C one, alternating between the r8/r9 and r10/r11 register pairs.
124	ALIGN(16)
125L(top):	jrcxz	L(end)			C tests rcx only; flags neither read nor written
126	mov	(up,n,8), %r8
127	mov	(vp,n,8), %r9
128	lea	4(n), n			C n += 4 via lea so that CF survives
129	ADCSBB	%r11, %r10
130	mov	%r10, -40(rp,n,8)	C offsets below are relative to the advanced n
131L(e00):	mov	-24(up,n,8), %r10
132	mov	-24(vp,n,8), %r11
133	ADCSBB	%r9, %r8
134	mov	%r8, -32(rp,n,8)
135L(e11):	mov	-16(up,n,8), %r8
136	mov	-16(vp,n,8), %r9
137	ADCSBB	%r11, %r10
138	mov	%r10, -24(rp,n,8)
139L(e10):	mov	-8(up,n,8), %r10
140	mov	-8(vp,n,8), %r11
141	ADCSBB	%r9, %r8
142	mov	%r8, -16(rp,n,8)
143	jmp	L(top)
144EPILOGUE()
145
C func_nc(rp, up, vp, n, cy)
C
C Same operation as func, but starting from a caller-supplied carry-in
C instead of zero.  This entry contains no arithmetic of its own: it jumps to
C L(start) inside func, which expects the carry-in in r8 and treats any
C non-zero value as a carry of 1 (it is converted to CF with neg).
C NOTE(review): in the IFDOS case r8 is loaded from 56(%rsp); presumably that
C is the stack slot of the fifth argument once FUNC_ENTRY has run -- confirm
C against the FUNC_ENTRY definition.  In the non-DOS case nothing is loaded
C here, so the carry-in is taken to be in r8 already on entry.
146PROLOGUE(func_nc)
147	FUNC_ENTRY(4)
148IFDOS(`	mov	56(%rsp), %r8	')
149	jmp	L(start)
150EPILOGUE()
151