1dnl  AMD64 mpn_add_err1_n, mpn_sub_err1_n
2
3dnl  Contributed by David Harvey.
4
5dnl  Copyright 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	     cycles/limb
36C AMD K8,K9	 2.75 (degenerates to 3 c/l for some alignments)
37C AMD K10	 ?
38C Intel P4	 ?
39C Intel core2	 ?
40C Intel corei	 ?
41C Intel atom	 ?
42C VIA nano	 ?
43
44
45C INPUT PARAMETERS
C The first six arguments are named after the registers they arrive in.  The
C seventh, cy, is read from the stack slot just above the return address, so
C cy_param is only meaningful while %rsp still has its value from entry.
46define(`rp',	`%rdi')
47define(`up',	`%rsi')
48define(`vp',	`%rdx')
49define(`ep',	`%rcx')
50define(`yp',	`%r8')
51define(`n',	`%r9')
52define(`cy_param',	`8(%rsp)')
53
C Working registers used by the function body:
C   el, eh   low and high word of the error accumulator stored through ep
C   t0-t3    yp limbs (or zero) selected by the carry out of each limb
C   w0, w1   result limbs on their way from up/vp to rp
C %rbx, %rbp and %r12-%r15 are callee-saved; the function pushes them on entry
C and pops them before returning.
54define(`el',	`%rbx')
55define(`eh',	`%rbp')
56define(`t0',	`%r10')
57define(`t1',	`%r11')
58define(`t2',	`%r12')
59define(`t3',	`%r13')
60define(`w0',	`%r14')
61define(`w1',	`%r15')
62
C ADCSBB and func are selected by whichever OPERATION_ symbol is defined:
C adc with mpn_add_err1_n for the add variant, sbb with mpn_sub_err1_n for the
C subtract variant.  The rest of the file is shared between the two.
63ifdef(`OPERATION_add_err1_n', `
64	define(ADCSBB,	      adc)
65	define(func,	      mpn_add_err1_n)')
66ifdef(`OPERATION_sub_err1_n', `
67	define(ADCSBB,	      sbb)
68	define(func,	      mpn_sub_err1_n)')
69
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably it
C declares the entry points this source can be built as -- confirm in the
C build macros.
70MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
71
72
C func(rp, up, vp, ep, yp, n, cy) -- mpn_add_err1_n or mpn_sub_err1_n
C
C Computes {rp,n} = {up,n} ADCSBB {vp,n} one limb at a time, starting from the
C carry (or borrow) held in bit 0 of cy.  Whenever limb i produces a carry out,
C yp[n-1-i] is added into a two-word accumulator el:eh.  On exit the
C accumulator is stored to ep[0] (low) and ep[1] (high) and the final carry is
C left in %al.
C
C The limb count is handled as (n mod 4) leading limbs, done by one of the four
C entry paths, followed by a loop that is unrolled four ways.
C
C NOTE(review): n = 0 takes the 0mod4 path and enters the loop with a zero
C index, so this code appears to require n >= 1 -- confirm against callers.
C NOTE(review): only %al is rewritten by shr/setc; bits 8-63 of %rax keep what
C was loaded from cy_param, so cy is presumably always 0 or 1 -- confirm.
73ASM_START()
74	TEXT
75	ALIGN(16)
76PROLOGUE(func)
C Fetch cy first: cy_param is an offset from the entry %rsp, and the pushes
C below move %rsp.
77	mov	cy_param, %rax
78
C Save the callee-saved registers that serve as el, eh, t2, t3, w0 and w1.
79	push	%rbx
80	push	%rbp
81	push	%r12
82	push	%r13
83	push	%r14
84	push	%r15
85
C Point up, vp and rp just past their last limb.  Once n has been negated they
C are addressed as (ptr,n,8) with n counting upwards to zero.
86	lea	(up,n,8), up
87	lea	(vp,n,8), vp
88	lea	(rp,n,8), rp
89
C Dispatch on n mod 4.  The and leaves ZF set for 0; after the cmp against 2,
C CF means 1 and ZF means 2; a remainder of 3 falls through.
90	mov	R32(n), R32(%r10)
91	and	$3, R32(%r10)
92	jz	L(0mod4)
93	cmp	$2, R32(%r10)
94	jc	L(1mod4)
95	jz	L(2mod4)
C Three leading limbs.  el, t0 and t1 start at zero; each cmovc loads the
C matching yp limb only if that limb position carried out.
96L(3mod4):
97	xor	R32(el), R32(el)
98	xor	R32(eh), R32(eh)
99	xor	R32(t0), R32(t0)
100	xor	R32(t1), R32(t1)
C yp is moved to the original yp[n-3], so 16(yp), 8(yp), (yp) are the limbs
C paired with result limbs 0, 1, 2 and the loop continues downwards from there.
101	lea	-24(yp,n,8), yp
102	neg	n
103
104	shr	$1, %al		   C restore carry
105	mov	(up,n,8), w0
106	mov	8(up,n,8), w1
107	ADCSBB	(vp,n,8), w0
108	mov	w0, (rp,n,8)
109	cmovc	16(yp), el
110	ADCSBB	8(vp,n,8), w1
111	mov	w1, 8(rp,n,8)
112	cmovc	8(yp), t0
113	mov	16(up,n,8), w0
114	ADCSBB	16(vp,n,8), w0
115	mov	w0, 16(rp,n,8)
116	cmovc	(yp), t1
117	setc	%al		   C save carry
C The carry is parked in %al because the accumulation below overwrites CF.
118	add	t0, el
119	adc	$0, eh
120	add	t1, el
121	adc	$0, eh
122
C Step past the three limbs just done; a zero index means nothing is left.
123	add	$3, n
124	jnz	L(loop)
125	jmp	L(end)
126
127	ALIGN(16)
C No leading limbs: clear the accumulator, point yp just past its last limb
C and go straight to the loop.
128L(0mod4):
129	xor	R32(el), R32(el)
130	xor	R32(eh), R32(eh)
131	lea	(yp,n,8), yp
132	neg	n
133	jmp	L(loop)
134
135	ALIGN(16)
C One leading limb.  Its yp partner is the original yp[n-1], loaded straight
C into el on carry, so no separate add into the accumulator is needed.
136L(1mod4):
137	xor	R32(el), R32(el)
138	xor	R32(eh), R32(eh)
139	lea	-8(yp,n,8), yp
140	neg	n
141
142	shr	$1, %al		   C restore carry
143	mov	(up,n,8), w0
144	ADCSBB	(vp,n,8), w0
145	mov	w0, (rp,n,8)
146	cmovc	(yp), el
147	setc	%al		   C save carry
148
149	add	$1, n
150	jnz	L(loop)
151	jmp	L(end)
152
153	ALIGN(16)
C Two leading limbs.  Same scheme as the three-limb path, with el and t0
C receiving the original yp[n-1] and yp[n-2] on carry.
154L(2mod4):
155	xor	R32(el), R32(el)
156	xor	R32(eh), R32(eh)
157	xor	R32(t0), R32(t0)
158	lea	-16(yp,n,8), yp
159	neg	n
160
161	shr	$1, %al		   C restore carry
162	mov	(up,n,8), w0
163	mov	8(up,n,8), w1
164	ADCSBB	(vp,n,8), w0
165	mov	w0, (rp,n,8)
166	cmovc	8(yp), el
167	ADCSBB	8(vp,n,8), w1
168	mov	w1, 8(rp,n,8)
169	cmovc	(yp), t0
170	setc	%al		   C save carry
171	add	t0, el
172	adc	$0, eh
173
174	add	$2, n
175	jnz	L(loop)
176	jmp	L(end)
177
178	ALIGN(32)
C Main loop, four limbs per iteration; yp walks downwards while n walks up.
C CF is live from the shr until the last ADCSBB, so everything placed between
C the ADCSBBs is a mov or cmov, neither of which modifies flags.
179L(loop):
180	shr	$1, %al		   C restore carry
181	mov	-8(yp), t0
C t3 is zeroed with mov rather than xor because xor would destroy the CF that
C was just restored.
182	mov	$0, R32(t3)
183	mov	(up,n,8), w0
184	mov	8(up,n,8), w1
185	ADCSBB	(vp,n,8), w0
C t0, t1, t2 are preloaded with their yp limb and replaced by zero (t3) when
C the corresponding limb did not carry out.
186	cmovnc	t3, t0
187	ADCSBB	8(vp,n,8), w1
188	mov	-16(yp), t1
189	mov	w0, (rp,n,8)
190	mov	16(up,n,8), w0
191	mov	w1, 8(rp,n,8)
192	cmovnc	t3, t1
193	mov	-24(yp), t2
194	ADCSBB	16(vp,n,8), w0
195	cmovnc	t3, t2
196	mov	24(up,n,8), w1
197	ADCSBB	24(vp,n,8), w1
C t3 works the other way round: it stays zero unless the fourth limb carried,
C in which case it picks up the fourth yp limb.
198	cmovc	-32(yp), t3
199	setc	%al		   C save carry
C Fold the four selected limbs into el:eh.  These adds overwrite CF, which is
C why the carry was saved in %al.  The mov and lea interleaved here do not
C touch flags, so each add/adc pair still sees its own carry.
200	add	t0, el
201	adc	$0, eh
202	add	t1, el
203	adc	$0, eh
204	add	t2, el
205	adc	$0, eh
206	mov	w0, 16(rp,n,8)
207	add	t3, el
208	lea	-32(yp), yp
209	adc	$0, eh
210	mov	w1, 24(rp,n,8)
211	add	$4, n
212	jnz	L(loop)
213
C Store the accumulator: low word to ep[0], high word to ep[1].  %al still
C holds the final carry.
214L(end):
215	mov	el, (ep)
216	mov	eh, 8(ep)
217
C Restore the callee-saved registers in the reverse of the push order.
218	pop	%r15
219	pop	%r14
220	pop	%r13
221	pop	%r12
222	pop	%rbp
223	pop	%rbx
224	ret
225EPILOGUE()
226