aors_err1_n.asm revision 1.1.1.1
1dnl  AMD64 mpn_add_err1_n, mpn_sub_err1_n
2
3dnl  Contributed by David Harvey.
4
5dnl  Copyright 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb
25C AMD K8,K9	 2.75 (most alignments, degenerates to 3 c/l for some aligments)
26C AMD K10	 ?
27C Intel P4	 ?
28C Intel core2	 ?
29C Intel corei	 ?
30C Intel atom	 ?
31C VIA nano	 ?
32
33
C INPUT PARAMETERS
C (SysV AMD64: first six arguments in registers, 7th on the stack)
define(`rp',	`%rdi')	C result limb pointer
define(`up',	`%rsi')	C first source limb pointer
define(`vp',	`%rdx')	C second source limb pointer
define(`ep',	`%rcx')	C error output, 2 limbs are stored here
define(`yp',	`%r8')	C error weight limbs, read top-down (see body)
define(`n',	`%r9')	C limb count
define(`cy_param',	`8(%rsp)')	C carry/borrow-in, 7th arg on the stack

C Scratch registers.  eh:el is a 2-limb accumulator; all six registers
C below are callee-saved and are pushed/popped in the function body.
define(`el',	`%rbx')	C error accumulator, low limb
define(`eh',	`%rbp')	C error accumulator, high limb
define(`t0',	`%r10')	C t0..t3 hold selected yp limbs (0 if no carry-out)
define(`t1',	`%r11')
define(`t2',	`%r12')
define(`t3',	`%r13')
define(`w0',	`%r14')	C working limbs for the add/sub proper
define(`w1',	`%r15')
51
C Build-time selection: this one source file is assembled twice, once per
C operation.  ADCSBB becomes adc (add variant) or sbb (subtract variant),
C and func becomes the exported symbol name.
ifdef(`OPERATION_add_err1_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_err1_n)')
ifdef(`OPERATION_sub_err1_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_err1_n)')

MULFUNC_PROLOGUE(mpn_add_err1_n mpn_sub_err1_n)
60
61
C  The function assembled here (func) is mpn_add_err1_n or mpn_sub_err1_n,
C  with ADCSBB = adc or sbb respectively:
C
C	limb func (ptr rp, srcptr up, srcptr vp,
C	           ptr ep, srcptr yp, size_t n, limb cy)
C
C  It computes rp[i] = up[i] ADCSBB vp[i] for i = 0..n-1 with carry-in cy,
C  returning the final carry/borrow in %rax (valid for cy in {0,1}).
C  Whenever limb i produces a carry-out, a yp limb is added into the
C  2-limb accumulator eh:el, which is stored to {ep, 2} at the end.
C  The yp vector is indexed from its top end downwards, so limb 0 pairs
C  with the highest yp address touched.
C  NOTE(review): the exact yp<->limb pairing is inferred from the
C  reversed yp addressing below; confirm against the err1_n contract.
C
C  The carry flag must survive across instruction groups that clobber
C  flags, so it is parked in bit 0 of %al: setc %al saves CF, and
C  shr $1, %al restores it.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	mov	cy_param, %rax		C carry-in (7th arg, on the stack)

	push	%rbx			C save the six callee-saved regs
	push	%rbp			C used as scratch (see defines above)
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	lea	(up,n,8), up		C point up/vp/rp past their ends so a
	lea	(vp,n,8), vp		C negated n can serve as an index that
	lea	(rp,n,8), rp		C counts up towards zero

	mov	R32(n), R32(%r10)	C dispatch on n mod 4 so the main loop
	and	$3, R32(%r10)		C can be 4x unrolled
	jz	L(0mod4)
	cmp	$2, R32(%r10)
	jc	L(1mod4)
	jz	L(2mod4)

C Peeled code for n = 3 (mod 4): handle 3 limbs, then fall into the loop.
L(3mod4):
	xor	R32(el), R32(el)	C clear error accumulator eh:el
	xor	R32(eh), R32(eh)
	xor	R32(t0), R32(t0)	C t0/t1 stay 0 unless a carry-out
	xor	R32(t1), R32(t1)	C selects a yp limb via cmovc below
	lea	-24(yp,n,8), yp		C yp -> third-from-top yp limb
	neg	n			C n = -(limb count)

        shr     $1, %al            C restore carry
        mov     (up,n,8), w0
        mov     8(up,n,8), w1
        ADCSBB  (vp,n,8), w0
	mov	w0, (rp,n,8)
	cmovc	16(yp), el		C el = yp limb on carry-out, else 0
        ADCSBB  8(vp,n,8), w1
	mov	w1, 8(rp,n,8)
	cmovc	8(yp), t0
        mov     16(up,n,8), w0
        ADCSBB  16(vp,n,8), w0
	mov	w0, 16(rp,n,8)
	cmovc	(yp), t1
	setc	%al                C save carry
	add	t0, el			C fold selected yp limbs into eh:el
	adc	$0, eh
	add	t1, el
	adc	$0, eh

	add	$3, n			C 3 limbs consumed
	jnz	L(loop)
	jmp	L(end)

	ALIGN(16)
C n = 0 (mod 4): no peeling needed, go straight to the unrolled loop.
L(0mod4):
	xor	R32(el), R32(el)	C clear error accumulator eh:el
	xor	R32(eh), R32(eh)
	lea	(yp,n,8), yp		C yp -> one past the top yp limb
	neg	n			C n = -(limb count)
	jmp	L(loop)

	ALIGN(16)
C n = 1 (mod 4): handle 1 limb, then fall into the loop.
L(1mod4):
	xor	R32(el), R32(el)	C clear error accumulator eh:el
	xor	R32(eh), R32(eh)
	lea	-8(yp,n,8), yp		C yp -> top yp limb
	neg	n			C n = -(limb count)

        shr     $1, %al            C restore carry
        mov     (up,n,8), w0
        ADCSBB  (vp,n,8), w0
        mov     w0, (rp,n,8)
	cmovc	(yp), el		C el = yp limb on carry-out, else 0
	setc	%al                C save carry

	add	$1, n			C 1 limb consumed
	jnz	L(loop)
	jmp	L(end)

	ALIGN(16)
C n = 2 (mod 4): handle 2 limbs, then fall into the loop.
L(2mod4):
	xor	R32(el), R32(el)	C clear error accumulator eh:el
	xor	R32(eh), R32(eh)
	xor	R32(t0), R32(t0)	C t0 stays 0 unless cmovc fires
	lea	-16(yp,n,8), yp		C yp -> second-from-top yp limb
	neg	n			C n = -(limb count)

        shr     $1, %al            C restore carry
        mov     (up,n,8), w0
        mov     8(up,n,8), w1
        ADCSBB  (vp,n,8), w0
        mov     w0, (rp,n,8)
	cmovc	8(yp), el		C el = yp limb on carry-out, else 0
        ADCSBB  8(vp,n,8), w1
        mov     w1, 8(rp,n,8)
	cmovc	(yp), t0
	setc	%al                C save carry
	add	t0, el			C fold selected yp limb into eh:el
	adc	$0, eh

	add	$2, n			C 2 limbs consumed
	jnz	L(loop)
	jmp	L(end)

	ALIGN(32)
C Main 4x unrolled loop.  yp limbs are preloaded into t0..t2 and zeroed
C with cmovnc (t3 holds 0 at that point) when the matching limb produced
C no carry; the fourth limb works the other way round, conditionally
C loading into t3 which starts as 0.  The four survivors are then summed
C into eh:el.  Note CF is live from shr until the last ADCSBB, so only
C flag-preserving instructions (mov, lea, cmov) appear in between.
L(loop):
        shr     $1, %al            C restore carry
        mov     -8(yp), t0
        mov     $0, R32(t3)        C t3 = 0 (mov, not xor: CF is live)
        mov     (up,n,8), w0
        mov     8(up,n,8), w1
        ADCSBB  (vp,n,8), w0
        cmovnc  t3, t0             C drop yp limb if no carry-out
        ADCSBB  8(vp,n,8), w1
        mov     -16(yp), t1
        mov     w0, (rp,n,8)
        mov     16(up,n,8), w0
        mov     w1, 8(rp,n,8)
        cmovnc  t3, t1
        mov     -24(yp), t2
        ADCSBB  16(vp,n,8), w0
        cmovnc  t3, t2
        mov     24(up,n,8), w1
        ADCSBB  24(vp,n,8), w1
        cmovc   -32(yp), t3        C inverse scheme for the 4th limb
        setc    %al                C save carry
        add     t0, el             C eh:el += t0 + t1 + t2 + t3
        adc     $0, eh
        add     t1, el
        adc     $0, eh
        add     t2, el
        adc     $0, eh
        mov     w0, 16(rp,n,8)
        add     t3, el
        lea     -32(yp), yp        C step yp down 4 limbs
        adc     $0, eh
        mov     w1, 24(rp,n,8)
        add     $4, n
        jnz     L(loop)

L(end):
	mov	el, (ep)		C store the 2-limb error total
	mov	eh, 8(ep)

	pop	%r15			C restore callee-saved regs
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret				C carry/borrow-out in %rax
EPILOGUE()
215