dnl  aors_err2_n.asm revision 1.1.1.1
1dnl  AMD64 mpn_add_err2_n, mpn_sub_err2_n
2
3dnl  Contributed by David Harvey.
4
5dnl  Copyright 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C	     cycles/limb
25C AMD K8,K9	 4.5
26C AMD K10	 ?
27C Intel P4	 ?
28C Intel core2	 6.9
29C Intel corei	 ?
30C Intel atom	 ?
31C VIA nano	 ?
32
33
34C INPUT PARAMETERS
C INPUT PARAMETERS (SysV AMD64: first six arguments in registers, the
C 7th and 8th on the stack above the return address)
define(`rp',	`%rdi')			C result limb vector
define(`up',	`%rsi')			C first source limb vector
define(`vp',	`%rdx')			C second source limb vector
define(`ep',	`%rcx')			C error output: 4 limbs written at end
define(`yp1',	`%r8')			C first y vector, consumed high-to-low
define(`yp2',   `%r9')			C second y vector, consumed high-to-low
define(`n_param',     `8(%rsp)')	C limb count (7th argument, on stack)
define(`cy_param',    `16(%rsp)')	C carry-in, 0 or 1 (8th argument)

define(`cy1',   `%r14')	C carry of 1st limb of a pair, kept as mask (0/-1)
define(`cy2',   `%rax')	C carry of 2nd limb, as mask; %rax so that the final
			C `and $1, %eax' directly forms the return value

define(`n',     `%r10')	C negated limb index, runs from -n up to 0

define(`w',     `%rbx')	C working limb
define(`e1l',	`%rbp')	C (e1h:e1l) = 128-bit error accumulator for yp1
define(`e1h',	`%r11')
define(`e2l',	`%r12')	C (e2h:e2l) = 128-bit error accumulator for yp2
define(`e2h',	`%r13')
54
55
C Select the limb operation and the public symbol name from the
C OPERATION_* macro supplied by the build system; MULFUNC_PROLOGUE
C declares both entry points this one source can be assembled into.
C (Comments stay outside the quoted m4 strings to keep quoting intact.)
ifdef(`OPERATION_add_err2_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_err2_n)')
ifdef(`OPERATION_sub_err2_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_err2_n)')

MULFUNC_PROLOGUE(mpn_add_err2_n mpn_sub_err2_n)
64
65
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	C mp_limb_t func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_ptr ep,
	C                 mp_srcptr yp1, mp_srcptr yp2, mp_size_t n,
	C                 mp_limb_t cy)   -- argument order per the defines above
	C
	C Computes rp[i] = up[i] +- vp[i] with carry/borrow propagation
	C (carry-in cy), and whenever limb i generates a carry-out, adds
	C yp1[n-1-i] into (e1h:e1l) and yp2[n-1-i] into (e2h:e2l); the y
	C vectors are thus walked downward while u/v/r are walked upward.
	C The four accumulator limbs are stored at ep[0..3] and the final
	C carry (0 or 1) is returned in %rax.
	mov	cy_param, cy2		C carry-in lands in %rax (return reg)
	mov	n_param, n		C stack args read before the pushes
					C below move %rsp

	push	%rbx			C save the five callee-saved regs used
	push	%rbp
	push	%r12
	push	%r13
	push	%r14

	xor	R32(e1l), R32(e1l)	C clear both 128-bit error accumulators
	xor	R32(e1h), R32(e1h)
	xor	R32(e2l), R32(e2l)
	xor	R32(e2h), R32(e2h)

	sub	yp1, yp2		C yp2 := byte offset yp2 - yp1, so the
					C yp2 vector is reached as (yp1,yp2)
					C with a single moving pointer (yp1)

	lea	(rp,n,8), rp		C point rp/up/vp one past their ends;
	lea	(up,n,8), up		C the index n then runs from -n to 0
	lea	(vp,n,8), vp

	test	$1, n
	jnz	L(odd)			C peel one limb when n is odd so the
					C main loop can do two limbs per pass

	lea	-8(yp1,n,8), yp1	C even n: yp1 := &yp1[n-1], the first
					C (highest) y limb the loop consumes
	neg	n
	jmp	L(top)			C cy2 still holds carry-in 0/1; shr in
					C the loop moves its bit 0 into CF

	ALIGN(16)
L(odd):
	lea	-16(yp1,n,8), yp1	C yp1 := &yp1[n-2] for the loop; the
	neg	n			C peeled limb uses 8(yp1) = &yp1[n-1]
	shr	$1, cy2			C move carry-in bit 0 into CF
	mov	(up,n,8), w
	ADCSBB	(vp,n,8), w		C w = up[0] +- vp[0] +- carry-in
	cmovc	8(yp1), e1l		C on carry-out, seed the accumulators
	cmovc	8(yp1,yp2), e2l		C with yp1[n-1] resp. yp2[n-1]
	mov	w, (rp,n,8)
	sbb	cy2, cy2		C capture CF as full-width mask (0/-1)
	inc	n			C inc preserves CF-independent path;
	jz	L(end)			C n was 1: nothing left for the loop

	ALIGN(16)
L(top):
	C Two limbs per iteration.  Between limbs the carry lives in a
	C register as bit 0 (mask -1 or value 1); `shr $1' puts it back in
	C CF and `sbb reg,reg' re-captures CF as a 0/-1 mask, which then
	C gates the y limbs into the error accumulators via `and'.
        mov     (up,n,8), w
	shr     $1, cy2         C restore carry
	ADCSBB  (vp,n,8), w
	mov     w, (rp,n,8)
	sbb     cy1, cy1        C generate mask, preserve CF

	mov     8(up,n,8), w
	ADCSBB  8(vp,n,8), w	C second limb continues the carry chain
	mov     w, 8(rp,n,8)
	sbb     cy2, cy2        C generate mask, preserve CF

	mov     (yp1), w	C (e1h:e1l) += cy1 * yp1 limb
	and     cy1, w
	add     w, e1l
	adc     $0, e1h

	and     (yp1,yp2), cy1	C (e2h:e2l) += cy1 * yp2 limb
				C (cy1 is dead after this, so clobber it)
	add     cy1, e2l
	adc     $0, e2h

	mov     -8(yp1), w	C (e1h:e1l) += cy2 * next yp1 limb
	and     cy2, w
	add     w, e1l
	adc     $0, e1h

	mov     -8(yp1,yp2), w	C (e2h:e2l) += cy2 * next yp2 limb
	and     cy2, w
	add     w, e2l
	adc     $0, e2h

	add     $2, n
	lea     -16(yp1), yp1	C step the y pointer down two limbs
	jnz     L(top)
L(end):

	mov	e1l, (ep)	C ep[0..1] = first 128-bit error sum
	mov	e1h, 8(ep)
	mov	e2l, 16(ep)	C ep[2..3] = second 128-bit error sum
	mov	e2h, 24(ep)

	and	$1, %eax	C return carry: cy2 mask (0/-1) -> 0/1

	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
162