1dnl  X86-64 mpn_add_n, mpn_sub_n, optimized for Intel Atom.
2
3dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 1.85
25C K10:		 ?
26C P4:		 ?
27C P6-15 (Core2): ?
28C P6-28 (Atom):	 3
29
30C INPUT PARAMETERS
C Symbolic names for the five argument registers.  The instructions visible
C in this file spell the registers out directly instead of using these names.
C   rp  destination limb pointer (stored through)
C   up  first source limb pointer (loaded into registers)
C   vp  second source limb pointer (memory operand of each ADCSBB)
C   n   limb count
C   cy  carry-in or borrow-in; supplied by the caller only for the func_nc
C       entry point, the func entry point clears %r8 itself
31define(`rp',	`%rdi')
32define(`up',	`%rsi')
33define(`vp',	`%rdx')
34define(`n',	`%rcx')
35define(`cy',	`%r8')		C (only for mpn_add_nc and mpn_sub_nc)
36
C Select the instruction and the entry point names for the operation being
C built.  ADCSBB is the add-with-carry (adc) or subtract-with-borrow (sbb)
C instruction, func is the entry point without a carry-in argument and
C func_nc is the entry point with one.  If neither OPERATION_add_n nor
C OPERATION_sub_n is defined, none of the three names gets defined here.
37ifdef(`OPERATION_add_n', `
38	define(ADCSBB,	      adc)
39	define(func,	      mpn_add_n)
40	define(func_nc,	      mpn_add_nc)')
41ifdef(`OPERATION_sub_n', `
42	define(ADCSBB,	      sbb)
43	define(func,	      mpn_sub_n)
44	define(func_nc,	      mpn_sub_nc)')
45
C NOTE(review): MULFUNC_PROLOGUE is defined outside this file - presumably it
C declares which entry points this file can provide; confirm against the m4
C support files.
46MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
47
48ASM_START()
49	TEXT
50	ALIGN(16)
C func_nc: entry point that takes a carry-in (or borrow-in) in %r8.  It jumps
C straight to L(ent) inside func and thereby skips the instruction there that
C clears %r8; everything else is shared with func.
51PROLOGUE(func_nc)
52	jmp	L(ent)
53EPILOGUE()
C func: rp[i] = up[i] ADCSBB vp[i] for i = 0 .. n-1, with the carry (or
C borrow) propagated from each limb to the next.
C
C In:   %rdi = rp, %rsi = up, %rdx = vp, %rcx = n.  When entered at L(ent)
C       from func_nc, %r8 = carry-in (only its low 32 bits are examined).
C Out:  %eax = carry or borrow out of the most significant limb (0 or 1).
C Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, %r11 and the flags.
C
C n must be at least 1.  With n = 0 the code takes the L(b0) path, loads from
C up and decrements %rcx from zero, so the loop does not stop where intended.
C
C Layout: %rcx becomes n / 4 (the loop counter) and n mod 4 selects one of the
C feed-in blocks b0 .. b3.  A feed-in block loads the first up limbs, moves the
C carry-in into CF, biases the pointers and enters the 4-way unrolled loop at
C m0 .. m3.  When n < 4 it instead finishes inline (n = 1) or in the tail at
C e2 / e3.
C
C The carry lives in CF for the whole run.  Between two ADCSBB only mov, lea,
C dec and jumps are executed: mov, lea and jumps leave the flags alone and dec
C does not write CF.  Any reordering or added instruction must keep that
C property.
C
C The carry-in is moved into CF with shr (CF = bit 0) on the b1 / b2 / b3
C paths and with neg (CF = 1 iff nonzero) on the b0 path; the two agree for
C the values 0 and 1.
54PROLOGUE(func)
55	xor	%r8, %r8			C no carry-in for this entry point
56L(ent):
57	mov	R32(%rcx), R32(%rax)
58	shr	$2, %rcx			C rcx = n / 4
59	and	$3, R32(%rax)			C eax = n mod 4
60	jz	L(b0)
61	cmp	$2, R32(%rax)
62	jz	L(b2)
63	jg	L(b3)				C eax is 1 or 3 here; 1 falls into b1
64
65L(b1):	mov	(%rsi), %r10		C n mod 4 = 1: r10 = up[0]
66	test	%rcx, %rcx			C more than one limb?
67	jnz	L(gt1)
68	shr	R32(%r8)			C Set CF from argument
69	ADCSBB	(%rdx), %r10			C n = 1: the only limb
70	mov	%r10, (%rdi)
71	mov	R32(%rcx), R32(%rax)		C zero rax
72	adc	R32(%rax), R32(%rax)		C eax = CF = carry or borrow out
73	ret
74L(gt1):	shr	R32(%r8)		C CF = carry-in
75	ADCSBB	(%rdx), %r10			C limb 0; stored at L(m1)
76	mov	8(%rsi), %r11			C r11 = up[1]; consumed at L(m0)
77	lea	16(%rsi), %rsi			C rsi -> up[2]
78	lea	-16(%rdx), %rdx			C 24(%rdx) is now vp[1]
79	lea	-16(%rdi), %rdi			C 16(%rdi) is now rp[0]
80	jmp	L(m1)
81
82L(b2):	mov	(%rsi), %r9		C n mod 4 = 2: r9 = up[0]
83	mov	8(%rsi), %r10			C r10 = up[1]
84	lea	-8(%rdx), %rdx			C 8(%rdx) is now vp[0]
85	test	%rcx, %rcx			C more than two limbs?
86	jnz	L(gt2)
87	shr	R32(%r8)			C CF = carry-in
88	lea	-40(%rdi), %rdi			C n = 2: 40(%rdi) is now rp[0]
89	jmp	L(e2)
90L(gt2):	shr	R32(%r8)		C CF = carry-in
91	ADCSBB	8(%rdx), %r9			C limb 0; stored at L(m2)
92	mov	16(%rsi), %r11			C r11 = up[2]; consumed at L(m0)
93	lea	-8(%rsi), %rsi			C rsi -> up[3] after the lea at L(m2)
94	lea	-8(%rdi), %rdi			C 8(%rdi) is now rp[0]
95	jmp	L(m2)
96
97L(b3):	mov	(%rsi), %rax		C n mod 4 = 3: rax = up[0]
98	mov	8(%rsi), %r9			C r9 = up[1]
99	mov	16(%rsi), %r10			C r10 = up[2]
100	test	%rcx, %rcx			C more than three limbs?
101	jnz	L(gt3)
102	shr	R32(%r8)			C CF = carry-in
103	lea	-32(%rdi), %rdi			C n = 3: 32(%rdi) is now rp[0]
104	jmp	L(e3)
105L(gt3):	shr	R32(%r8)		C CF = carry-in
106	ADCSBB	(%rdx), %rax			C limb 0; stored at L(m3)
107	jmp	L(m3)				C pointers need no bias for this entry
108
109L(b0):	mov	(%rsi), %r11		C n mod 4 = 0: r11 = up[0]
110	neg	R32(%r8)			C CF = 1 iff carry-in is nonzero
111	lea	-24(%rdx), %rdx			C 24(%rdx) is now vp[0]
112	lea	-24(%rdi), %rdi			C 24(%rdi) is now rp[0]
113	lea	8(%rsi), %rsi			C rsi -> up[1]
114	jmp	L(m0)
115
C Main loop, four limbs per pass.  Each result is stored one step after it is
C computed, and the up limbs for the following pass (or for the tail) are
C loaded at L(m3) and after L(m0).
116	ALIGN(8)
117L(top):	mov	%r11, 24(%rdi)		C store the limb computed at L(m0)
118	ADCSBB	(%rdx), %rax
119	lea	32(%rdi), %rdi			C advance rp by four limbs
120L(m3):	mov	%rax, (%rdi)
121	ADCSBB	8(%rdx), %r9
122	mov	24(%rsi), %r11			C up limb consumed at L(m0)
123L(m2):	mov	%r9, 8(%rdi)
124	ADCSBB	16(%rdx), %r10
125	lea	32(%rsi), %rsi			C advance up by four limbs
126L(m1):	mov	%r10, 16(%rdi)
127L(m0):	ADCSBB	24(%rdx), %r11
128	mov	(%rsi), %rax			C preload up limbs for the next pass
129	mov	8(%rsi), %r9			C or for the tail
130	lea	32(%rdx), %rdx			C advance vp by four limbs
131	dec	%rcx				C dec does not write CF
132	mov	16(%rsi), %r10
133	jnz	L(top)
134
135	mov	%r11, 24(%rdi)			C tail: three limbs pending in rax r9 r10
136L(e3):	ADCSBB	(%rdx), %rax
137	mov	%rax, 32(%rdi)
138L(e2):	ADCSBB	8(%rdx), %r9
139	mov	%r9, 40(%rdi)
140L(e1):	ADCSBB	16(%rdx), %r10		C no jump targets L(e1) in the code shown
141	mov	%r10, 48(%rdi)
142	mov	R32(%rcx), R32(%rax)		C zero rax
143	adc	R32(%rax), R32(%rax)		C eax = CF = carry or borrow out
144	ret
145EPILOGUE()
146