1dnl  AMD64 mpn_add_n, mpn_sub_n
2
3dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010 Free Software Foundation,
4dnl  Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C	     cycles/limb
24C AMD K8,K9	 1.5
25C AMD K10	 1.5
26C Intel P4	 ?
27C Intel core2 	 4.9
28C Intel corei	 ?
29C Intel atom	 4
30C VIA nano	 3.25
31
32C The inner loop of this code is the result of running a code generation and
33C optimization tool suite written by David Harvey and Torbjorn Granlund.
34
C INPUT PARAMETERS
C  rp	pointer to the destination limb vector
C  up	pointer to the first source limb vector
C  vp	pointer to the second source limb vector
C  n	number of limbs (in %rcx, so jrcxz can test it)
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cy',	`%r8')		C (only for mpn_add_nc)

C Select add-with-carry or subtract-with-borrow, and the matching public
C entry-point names, depending on which operation this file is being
C assembled for.  ADCSBB is used for every limb operation below.
ifdef(`OPERATION_add_n', `
	define(ADCSBB,	      adc)
	define(func,	      mpn_add_n)
	define(func_nc,	      mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
	define(ADCSBB,	      sbb)
	define(func,	      mpn_sub_n)
	define(func_nc,	      mpn_sub_nc)')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
52
ASM_START()
	TEXT
	ALIGN(16)
C func_nc (mpn_add_nc / mpn_sub_nc): same as func below, but takes an
C explicit carry/borrow input in cy (%r8).  It seeds CF from cy and then
C falls through into func's shared code at L(mid) (or L(lt4) for n < 4).
C NOTE(review): this relies on func being assembled immediately after it.
PROLOGUE(func_nc)
	mov	R32(n), R32(%rax)	C rax = n
	shr	$2, n			C n = number of 4-limb groups
	and	$3, R32(%rax)		C rax = n mod 4 (tail limb count)
	bt	$0, %r8			C cy flag <- carry parameter
	jrcxz	L(lt4)			C jrcxz tests rcx without touching CF

	mov	(up), %r8		C preload first two limbs; cy in %r8
	mov	8(up), %r9		C   is dead now, register reused
	dec	n			C dec preserves CF
	jmp	L(mid)			C enter pipelined loop in func below

EPILOGUE()
	ALIGN(16)
C func (mpn_add_n / mpn_sub_n): rp[0..n-1] = up[] op vp[], returning the
C final carry/borrow (0 or 1) in rax.  The main loop handles 4 limbs per
C iteration; the n mod 4 leftover limbs are handled by L(lt4)/L(2)/L(3).
C Invariant everywhere below: CF holds the running carry/borrow, so only
C CF-preserving instructions (dec, inc, lea, mov, jrcxz) appear between
C ADCSBB chains.
PROLOGUE(func)
	mov	R32(n), R32(%rax)	C rax = n
	shr	$2, n			C n = number of 4-limb groups
	and	$3, R32(%rax)		C rax = n mod 4; also clears CF
					C   (no carry-in for this entry point)
	jrcxz	L(lt4)			C no full group: tail code only

	mov	(up), %r8		C preload first two limbs
	mov	8(up), %r9
	dec	n			C dec preserves CF
	jmp	L(mid)			C enter loop with loads pipelined

C Tail: process the remaining 1..3 limbs.  rax holds the remaining count
C and is counted down to 0, so the final "adc %eax,%eax" yields just CF.
L(lt4):	dec	R32(%rax)
	mov	(up), %r8
	jnz	L(2)			C more than 1 limb left?
	ADCSBB	(vp), %r8
	mov	%r8, (rp)
	adc	%eax, %eax		C rax was 0 here, so rax = CF out
	ret

L(2):	dec	R32(%rax)
	mov	8(up), %r9
	jnz	L(3)			C more than 2 limbs left?
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	adc	%eax, %eax		C rax was 0 here, so rax = CF out
	ret

L(3):	mov	16(up), %r10
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	setc	R8(%rax)		C rax = CF out (rax is nonzero junk here)
	ret

C Main loop: 4 limbs/iteration, software-pipelined.  The loads for the
C next iteration's %r8..%r11 are issued before the loop-back test; L(mid)
C is the entry point at which the first two loads are already done.
	ALIGN(16)
L(top):	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	mov	%r8, (rp)
	lea	32(up), up		C lea advances pointer without touching CF
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	dec	n			C dec preserves CF
	mov	%r11, 24(rp)
	lea	32(vp), vp
	mov	(up), %r8		C preload next group
	mov	8(up), %r9
	lea	32(rp), rp
L(mid):	mov	16(up), %r10
	mov	24(up), %r11
	jnz	L(top)

C Last full group: %r8..%r11 were preloaded by the loop above.
L(end):	lea	32(up), up
	ADCSBB	(vp), %r8
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	ADCSBB	24(vp), %r11
	lea	32(vp), vp
	mov	%r8, (rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)
	lea	32(rp), rp

	inc	R32(%rax)		C inc/dec pair sets ZF from rax
	dec	R32(%rax)		C   (= n mod 4) while leaving CF intact
	jnz	L(lt4)			C 1..3 leftover limbs remain
	adc	%eax, %eax		C rax was 0 here, so rax = CF out
	ret
EPILOGUE()
146