aors_n.asm revision 1.1.1.2
1dnl  Intel P6-15 mpn_add_n/mpn_sub_n -- mpn add or subtract.
2
3dnl  Copyright 2006, 2007, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C AMD K8,K9	 2.25
25C AMD K10	 2
26C Intel P4	10
27C Intel core2	 2.05
28C Intel NHM	 2.3
29C Intel SBR	 1.9
30C Intel atom	 ?
31C VIA nano	 ?
32
33C INPUT PARAMETERS
C Symbolic names for the argument registers used by the code below:
C   rp  destination pointer (results are stored through it)
C   up  first source pointer
C   vp  second source pointer
C   n   limb count
C   cy  carry-in; the code below refers to this register directly as %r8
34define(`rp',	`%rdi')
35define(`up',	`%rsi')
36define(`vp',	`%rdx')
37define(`n',	`%rcx')
38define(`cy',	`%r8')
39
C Select the operation at build time.  ADCSBB becomes the carry-propagating
C instruction (adc or sbb), func the plain entry point, and func_nc the entry
C point that takes a carry-in.  Presumably exactly one OPERATION_ symbol is
C defined per build -- TODO confirm against the build system.
40ifdef(`OPERATION_add_n', `
41	define(ADCSBB,	      adc)
42	define(func,	      mpn_add_n)
43	define(func_nc,	      mpn_add_nc)')
44ifdef(`OPERATION_sub_n', `
45	define(ADCSBB,	      sbb)
46	define(func,	      mpn_sub_n)
47	define(func_nc,	      mpn_sub_nc)')
48
C List of the entry points this file can provide.  NOTE(review): presumably
C consumed by the build machinery; the macro is defined outside this file.
49MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
50
C Calling conventions this file declares support for.
51ABI_SUPPORT(DOS64)
52ABI_SUPPORT(STD64)
53
C Start of the code section; the first entry point is aligned to 16 bytes.
54ASM_START()
55	TEXT
56	ALIGN(16)
C func_nc: entry point taking a carry-in.  It makes sure the carry-in is in
C %r8 and then joins the shared body at L(start), a label that lives inside
C func below.
57PROLOGUE(func_nc)
58	FUNC_ENTRY(4)
C Under DOS64 the carry-in is fetched from the stack.  NOTE(review): the 56
C offset presumably accounts for what FUNC_ENTRY pushes -- confirm against
C the FUNC_ENTRY definition.  Otherwise cy is already in %r8 on entry.
59IFDOS(`	mov	56(%rsp), %r8	')
60	jmp	L(start)
61EPILOGUE()
62
C func: for each of the n limbs, store up[i] ADCSBB vp[i] (with carry or
C borrow propagated from limb to limb) to rp[i], starting with carry-in zero.
C func_nc joins at L(start) with its carry-in in bit 0 of %r8.
C Returns the final carry (or borrow) in eax, as 0 or 1.
C n is assumed to be at least 1: the first limb of each source is loaded
C unconditionally.
C
C The loop is unrolled four ways and software pipelined: each stage loads the
C next pair of limbs while combining and storing the previously loaded pair,
C alternating between the register pairs r10/r11 and r8/r9.  The carry flag
C is live from the shr in the entry stubs to the adc at L(end); the only
C instructions in between are ADCSBB, mov, lea, jmp and jrcxz, and of these
C only ADCSBB writes flags.  Do not insert flag-writing instructions there.
63PROLOGUE(func)
64	FUNC_ENTRY(4)
65	xor	%r8, %r8		C carry-in = 0
66L(start):
67	mov	(up), %r10		C first limb of each source
68	mov	(vp), %r11
69
C Rebase the pointers so that a negative index in n, counting up towards
C zero, addresses the limbs: up and vp now point at their last limb, rp at
C the limb before its last.
70	lea	-8(up,n,8), up
71	lea	-8(vp,n,8), vp
72	lea	-16(rp,n,8), rp
73	mov	R32(%rcx), R32(%rax)
74	neg	n
75	and	$3, R32(%rax)		C eax = n mod 4, selects the entry stage
76	je	L(b00)
77	add	%rax, n			C clear low rcx bits for jrcxz
78	cmp	$2, R32(%rax)
79	jl	L(b01)			C n mod 4 = 1
80	je	L(b10)			C n mod 4 = 2
81
C Entry stubs.  n mod 4 = 3 falls through to b11.  Each stub shifts bit 0 of
C %r8 into the carry flag.  b00 and b10 then copy the first limbs into r8/r9
C because they enter at a stage that consumes that register pair.
82L(b11):	shr	%r8			C set cy
83	jmp	L(e11)
84
85L(b00):	shr	%r8			C set cy
86	mov	%r10, %r8
87	mov	%r11, %r9
C Advance the index as the loop back edge would; lea leaves flags alone.
88	lea	4(n), n
89	jmp	L(e00)
90
91L(b01):	shr	%r8			C set cy
92	jmp	L(e01)
93
94L(b10):	shr	%r8			C set cy
95	mov	%r10, %r8
96	mov	%r11, %r9
97	jmp	L(e10)
98
C Reached from the jrcxz at e01 once the index is zero: combine and store
C the final limb pair held in r10/r11, then return the carry flag in eax.
99L(end):	ADCSBB	%r11, %r10
100	mov	%r10, 8(rp)
101	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
102	adc	R32(%rax), R32(%rax)	C eax = 0 + 0 + carry flag
103	FUNC_EXIT()
104	ret
105
106	ALIGN(16)
C Main loop: four limbs per iteration.
107L(top):
108	mov	-24(up,n,8), %r8
109	mov	-24(vp,n,8), %r9
110	ADCSBB	%r11, %r10
111	mov	%r10, -24(rp,n,8)
C Entry stage for n mod 4 = 0.
112L(e00):
113	mov	-16(up,n,8), %r10
114	mov	-16(vp,n,8), %r11
115	ADCSBB	%r9, %r8
116	mov	%r8, -16(rp,n,8)
C Entry stage for n mod 4 = 3.
117L(e11):
118	mov	-8(up,n,8), %r8
119	mov	-8(vp,n,8), %r9
120	ADCSBB	%r11, %r10
121	mov	%r10, -8(rp,n,8)
C Entry stage for n mod 4 = 2.
122L(e10):
123	mov	(up,n,8), %r10
124	mov	(vp,n,8), %r11
125	ADCSBB	%r9, %r8
126	mov	%r8, (rp,n,8)
C Entry stage for n mod 4 = 1, and the loop exit test.  jrcxz and lea are
C used for the test and the index update because neither modifies flags.
127L(e01):
128	jrcxz	L(end)
129	lea	4(n), n
130	jmp	L(top)
131
132EPILOGUE()
133