aors_n.asm revision 1.1.1.3
1dnl  Intel mpn_add_n/mpn_sub_n optimised for Conroe, Nehalem.
2
3dnl  Copyright 2006, 2007, 2011-2013 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb
35C AMD K8,K9	 2
36C AMD K10	 2
37C Intel P4	10
38C Intel core2	 2
39C Intel NHM	 2
40C Intel SBR	 2
41C Intel atom	 9
42C VIA nano	 3
43
44C INPUT PARAMETERS
C Symbolic names for the argument registers used by func and func_nc.  The
C registers are the System V AMD64 integer argument registers, in order.
C   rp  destination limb vector (only ever stored to)
C   up  first source limb vector (only ever loaded from)
C   vp  second source limb vector (only ever loaded from)
C   n   limb count; the code negates it and uses it as the loop index (rcx)
C   cy  carry-in, consumed only via the func_nc entry (func zeroes r8 itself)
45define(`rp',	`%rdi')
46define(`up',	`%rsi')
47define(`vp',	`%rdx')
48define(`n',	`%rcx')
49define(`cy',	`%r8')
50
C Select the operation this file assembles to.  With OPERATION_add_n defined,
C ADCSBB expands to adc and the entry points are mpn_add_n and mpn_add_nc;
C with OPERATION_sub_n defined, ADCSBB expands to sbb and the entry points
C are mpn_sub_n and mpn_sub_nc.  If neither is defined, ADCSBB, func and
C func_nc stay undefined.
C NOTE(review): the OPERATION_* symbols are presumably supplied by the build
C system, one per compilation of this file - not visible here, confirm.
51ifdef(`OPERATION_add_n', `
52	define(ADCSBB,	      adc)
53	define(func,	      mpn_add_n)
54	define(func_nc,	      mpn_add_nc)')
55ifdef(`OPERATION_sub_n', `
56	define(ADCSBB,	      sbb)
57	define(func,	      mpn_sub_n)
58	define(func_nc,	      mpn_sub_nc)')
59
C MULFUNC_PROLOGUE lists the four entry point names this file can provide
C (two per OPERATION_* variant) and ABI_SUPPORT names the ABIs the code is
C written for.  Both macros are defined outside this file, so their exact
C effect is not visible here.
C NOTE(review): ASM_START, TEXT and ALIGN(16) presumably open the code
C section and align the entry of func to 16 bytes - confirm against the
C included m4 definitions.
60MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
61
62ABI_SUPPORT(DOS64)
63ABI_SUPPORT(STD64)
64
65ASM_START()
66	TEXT
67	ALIGN(16)
C func (mpn_add_n or mpn_sub_n): combine the n-limb operands at up and vp
C limb by limb with ADCSBB (adc: up + vp, sbb: up - vp), store the n result
C limbs at rp and return the final carry or borrow (0 or 1) in rax.
C The carry-in is zero on this entry; func_nc joins at L(start) with its own
C carry-in already in r8.
C
C n must be at least 1: the first limb pair is loaded unconditionally and a
C zero count is not special-cased.
C
C Register use once the setup code has run:
C   up, vp, rp	point just past the last limb of their vectors
C   n (rcx)	non-positive multiple of 4 used as index from the vector ends
C   r10/r11	one pending source limb pair (up limb / vp limb)
C   r8/r9	the other pending source limb pair
C   CF		the carry/borrow chain, live from the neg at entry until L(end)
C
C The loop is unrolled four ways and software pipelined: each limb pair is
C loaded one step before it is combined and stored.  Between consecutive
C ADCSBB instructions only mov, lea, jmp and jrcxz are executed, none of
C which modify CF.
C
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they map the DOS64 argument registers onto the STD64 ones used
C here and undo that on exit - confirm against the included m4 definitions.
68PROLOGUE(func)
69	FUNC_ENTRY(4)
C Plain entry: no carry-in.
70	xor	%r8, %r8
C Common path for func and func_nc; r8 holds the carry-in from here on.
71L(start):
C Preload the lowest limb pair.
72	mov	(up), %r10
73	mov	(vp), %r11
74
C Move all three pointers to the end of their vectors so that the negated
C count can index them.
75	lea	(up,n,8), up
76	lea	(vp,n,8), vp
77	lea	(rp,n,8), rp
C rax = n mod 4 selects the entry point into the unrolled loop; n becomes -n.
78	mov	R32(n), R32(%rax)
79	neg	n
80	and	$3, R32(%rax)
81	je	L(b00)
82	add	%rax, n			C clear low rcx bits for jrcxz
C rax is 1, 2 or 3 here: 1 goes to b01, 2 to b10, 3 falls through to b11.
83	cmp	$2, R32(%rax)
84	jl	L(b01)
85	je	L(b10)
86
C In every bXX stub, neg r8 sets CF exactly when the carry-in is nonzero.
C After that r8 is free to be reused as a data register.
C
C n mod 4 == 3: the preloaded pair stays in r10/r11, which is what e11
C combines first.
87L(b11):	neg	%r8			C set cy
88	jmp	L(e11)
89
C n mod 4 == 0: e00 combines r8/r9, so move the preloaded pair there.  n is
C advanced by 4 with lea (CF untouched) to match the offsets used at e00.
90L(b00):	neg	%r8			C set cy
91	mov	%r10, %r8
92	mov	%r11, %r9
93	lea	4(n), n
94	jmp	L(e00)
95
C NOTE(review): these nops are never executed (they follow an unconditional
C jmp); presumably padding that adjusts the placement of the code that
C follows - confirm.
96	nop
97	nop
98	nop
C n mod 4 == 1: enter at the loop top with the preloaded pair in r10/r11.
99L(b01):	neg	%r8			C set cy
100	jmp	L(top)
101
C n mod 4 == 2: e10 combines r8/r9, so move the preloaded pair there.
102L(b10):	neg	%r8			C set cy
103	mov	%r10, %r8
104	mov	%r11, %r9
105	jmp	L(e10)
106
C Loop exit: the last limb pair is still pending in r10/r11.  Combine it,
C store it as the final result limb, then turn CF into the return value.
C eax is zeroed with a mov from ecx rather than xor so that CF survives.
107L(end):	ADCSBB	%r11, %r10
108	mov	%r10, -8(rp)
109	mov	R32(%rcx), R32(%rax)	C clear eax, ecx contains 0
110	adc	R32(%rax), R32(%rax)
111	FUNC_EXIT()
112	ret
113
114	ALIGN(16)
C Main loop, four limbs per iteration.  jrcxz tests rcx without reading or
C writing flags, so the carry chain is not disturbed by the loop control.
115L(top):	jrcxz	L(end)
C Stage 1: load next pair into r8/r9, finish the pair pending in r10/r11.
C The -40 offset accounts for n having already been advanced by 4.
116	mov	(up,n,8), %r8
117	mov	(vp,n,8), %r9
118	lea	4(n), n
119	ADCSBB	%r11, %r10
120	mov	%r10, -40(rp,n,8)
C Stage 2: load next pair into r10/r11, finish the pair pending in r8/r9.
121L(e00):	mov	-24(up,n,8), %r10
122	mov	-24(vp,n,8), %r11
123	ADCSBB	%r9, %r8
124	mov	%r8, -32(rp,n,8)
C Stage 3: load next pair into r8/r9, finish the pair pending in r10/r11.
125L(e11):	mov	-16(up,n,8), %r8
126	mov	-16(vp,n,8), %r9
127	ADCSBB	%r11, %r10
128	mov	%r10, -24(rp,n,8)
C Stage 4: load next pair into r10/r11, finish the pair pending in r8/r9.
C The pair loaded here is completed either by stage 1 or by L(end).
129L(e10):	mov	-8(up,n,8), %r10
130	mov	-8(vp,n,8), %r11
131	ADCSBB	%r9, %r8
132	mov	%r8, -16(rp,n,8)
133	jmp	L(top)
134EPILOGUE()
135
C func_nc (mpn_add_nc or mpn_sub_nc): same operation as func, but with a
C caller-supplied carry-in in r8 (cy).  It reuses the body of func by jumping
C to L(start), which bypasses the instruction that zeroes r8.  Any nonzero
C carry-in acts as a carry of 1, because the shared code turns r8 into CF
C with neg.
C NOTE(review): under IFDOS the carry-in is loaded from 56(%rsp), presumably
C the stack slot of the fifth argument once FUNC_ENTRY has run - confirm
C against the FUNC_ENTRY definition.
136PROLOGUE(func_nc)
137	FUNC_ENTRY(4)
138IFDOS(`	mov	56(%rsp), %r8	')
139	jmp	L(start)
140EPILOGUE()
141
142