dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]

dnl  Copyright 2003, 2005-2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

dnl  Build-wide m4 definitions.  The macros used below that have no definition
dnl  in this file (C, PROLOGUE, EPILOGUE, L, R32, ...) presumably come from
dnl  this include -- confirm against config.m4.
include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 2
C AMD K10	 2
C AMD bd1	 ?
C AMD bobcat	 ?
C Intel P4	 13
C Intel core2	 3.45
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 ?
C VIA nano	 ?


C Sometimes the speed degenerates, supposedly because some operand alignments
C cause cache conflicts.

C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
C in the loop, which corresponds to ceil(22/3)/4 = 2 c/l (22/3/4 = 1.83 c/l
C before rounding up to whole groups of three instructions).

C INPUT PARAMETERS
C Symbolic names for the four argument registers.
C   rp  destination limb pointer (every store in the function goes through it)
C   up  source pointer for the operand that is added or subtracted as is
C   vp  source pointer for the operand that is doubled (shifted left one bit)
C   n   limb count; negated on entry and then used as a rising index
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n', `%rcx')

C Operation selection.  Whichever OPERATION_ symbol is defined picks the
C function name and the instruction pair that combines the doubled vp limbs
C with the up limbs:
C   ADDSUB  used on the first limb of a chain, no carry or borrow coming in
C   ADCSBB  used on the following limbs, carry or borrow taken from CF
C   func    name handed to PROLOGUE
C NOTE(review): the build is expected to define exactly one of the two
C symbols; nothing here diagnoses the case where neither is defined.
ifdef(`OPERATION_addlsh1_n', `
  define(ADDSUB,	add)
  define(ADCSBB,	adc)
  define(func,		mpn_addlsh1_n)')
ifdef(`OPERATION_rsblsh1_n', `
  define(ADDSUB,	sub)
  define(ADCSBB,	sbb)
  define(func,		mpn_rsblsh1_n)')

C NOTE(review): MULFUNC_PROLOGUE and ABI_SUPPORT are defined outside this
C file.  Presumably they announce the two function names this source can be
C built as and the ABIs it supports -- confirm against the build macros.
MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C func(rp, up, vp, n)
C
C   addlsh1_n:  rp[] = up[] + (vp[] << 1), returns the carry out: 0, 1 or 2
C   rsblsh1_n:  rp[] = (vp[] << 1) - up[], returns the signed carry: -1, 0 or 1
C
C Limbs are 64 bits (all indexing is scaled by 8).  n must be at least 1:
C vp[0] is loaded unconditionally, and n = 0 would be dispatched as a
C multiple of four and run one full loop iteration.
C
C Register use:
C   rp, up, vp	advanced to the end of their operands; n is negated so that
C		(ptr,n,8) walks forward while n counts up towards zero
C   %r8-%r11	limbs being processed
C   %rax	n mod 4 for the entry dispatch, afterwards scy: 0 or -1, the
C		negated carry out of the doubling chain
C   %rbp	acy: 0 or -1, the negated carry or borrow out of the
C		ADDSUB/ADCSBB chain; pushed on entry and popped before return
C
C Two carry chains (doubling, then add or subtract) share the single CF.
C Each chain parks its carry in a register with sbb reg,reg and gets it back
C with add reg,reg.  The mov, lea and nop instructions placed in between do
C not modify the flags, so the instruction order below must be kept.
C
C NOTE(review): FUNC_ENTRY and FUNC_EXIT are defined outside this file;
C presumably they adapt the DOS64 argument registers -- confirm.
ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%rbp			C callee-saved, used below for acy

	mov	(vp), %r8		C vp[0], needed by every entry path
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp		C point just past the end of each operand
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n			C index now runs from -n up to 0
	xor	R32(%rbp), R32(%rbp)	C acy = 0
	and	$3, R32(%rax)		C n mod 4; also leaves CF clear for b00
	je	L(b00)			C n = 0 mod 4: enter the loop directly
	cmp	$2, R32(%rax)
	jc	L(b01)			C n = 1 mod 4
	je	L(b10)			C n = 2 mod 4

C n = 3 mod 4: do three limbs, then continue with the 4-limb loop.
L(b11):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	mov	%r10, 16(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$3, n
	jmp	L(ent)

C n = 2 mod 4: do two limbs, then continue with the 4-limb loop.
L(b10):	add	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$2, n
	jmp	L(ent)

C n = 1 mod 4: do one limb, then continue with the 4-limb loop.
L(b01):	add	%r8, %r8
	sbb	R32(%rax), R32(%rax)	C save scy
	ADDSUB	(up,n,8), %r8
	mov	%r8, (rp,n,8)
	sbb	R32(%rbp), R32(%rbp)	C save acy
	inc	n
C The sign flag tested here comes from the index update just above (jmp does
C not change flags).  A non-negative index means no limbs are left.
L(ent):	jns	L(end)

C Main loop, four limbs per iteration.  L(b00) is a second entry point used
C when n is a multiple of four: there %r8 already holds vp[0], and CF, %rax
C and %rbp are all zero, so the first adc doubles %r8 with no carry in.
	ALIGN(16)
L(top):	add	R32(%rax), R32(%rax)	C restore scy

	mov	(vp,n,8), %r8
L(b00):	adc	%r8, %r8
	mov	8(vp,n,8), %r9
	adc	%r9, %r9
	mov	16(vp,n,8), %r10
	adc	%r10, %r10
	mov	24(vp,n,8), %r11
	adc	%r11, %r11

	sbb	R32(%rax), R32(%rax)	C save scy
	add	R32(%rbp), R32(%rbp)	C restore acy

	ADCSBB	(up,n,8), %r8
	nop				C Hammer speedup!
	ADCSBB	8(up,n,8), %r9
	mov	%r8, (rp,n,8)
	mov	%r9, 8(rp,n,8)
	ADCSBB	16(up,n,8), %r10
	ADCSBB	24(up,n,8), %r11
	mov	%r10, 16(rp,n,8)
	mov	%r11, 24(rp,n,8)

	sbb	R32(%rbp), R32(%rbp)	C save acy
	add	$4, n
	js	L(top)

C Build the return value from the two saved carries, each held as 0 or -1.
C   addlsh1_n:  negate their sum, giving 0, 1 or 2 (zero-extended)
C   rsblsh1_n:  acy register minus scy register, that is the shift carry
C               minus the borrow, giving -1, 0 or 1, sign-extended to 64 bits
L(end):
ifdef(`OPERATION_addlsh1_n',`
	add	R32(%rbp), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh1_n',`
	sub	R32(%rax), R32(%rbp)
	movslq	R32(%rbp), %rax')

	pop	%rbp
	FUNC_EXIT()
	ret
EPILOGUE()
