aorrlsh1_n.asm revision 1.1.1.2
1dnl  AMD64 mpn_addlsh1_n -- rp[] = up[] + (vp[] << 1)
2dnl  AMD64 mpn_rsblsh1_n -- rp[] = (vp[] << 1) - up[]
3
4dnl  Copyright 2003, 2005, 2006, 2007, 2008, 2009, 2011, 2012 Free Software
5dnl  Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C	     cycles/limb
26C AMD K8,K9	 2
27C AMD K10	 2
28C Intel P4	 13
29C Intel core2	 3.45
30C Intel corei	 3.45
31C Intel atom	 ?
32C VIA nano	 ?
33
34
35C Sometimes speed degenerates, supposedly because some operand alignments
36C cause cache conflicts.
37
38C The speed is limited by decoding/issue bandwidth.  There are 22 instructions
39C in the loop, which corresponds to ceil(22/3)/4 = 8/4 = 2 c/l.
40
41C INPUT PARAMETERS
C Symbolic names for the four argument registers.  The routine refers to its
C arguments only through these names.
42define(`rp',`%rdi')	C destination limb pointer, written through
43define(`up',`%rsi')	C source operand that is added or subtracted
44define(`vp',`%rdx')	C source operand that is shifted left by one bit
45define(`n', `%rcx')	C limb count, later negated and used as an index
46
C Build-time selection of the operation.  Each block below takes effect only
C when its OPERATION_* symbol is defined; neither symbol is defined in this
C file.
C   ADDSUB  instruction used on the lowest limb of a chain (add or sub)
C   ADCSBB  carry/borrow propagating form used on the other limbs (adc or sbb)
C   func    name passed to PROLOGUE for the entry point
C If neither symbol is defined, ADDSUB, ADCSBB and func stay undefined.
47ifdef(`OPERATION_addlsh1_n', `
48  define(ADDSUB,	add)
49  define(ADCSBB,	adc)
50  define(func,		mpn_addlsh1_n)')
51ifdef(`OPERATION_rsblsh1_n', `
52  define(ADDSUB,	sub)
53  define(ADCSBB,	sbb)
54  define(func,		mpn_rsblsh1_n)')
55
C NOTE(review): MULFUNC_PROLOGUE and ABI_SUPPORT are defined outside this
C file.  Presumably they announce the entry points this source can provide
C and the calling conventions it supports -- confirm against their
C definitions.
56MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_rsblsh1_n)
57
58ABI_SUPPORT(DOS64)
59ABI_SUPPORT(STD64)
60
61ASM_START()
62	TEXT
63	ALIGN(16)
C ---------------------------------------------------------------------------
C func -- expands to mpn_addlsh1_n or mpn_rsblsh1_n.
C
C   addlsh1_n:  rp[] = up[] + (vp[] << 1)
C   rsblsh1_n:  rp[] = (vp[] << 1) - up[]
C
C In:   rp = %rdi, up = %rsi, vp = %rdx, n = %rcx.  Limbs are 8 bytes wide
C       (every operand access is scaled by 8).
C Out:  %rax.
C       addlsh1_n: scy + acy as a count 0, 1 or 2 (written as a 32-bit
C                  value, which zero-extends into %rax).
C       rsblsh1_n: shift carry minus subtract borrow, sign-extended to 64
C                  bits, so one of -1, 0 or 1.
C
C Register roles:
C   %r8-%r11  limbs being processed
C   %eax      scy, the carry out of the shift chain, kept as 0 or -1
C   %ebp      acy, the carry/borrow out of the ADDSUB/ADCSBB chain, kept as
C             0 or -1
C   n         negated limb count; pointers are advanced past the end so that
C             (ptr,n,8) walks forward as n counts up towards zero
C
C Two carry chains share the single CF flag.  Each is parked in a register
C with sbb reg,reg (reg = 0 or -1) and brought back with add reg,reg (which
C sets CF exactly when reg was -1).
C
C Structure: the n mod 4 leading limbs are handled by b01/b10/b11, then the
C loop at top handles four limbs per iteration.
C
C There is no test for n = 0; that value would take the b00 path and process
C four limbs.  Callers are therefore assumed to pass n >= 1.
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, R32 and L are macros defined outside
C this file.  R32 is assumed to give the 32-bit name of a register and
C FUNC_ENTRY/FUNC_EXIT to adapt the DOS64 calling convention -- confirm.
C ---------------------------------------------------------------------------
64PROLOGUE(func)
65	FUNC_ENTRY(4)
66	push	%rbp			C rbp holds acy below and is restored at exit
67
68	mov	(vp), %r8		C r8 = vp[0], fetched before vp is advanced
69	mov	R32(n), R32(%rax)	C copy of n used only for n mod 4
70	lea	(rp,n,8), rp		C advance rp, up and vp by n limbs
71	lea	(up,n,8), up
72	lea	(vp,n,8), vp
73	neg	n			C n = -n so that (ptr,n,8) is limb 0
74	xor	R32(%rbp), R32(%rbp)	C acy = 0
75	and	$3, R32(%rax)		C eax = n mod 4; and also clears CF
76	je	L(b00)			C n mod 4 = 0: enter the loop with CF = 0
77	cmp	$2, R32(%rax)
78	jc	L(b01)			C n mod 4 = 1
79	je	L(b10)			C n mod 4 = 2, else fall through for 3
80
C Three leading limbs.
81L(b11):	add	%r8, %r8		C r8 = limb 0 doubled, CF = bit shifted out
82	mov	8(vp,n,8), %r9
83	adc	%r9, %r9		C double limb 1, shifting in the previous CF
84	mov	16(vp,n,8), %r10
85	adc	%r10, %r10
86	sbb	R32(%rax), R32(%rax)	C save scy
87	ADDSUB	(up,n,8), %r8		C start the add/subtract chain on limb 0
88	ADCSBB	8(up,n,8), %r9
89	mov	%r8, (rp,n,8)
90	mov	%r9, 8(rp,n,8)
91	ADCSBB	16(up,n,8), %r10
92	mov	%r10, 16(rp,n,8)
93	sbb	R32(%rbp), R32(%rbp)	C save acy
94	add	$3, n			C three limbs done; sets SF for the jns at ent
95	jmp	L(ent)
96
C Two leading limbs.
97L(b10):	add	%r8, %r8		C r8 = limb 0 doubled, CF = bit shifted out
98	mov	8(vp,n,8), %r9
99	adc	%r9, %r9
100	sbb	R32(%rax), R32(%rax)	C save scy
101	ADDSUB	(up,n,8), %r8
102	ADCSBB	8(up,n,8), %r9
103	mov	%r8, (rp,n,8)
104	mov	%r9, 8(rp,n,8)
105	sbb	R32(%rbp), R32(%rbp)	C save acy
106	add	$2, n			C two limbs done; sets SF for the jns at ent
107	jmp	L(ent)
108
C One leading limb.
109L(b01):	add	%r8, %r8		C r8 = limb 0 doubled, CF = bit shifted out
110	sbb	R32(%rax), R32(%rax)	C save scy
111	ADDSUB	(up,n,8), %r8
112	mov	%r8, (rp,n,8)
113	sbb	R32(%rbp), R32(%rbp)	C save acy
114	inc	n			C one limb done; sets SF for the jns below
115L(ent):	jns	L(end)			C index no longer negative: nothing left
116
C Main loop, four limbs per iteration.
117	ALIGN(16)
118L(top):	add	R32(%rax), R32(%rax)	C restore scy
119
120	mov	(vp,n,8), %r8		C mov leaves CF untouched
121L(b00):	adc	%r8, %r8		C b00 entry arrives with r8 = vp[0] and CF = 0
122	mov	8(vp,n,8), %r9
123	adc	%r9, %r9
124	mov	16(vp,n,8), %r10
125	adc	%r10, %r10
126	mov	24(vp,n,8), %r11
127	adc	%r11, %r11
128
129	sbb	R32(%rax), R32(%rax)	C save scy
130	add	R32(%rbp), R32(%rbp)	C restore acy
131
132	ADCSBB	(up,n,8), %r8		C continue the add/subtract chain
133	nop				C Hammer speedup!
134	ADCSBB	8(up,n,8), %r9
135	mov	%r8, (rp,n,8)
136	mov	%r9, 8(rp,n,8)
137	ADCSBB	16(up,n,8), %r10
138	ADCSBB	24(up,n,8), %r11
139	mov	%r10, 16(rp,n,8)
140	mov	%r11, 24(rp,n,8)
141
142	sbb	R32(%rbp), R32(%rbp)	C save acy
143	add	$4, n			C advance four limbs
144	js	L(top)			C loop while the index is still negative
145
C Form the return value from the two saved carries (each 0 or -1).
C   addlsh1_n: eax = -(scy + acy) negated, giving 0, 1 or 2.
C   rsblsh1_n: ebp = acy - scy in saved form, which equals the shift carry
C              minus the borrow; movslq sign-extends it into rax.
146L(end):
147ifdef(`OPERATION_addlsh1_n',`
148	add	R32(%rbp), R32(%rax)
149	neg	R32(%rax)')
150ifdef(`OPERATION_rsblsh1_n',`
151	sub	R32(%rax), R32(%rbp)
152	movslq	R32(%rbp), %rax')
153
154	pop	%rbp			C undo the push at entry
155	FUNC_EXIT()
156	ret
157EPILOGUE()
158