dnl  AMD64 mpn_addlsh2_n and mpn_rsblsh2_n.  R = 4*V +- U.
dnl  ("rsb" means reversed subtract, name mandated by mpn_sublsh2_n which
dnl  subtracts the shifted operand from the unshifted operand.)
4
5dnl  Copyright 2009 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C	     cycles/limb
26C K8,K9:	 2
27C K10:		 2
28C P4:		 ?
29C P6 core2: 	 3
30C P6 corei7:	 2.75
31C P6 atom:	 ?
32
C INPUT PARAMETERS
C Symbolic names for the four argument registers.  rdi, rsi, rdx, rcx are the
C first four integer argument registers of the System V AMD64 convention.
C Because these are m4 macros, the bare words rp, up, vp and n are replaced
C by the register names wherever they appear in the code below.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')

C Operation selection.  Each OPERATION_* symbol picks the instruction used
C for the lowest limb (ADDSUB), the instruction used for the following
C limbs, which also consumes the carry flag (ADCSBB), and the name given to
C the entry point (func).  The symbols are not defined in this file;
C presumably the build defines exactly one of them per assembly.
ifdef(`OPERATION_addlsh2_n',`
  define(ADDSUB,        `add')
  define(ADCSBB,       `adc')
  define(func, mpn_addlsh2_n)')
ifdef(`OPERATION_rsblsh2_n',`
  define(ADDSUB,        `sub')
  define(ADCSBB,       `sbb')
  define(func, mpn_rsblsh2_n)')

C MULFUNC_PROLOGUE is defined outside this file; it is given the names of
C both entry points this source can produce.
MULFUNC_PROLOGUE(mpn_addlsh2_n mpn_rsblsh2_n)

C Section setup for the function that follows.  ASM_START, TEXT and ALIGN
C are macros defined outside this file.
ASM_START()
	TEXT
	ALIGN(16)
C func -- mpn_addlsh2_n or mpn_rsblsh2_n, chosen by the OPERATION_* symbol.
C
C   addlsh2:  {rp,n} = 4*{vp,n} + {up,n}
C   rsblsh2:  {rp,n} = 4*{vp,n} - {up,n}
C
C In:   rp  destination, n limbs of 64 bits
C       up  operand used unshifted, n limbs
C       vp  operand shifted left by two bits, n limbs
C       n   limb count; (vp) is loaded before n is examined, so n >= 1
C           is assumed
C Out:  rax = the part of the result that does not fit in {rp,n}:
C         addlsh2: (two bits shifted out of the top V limb) + carry
C         rsblsh2: (two bits shifted out of the top V limb) - borrow,
C                  sign-extended to 64 bits
C Regs: r12-r15 are saved and restored here; r8-r11, rax, the flags and
C       the four argument registers are overwritten.
C
C Register roles inside the function:
C   r8-r11   V limbs; after "shr $62" each holds the two bits that move
C            into the next higher result limb
C   r12-r15  4*v[i] plus the two bits from v[i-1], then the finished limb
C   eax      carry/borrow saved between limb groups, as 0 or -1
C   n        negated limb count used as an index; counts up to zero
PROLOGUE(func)
	push	%r12
	push	%r13
	push	%r14
	push	%r15

C Start on the lowest limb: r12 = low 64 bits of 4*v[0], r8 = its top 2 bits.
	mov	(vp), %r8
	lea	(,%r8,4), %r12
	shr	$62, %r8

C Point rp, up and vp just past their last limb and index them with -n.
	mov	R32(n), R32(%rax)
	lea	(rp,n,8), rp
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n
C Dispatch on n mod 4.  The whole of eax is masked, not only al: on the b00
C path eax reaches the "restore carry" add at L(e00) unchanged, and that add
C takes the carry flag from bit 31.  Masking just the low byte would leave
C bits 8-31 of n in place and turn bit 31 of a very large n into a spurious
C carry-in for limb 0.
	and	$3, R32(%rax)
	je	L(b00)
	cmp	$2, R8(%rax)
	jc	L(b01)				  C n mod 4 = 1
	je	L(b10)				  C n mod 4 = 2

C n mod 4 = 3: handle three limbs, then continue with groups of four.
L(b11):	mov	8(vp,n,8), %r10
	lea	(%r8,%r10,4), %r14
	shr	$62, %r10
	mov	16(vp,n,8), %r11
	lea	(%r10,%r11,4), %r15
	shr	$62, %r11
	ADDSUB	(up,n,8), %r12			  C lowest limb, no carry-in
	ADCSBB	8(up,n,8), %r14
	ADCSBB	16(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	mov	%r14, 8(rp,n,8)
	mov	%r15, 16(rp,n,8)
	add	$3, n
	js	L(top)				  C limbs remain
	jmp	L(end)

C n mod 4 = 1: handle one limb.  r11 must hold the bits shifted out of the
C last limb processed, which is what L(top) and L(end) expect.
L(b01):	mov	%r8, %r11
	ADDSUB	(up,n,8), %r12			  C lowest limb, no carry-in
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	add	$1, n
	js	L(top)				  C limbs remain
	jmp	L(end)

C n mod 4 = 2: handle two limbs.
L(b10):	mov	8(vp,n,8), %r11
	lea	(%r8,%r11,4), %r15
	shr	$62, %r11
	ADDSUB	(up,n,8), %r12			  C lowest limb, no carry-in
	ADCSBB	8(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	mov	%r15, 8(rp,n,8)
	add	$2, n
	js	L(top)				  C limbs remain
	jmp	L(end)

C n mod 4 = 0: r12 and r8 already describe limb 0, so load limbs 1 and 2
C and join the loop body after its first limb.  eax is zero here, so the
C "restore carry" add clears the carry flag for limb 0.
L(b00):	mov	8(vp,n,8), %r9
	mov	16(vp,n,8), %r10
	jmp	L(e00)

C Main loop, four limbs per pass.  On entry at L(top), r11 holds the two
C bits shifted out of the previous V limb and eax holds 0 or -1.
	ALIGN(16)
L(top):	mov	16(vp,n,8), %r10
	mov	(vp,n,8), %r8
	mov	8(vp,n,8), %r9
	lea	(%r11,%r8,4), %r12
	shr	$62, %r8
L(e00):	lea	(%r8,%r9,4), %r13
	shr	$62, %r9
	mov	24(vp,n,8), %r11
	lea	(%r9,%r10,4), %r14
	shr	$62, %r10
	lea	(%r10,%r11,4), %r15
	shr	$62, %r11
	add	R32(%rax), R32(%rax)		  C restore carry
	ADCSBB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r13
	ADCSBB	16(up,n,8), %r14
	ADCSBB	24(up,n,8), %r15
	mov	%r12, (rp,n,8)			  C mov leaves the flags alone
	mov	%r13, 8(rp,n,8)
	mov	%r14, 16(rp,n,8)
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r15, 24(rp,n,8)
	add	$4, n
	js	L(top)
L(end):

C Build the return value.  Here eax is 0 or -1 (minus the final carry or
C borrow) and r11 holds the two bits shifted out of the top V limb.
C addlsh2: eax = -carry - r11, then negated to give r11 + carry.
C rsblsh2: eax = r11 - borrow, then sign-extended into rax.
ifdef(`OPERATION_addlsh2_n',`
	sub	R32(%r11), R32(%rax)
	neg	R32(%rax)')
ifdef(`OPERATION_rsblsh2_n',`
	add	R32(%r11), R32(%rax)
	movslq	R32(%rax), %rax')

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	ret
EPILOGUE()
155