1dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
2dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
3
4dnl  Copyright 2009-2012 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32
33C	     cycles/limb
34C AMD K8,K9	 ?
35C AMD K10	 ?
36C Intel P4	 ?
37C Intel core2	 3.25
38C Intel NHM	 4
39C Intel SBR	 2  C (or 1.95 when L(top)'s alignment = 16 (mod 32))
40C Intel atom	 ?
41C VIA nano	 ?
42
43C This code probably runs close to optimally on Sandy Bridge (using 4-way
44C unrolling).  It also runs reasonably well on Core 2, but it runs poorly on
45C all other processors, including Nehalem.
46
47C INPUT PARAMETERS
48define(`rp',	`%rdi')
49define(`up',	`%rsi')
50define(`vp',	`%rdx')
51define(`n',	`%rcx')
52define(`cy',	`%r8')
53
54ABI_SUPPORT(DOS64)
55ABI_SUPPORT(STD64)
56
57ASM_START()
58	TEXT
59	ALIGN(16)
60PROLOGUE(func_nc)
61	FUNC_ENTRY(4)
62IFDOS(`	mov	56(%rsp), %r8	')
63	push	%rbp
64	mov	cy, %rax
65	neg	%rax			C set msb on carry
66	xor	R32(%rbp), R32(%rbp)	C limb carry
67	mov	(vp), %r8
68	shrd	$RSH, %r8, %rbp
69	mov	R32(n), R32(%r9)
70	and	$3, R32(%r9)
71	je	L(b00)
72	cmp	$2, R32(%r9)
73	jc	L(b01)
74	je	L(b10)
75	jmp	L(b11)
76EPILOGUE()
77
78	ALIGN(16)
79PROLOGUE(func_n)
80	FUNC_ENTRY(4)
81	push	%rbp
82	xor	R32(%rbp), R32(%rbp)	C limb carry
83	mov	(vp), %r8
84	shrd	$RSH, %r8, %rbp
85	mov	R32(n), R32(%rax)
86	and	$3, R32(%rax)
87	je	L(b00)
88	cmp	$2, R32(%rax)
89	jc	L(b01)
90	je	L(b10)
91
92L(b11):	mov	8(vp), %r9
93	shrd	$RSH, %r9, %r8
94	mov	16(vp), %r10
95	shrd	$RSH, %r10, %r9
96	add	R32(%rax), R32(%rax)	C init carry flag
97	ADCSBB	(up), %rbp
98	ADCSBB	8(up), %r8
99	ADCSBB	16(up), %r9
100	mov	%rbp, (rp)
101	mov	%r8, 8(rp)
102	mov	%r9, 16(rp)
103	mov	%r10, %rbp
104	lea	24(up), up
105	lea	24(vp), vp
106	lea	24(rp), rp
107	sbb	R32(%rax), R32(%rax)	C save carry flag
108	sub	$3, n
109	ja	L(top)
110	jmp	L(end)
111
112L(b01):	add	R32(%rax), R32(%rax)	C init carry flag
113	ADCSBB	(up), %rbp
114	mov	%rbp, (rp)
115	mov	%r8, %rbp
116	lea	8(up), up
117	lea	8(vp), vp
118	lea	8(rp), rp
119	sbb	R32(%rax), R32(%rax)	C save carry flag
120	sub	$1, n
121	ja	L(top)
122	jmp	L(end)
123
124L(b10):	mov	8(vp), %r9
125	shrd	$RSH, %r9, %r8
126	add	R32(%rax), R32(%rax)	C init carry flag
127	ADCSBB	(up), %rbp
128	ADCSBB	8(up), %r8
129	mov	%rbp, (rp)
130	mov	%r8, 8(rp)
131	mov	%r9, %rbp
132	lea	16(up), up
133	lea	16(vp), vp
134	lea	16(rp), rp
135	sbb	R32(%rax), R32(%rax)	C save carry flag
136	sub	$2, n
137	ja	L(top)
138	jmp	L(end)
139
140	ALIGN(16)
141L(top):	mov	(vp), %r8
142	shrd	$RSH, %r8, %rbp
143L(b00):	mov	8(vp), %r9
144	shrd	$RSH, %r9, %r8
145	mov	16(vp), %r10
146	shrd	$RSH, %r10, %r9
147	mov	24(vp), %r11
148	shrd	$RSH, %r11, %r10
149	lea	32(vp), vp
150	add	R32(%rax), R32(%rax)	C restore carry flag
151	ADCSBB	(up), %rbp
152	ADCSBB	8(up), %r8
153	ADCSBB	16(up), %r9
154	ADCSBB	24(up), %r10
155	lea	32(up), up
156	mov	%rbp, (rp)
157	mov	%r8, 8(rp)
158	mov	%r9, 16(rp)
159	mov	%r10, 24(rp)
160	mov	%r11, %rbp
161	lea	32(rp), rp
162	sbb	R32(%rax), R32(%rax)	C save carry flag
163	sub	$4, n
164	jnz	L(top)
165
166L(end):	shr	$RSH, %rbp
167	add	R32(%rax), R32(%rax)	C restore carry flag
168	ADCSBB	$0, %rbp
169	mov	%rbp, %rax
170	pop	%rbp
171	FUNC_EXIT()
172	ret
173EPILOGUE()
174