1dnl  AMD64 mpn_addlshC_n -- rp[] = up[] + (vp[] << C)
2dnl  AMD64 mpn_rsblshC_n -- rp[] = (vp[] << C) - up[]
3
4dnl  Copyright 2009-2012 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32
33C	     cycles/limb
34C AMD K8,K9	 2.1
35C AMD K10	 2.0
36C AMD bd1	~2.7
37C AMD bd2	~2.7
38C AMD bd3	 ?
39C AMD bd4	 ?
40C AMD zen	 2.0
41C AMD bt1	 3.3
42C AMD bt2	 3.0
43C Intel P4	 ?
44C Intel PNR	 3.0
45C Intel NHM	 2.75
46C Intel SBR	 2.55
47C Intel IBR	 2.49
48C Intel HWL	 2.25
49C Intel BWL	 1.89
50C Intel SKL	 1.90
51C Intel atom	 8.4
52C Intel SLM	 4.0
53C VIA nano	 ?
54
C INPUT PARAMETERS
C Register names for the four arguments.  Going by the operation summary at the
C top of this file, rp is the result operand, up and vp are the two source
C operands and n is the limb count.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')

C M = 2^LSH.  The code below uses M as the index scale factor of lea, so that
C lea performs the left shift by LSH bits; x86 only encodes the scale factors
C 1, 2, 4 and 8 there.
C NOTE(review): LSH is not defined in this file.  Presumably the file that
C includes this one defines it, along with RSH, ADDSUB, ADCSBB and func -- to
C be confirmed against the including files.
define(M, eval(m4_lshift(1,LSH)))

C Declare the DOS64 and STD64 ABIs as supported (macro defined elsewhere).
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
65
C func (rp, up, vp, n)
C
C With ADDSUB = add this computes  rp[] = up[] + (vp[] << LSH);  otherwise it
C computes  rp[] = (vp[] << LSH) - up[]  (see the summary at the top of the
C file).  All operands are n limbs long.  The first vp limb is loaded before n
C is examined, so n must be at least 1.
C
C Return value in rax:
C   ADDSUB = add:  (vp[n-1] >> RSH) + carry out of the top limb
C   otherwise:     (vp[n-1] >> RSH) - borrow out of the top limb, sign-extended
C
C Register use:
C   r8-r11    vp limbs; after each shr they hold the bits shifted out, which
C             become the low bits of the next result limb
C   r12-r15   result limbs (saved and restored here)
C   eax       carry between blocks, kept as 0 or -1 by sbb eax,eax and moved
C             back into CF by add eax,eax
C   n         negative limb index counting up towards zero
C
C NOTE(review): LSH, RSH, ADDSUB, ADCSBB and func are not defined in this file;
C RSH is presumably 64-LSH -- to be confirmed against the including file.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	FUNC_ENTRY(4)
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	(vp), %r8
	lea	(,%r8,M), %r12			  C r12 = vp[0] << LSH
	shr	$RSH, %r8			  C r8 = bits shifted out of vp[0]

	mov	R32(n), R32(%rax)
C Point rp, up and vp just past their last limb and negate n, so that
C (ptr,n,8) addresses limb 0 first and n reaches zero after the last limb.
	lea	(rp,n,8), rp
	lea	(up,n,8), up
	lea	(vp,n,8), vp
	neg	n
C Mask all of eax, not only al.  L(b00) jumps to L(e00) without an intervening
C sbb, and the add at L(e00) takes CF from bit 31 of eax, so no stale bits of
C n may survive here or limb 0 could receive a spurious carry for huge n.
	and	$3, R32(%rax)			  C eax = n mod 4
	je	L(b00)
	cmp	$2, R8(%rax)
	jc	L(b01)
	je	L(b10)

C n mod 4 = 3: handle three limbs ahead of the 4-way loop.
L(b11):	mov	8(vp,n,8), %r10
	lea	(%r8,%r10,M), %r14
	shr	$RSH, %r10
	mov	16(vp,n,8), %r11
	lea	(%r10,%r11,M), %r15
	shr	$RSH, %r11
	ADDSUB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r14
	ADCSBB	16(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	mov	%r14, 8(rp,n,8)
	mov	%r15, 16(rp,n,8)
	add	$3, n
	js	L(top)
	jmp	L(end)

C n mod 4 = 1: handle one limb.  The loop and the exit code expect the pending
C shifted-out bits in r11, so move them there from r8.
L(b01):	mov	%r8, %r11
	ADDSUB	(up,n,8), %r12
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	add	$1, n
	js	L(top)
	jmp	L(end)

C n mod 4 = 2: handle two limbs.
L(b10):	mov	8(vp,n,8), %r11
	lea	(%r8,%r11,M), %r15
	shr	$RSH, %r11
	ADDSUB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r15
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r12, (rp,n,8)
	mov	%r15, 8(rp,n,8)
	add	$2, n
	js	L(top)
	jmp	L(end)

C n mod 4 = 0: r12 and r8 already hold the two parts of vp[0], so load the
C next two limbs and enter the loop past its first-limb setup.  eax is 0 here,
C which makes the carry restore at L(e00) start limb 0 with CF clear.
L(b00):	mov	8(vp,n,8), %r9
	mov	16(vp,n,8), %r10
	jmp	L(e00)

C Main loop, four limbs per pass.  On entry at L(top): r11 = bits shifted out
C of the previous vp limb, eax = 0 or -1 holding the carry so far.
	ALIGN(16)
L(top):	mov	16(vp,n,8), %r10
	mov	(vp,n,8), %r8
	mov	8(vp,n,8), %r9
	lea	(%r11,%r8,M), %r12
	shr	$RSH, %r8
L(e00):	lea	(%r8,%r9,M), %r13
	shr	$RSH, %r9
	mov	24(vp,n,8), %r11
	lea	(%r9,%r10,M), %r14
	shr	$RSH, %r10
	lea	(%r10,%r11,M), %r15
	shr	$RSH, %r11
	add	R32(%rax), R32(%rax)		  C restore carry
	ADCSBB	(up,n,8), %r12
	ADCSBB	8(up,n,8), %r13
	ADCSBB	16(up,n,8), %r14
	ADCSBB	24(up,n,8), %r15
	mov	%r12, (rp,n,8)
	mov	%r13, 8(rp,n,8)
	mov	%r14, 16(rp,n,8)
	sbb	R32(%rax), R32(%rax)		  C save carry for next
	mov	%r15, 24(rp,n,8)
	add	$4, n
	js	L(top)
L(end):

C Form the return value from r11, the bits shifted out of the top vp limb, and
C eax, which is 0 or -1.  The add build returns r11 + carry; the 32-bit ops
C leave the upper half of rax zero.  The other build returns r11 - borrow,
C which can be -1, so it is sign-extended to 64 bits.
ifelse(ADDSUB,add,`
	sub	R32(%r11), R32(%rax)
	neg	R32(%rax)
',`
	add	R32(%r11), R32(%rax)
	movslq	R32(%rax), %rax
')
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()
173