1dnl  AMD64 mpn_sublsh1_n optimised for Intel Atom.
2dnl  Used also for AMD bd1.
3
4dnl  Contributed to the GNU project by Torbjorn Granlund.
5
6dnl  Copyright 2011, 2012 Free Software Foundation, Inc.
7
8dnl  This file is part of the GNU MP Library.
9dnl
10dnl  The GNU MP Library is free software; you can redistribute it and/or modify
11dnl  it under the terms of either:
12dnl
13dnl    * the GNU Lesser General Public License as published by the Free
14dnl      Software Foundation; either version 3 of the License, or (at your
15dnl      option) any later version.
16dnl
17dnl  or
18dnl
19dnl    * the GNU General Public License as published by the Free Software
20dnl      Foundation; either version 2 of the License, or (at your option) any
21dnl      later version.
22dnl
23dnl  or both in parallel, as here.
24dnl
25dnl  The GNU MP Library is distributed in the hope that it will be useful, but
26dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
27dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
28dnl  for more details.
29dnl
30dnl  You should have received copies of the GNU General Public License and the
31dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
32dnl  see https://www.gnu.org/licenses/.
33
34include(`../config.m4')
35
36C TODO
37C  * This code is slightly large at 501 bytes.
38C  * aorrlsh1_n.asm and this file use the same basic pattern.
39
40C	     cycles/limb
41C AMD K8,K9	 ?
42C AMD K10	 ?
43C AMD bd1	 2.3
44C AMD bobcat	 ?
45C Intel P4	 ?
46C Intel core2	 ?
47C Intel NHM	 ?
48C Intel SBR	 ?
49C Intel atom	 5	(4.875 is probably possible)
50C VIA nano	 ?
51
52C INPUT PARAMETERS
C Symbolic names for the argument registers.  In the code below rp is the
C pointer that results are stored through, up and vp are the pointers that
C operands are loaded through, and n is the limb count.
53define(`rp',       `%rdi')
54define(`up',       `%rsi')
55define(`vp',       `%rdx')
56define(`n',        `%rcx')
C cy is never referenced by name in the code below; mpn_sublsh1_nc reads its
C carry-in from %r8 directly, and the other code paths use %r8 as scratch.
57define(`cy',       `%r8')
58
C ABI_SUPPORT, ASM_START, TEXT and ALIGN are macros that are not defined in
C this file.  NOTE(review): presumably ABI_SUPPORT declares the file usable
C under both the DOS64 and STD64 calling conventions -- confirm against the
C shared macro definitions.
59ABI_SUPPORT(DOS64)
60ABI_SUPPORT(STD64)
61
62ASM_START()
63	TEXT
64	ALIGN(16)
C mpn_sublsh1_n -- store up[] - 2*vp[] into rp[], n limbs, return the carry.
C
C Each vp limb is doubled through an add/adc chain and then subtracted from
C the matching up limb through an sbb chain.  Both chains need CF, so each
C carry is parked in a register between phases as a 0 or -1 mask (sbb reg,reg)
C and moved back into CF by doubling that mask (add reg,reg):
C   %eax  scy, the bit shifted out of the doubling chain
C   %ebp  acy, the borrow out of the subtraction chain
C The value returned in %eax is scy + acy.
C
C Layout: a lead-in handles n mod 4 limbs (b1, b2 or b3), L(b0) handles four
C more limbs when bit 2 of n is set, and L(top) handles eight limbs per
C iteration.  n is not decremented before the main loop; the low three bits
C have already been consumed, so the loop effectively runs floor(n/8) times.
C
C %rbp and %r15 are saved on entry; %r12-%r14 and %rbx are saved only when
C the eight-limb loop is entered.  mov and lea are interleaved freely with
C the carry chains because they leave the flags untouched.
C
C FUNC_ENTRY and FUNC_EXIT are not defined in this file.  NOTE(review):
C presumably they map DOS64 argument registers onto the names above and
C restore them again -- confirm against the shared macro definitions.
65PROLOGUE(mpn_sublsh1_n)
66	FUNC_ENTRY(4)
67	push	%rbp
68	push	%r15
69	xor	R32(%rbp), R32(%rbp)	C acy = 0, no incoming borrow
C mpn_sublsh1_nc jumps in at L(ent) with acy already set up in %ebp.
70L(ent):	mov	R32(n), R32(%rax)
71	and	$3, R32(%rax)	C n mod 4; a zero result also serves as scy = 0
72	jz	L(b0)
73	cmp	$2, R32(%rax)
74	jz	L(b2)
75	jg	L(b3)
76
C Lead-in for n mod 4 = 1: one limb.
77L(b1):	mov	(vp), %r8
78	add	%r8, %r8
79	lea	8(vp), vp
80	sbb	R32(%rax), R32(%rax)	C save scy
81	add	R32(%rbp), R32(%rbp)	C restore acy
82	mov	(up), %r15
83	sbb	%r8, %r15
84	mov	%r15, (rp)
85	sbb	R32(%rbp), R32(%rbp)	C save acy
86	lea	8(up), up
87	lea	8(rp), rp
88	jmp	L(b0)
89
C Lead-in for n mod 4 = 2: two limbs.
90L(b2):	mov	(vp), %r8
91	add	%r8, %r8
92	mov	8(vp), %r9
93	adc	%r9, %r9
94	lea	16(vp), vp
95	sbb	R32(%rax), R32(%rax)	C save scy
96	add	R32(%rbp), R32(%rbp)	C restore acy
97	mov	(up), %r15
98	sbb	%r8, %r15
99	mov	%r15, (rp)
100	mov	8(up), %r15
101	sbb	%r9, %r15
102	mov	%r15, 8(rp)
103	sbb	R32(%rbp), R32(%rbp)	C save acy
104	lea	16(up), up
105	lea	16(rp), rp
106	jmp	L(b0)
107
C Lead-in for n mod 4 = 3: three limbs, then fall through into L(b0).
108L(b3):	mov	(vp), %r8
109	add	%r8, %r8
110	mov	8(vp), %r9
111	adc	%r9, %r9
112	mov	16(vp), %r10
113	adc	%r10, %r10
114	lea	24(vp), vp
115	sbb	R32(%rax), R32(%rax)	C save scy
116	add	R32(%rbp), R32(%rbp)	C restore acy
117	mov	(up), %r15
118	sbb	%r8, %r15
119	mov	%r15, (rp)
120	mov	8(up), %r15
121	sbb	%r9, %r15
122	mov	%r15, 8(rp)
123	mov	16(up), %r15
124	sbb	%r10, %r15
125	mov	%r15, 16(rp)
126	sbb	R32(%rbp), R32(%rbp)	C save acy
127	lea	24(up), up
128	lea	24(rp), rp
129
C Four-limb block, executed only when bit 2 of n is set.
130L(b0):	test	$4, R8(n)
131	jz	L(skp)
132	add	R32(%rax), R32(%rax)	C restore scy
133	mov	(vp), %r8
134	adc	%r8, %r8
135	mov	8(vp), %r9
136	adc	%r9, %r9
137	mov	16(vp), %r10
138	adc	%r10, %r10
139	mov	24(vp), %r11
140	adc	%r11, %r11
141	lea	32(vp), vp
142	sbb	R32(%rax), R32(%rax)	C save scy
143	add	R32(%rbp), R32(%rbp)	C restore acy
144	mov	(up), %r15
145	sbb	%r8, %r15
146	mov	%r15, (rp)
147	mov	8(up), %r15
148	sbb	%r9, %r15
149	mov	%r15, 8(rp)
150	mov	16(up), %r15
151	sbb	%r10, %r15
152	mov	%r15, 16(rp)
153	mov	24(up), %r15
154	sbb	%r11, %r15
155	mov	%r15, 24(rp)
156	lea	32(up), up
157	lea	32(rp), rp
158	sbb	R32(%rbp), R32(%rbp)	C save acy
159
C n still holds the original count here; fewer than 8 means all limbs are done.
160L(skp):	cmp	$8, n
161	jl	L(rtn)
162
C The eight-limb loop needs four more registers; save them only on this path.
163	push	%r12
164	push	%r13
165	push	%r14
166	push	%rbx
167	lea	-64(rp), rp	C pre-bias rp; L(top) adds 64 back before storing
168	jmp	L(x)	C enter at the loop test
169
170	ALIGN(16)
C Main loop: double eight vp limbs into %r8-%r15, then subtract them from
C eight up limbs using %rbp, %rbx, %r8 and %r9 as rotating scratch.
171L(top):	mov	(vp), %r8
172	add	R32(%rax), R32(%rax)	C restore scy
173	lea	64(vp), vp
174	adc	%r8, %r8
175	mov	-56(vp), %r9
176	adc	%r9, %r9
177	mov	-48(vp), %r10
178	adc	%r10, %r10
179	mov	-40(vp), %r11
180	adc	%r11, %r11
181	mov	-32(vp), %r12
182	adc	%r12, %r12
183	mov	-24(vp), %r13
184	adc	%r13, %r13
185	mov	-16(vp), %r14
186	adc	%r14, %r14
187	mov	-8(vp), %r15
188	adc	%r15, %r15
189	sbb	R32(%rax), R32(%rax)	C save scy
190	add	R32(%rbp), R32(%rbp)	C restore acy; it now lives in CF only
191	mov	(up), %rbp	C %rbp becomes scratch until acy is saved again
192	lea	64(rp), rp
193	mov	8(up), %rbx
194	sbb	%r8, %rbp
195	mov	32(up), %r8	C %r8 is free again once limb 0 is subtracted
196	mov	%rbp, (rp)
197	sbb	%r9, %rbx
198	mov	16(up), %rbp
199	mov	%rbx, 8(rp)
200	sbb	%r10, %rbp
201	mov	24(up), %rbx
202	mov	%rbp, 16(rp)
203	sbb	%r11, %rbx
204	mov	%rbx, 24(rp)
205	sbb	%r12, %r8
206	mov	40(up), %r9
207	mov	%r8, 32(rp)
208	sbb	%r13, %r9
209	mov	48(up), %rbp
210	mov	%r9, 40(rp)
211	sbb	%r14, %rbp
212	mov	56(up), %rbx
213	mov	%rbp, 48(rp)
214	sbb	%r15, %rbx
215	lea	64(up), up
216	mov	%rbx, 56(rp)
217	sbb	R32(%rbp), R32(%rbp)	C save acy
218L(x):	sub	$8, n	C loop while at least eight limbs were left
219	jge	L(top)
220
221L(end):	pop	%rbx
222	pop	%r14
223	pop	%r13
224	pop	%r12
225L(rtn):
226	add	R32(%rbp), R32(%rax)	C both are 0 or -1 masks, so eax = -(scy + acy)
227	neg	R32(%rax)	C return scy + acy
228
229	pop	%r15
230	pop	%rbp
231	FUNC_EXIT()
232	ret
233EPILOGUE()
C mpn_sublsh1_nc -- carry-in entry point that shares the mpn_sublsh1_n body.
C
C The carry-in is read from %r8, turned into the 0 or -1 acy mask in %ebp,
C and control then joins mpn_sublsh1_n at L(ent).  The two pushes here mirror
C the ones at the top of mpn_sublsh1_n, whose pops, FUNC_EXIT and ret are
C reused by this entry.
C NOTE(review): neg sets CF for any nonzero %r8, so every nonzero carry-in is
C treated as a borrow of exactly 1 -- confirm callers never rely on more.
234PROLOGUE(mpn_sublsh1_nc)
235	FUNC_ENTRY(4)
C Under IFDOS the carry-in is first loaded into %r8 from the stack.
C NOTE(review): the 56 offset presumably accounts for what FUNC_ENTRY pushed
C -- confirm against the macro definition.
236IFDOS(`	mov	56(%rsp), %r8	')
237	push	%rbp
238	push	%r15
239	neg	%r8			C set CF
240	sbb	R32(%rbp), R32(%rbp)	C save acy
241	jmp	L(ent)	C continue in mpn_sublsh1_n with acy preset
242EPILOGUE()
243