1dnl  Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)
2
3dnl  Contributed to the GNU project by Marco Bodrato.
4
5dnl  Copyright 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
36C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
37C				mp_limb_t carry);
38C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
39C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
40C				mp_signed_limb_t borrow);
41
C Argument offsets for the in-place (_ip1) entry points, which take
C (dst, src, size) plus a carry-or-borrow word for the _nc variants.
C The dst and src slots are also read by the general entry points further
C down, where they hold dst and src1.
42defframe(PARAM_CORB,	16)
43defframe(PARAM_SIZE,	12)
44defframe(PARAM_SRC,	 8)
45defframe(PARAM_DST,	 4)
46
47C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
48C                          mp_size_t size);
49C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
50C                           mp_size_t size, mp_limb_t carry);
51C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
52C                          mp_size_t size);
53C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
54C                           mp_size_t size, mp_limb_t borrow);
55
56C If src1 == dst, the general entry points branch into the in-place (_ip1) code.
57
58C					cycles/limb
59C				dst!=src1,src2	dst==src1
60C P5
61C P6 model 0-8,10-12
62C P6 model 9  (Banias)
63C P6 model 13 (Dothan)
64C P4 model 0  (Willamette)
65C P4 model 1  (?)
66C P4 model 2  (Northwood)
67C P4 model 3  (Prescott)
68C P4 model 4  (Nocona)
69C Intel Atom			 7		 6
70C AMD K6
71C AMD K7
72C AMD K8
73C AMD K10
74
C Argument offsets for the general three-operand entry points.  Only the
C slots beyond dst and src1 get names of their own; dst and src1 are read
C through PARAM_DST and PARAM_SRC.
75defframe(GPARAM_CORB,	20)
76defframe(GPARAM_SIZE,	16)
77defframe(GPARAM_SRC2,	12)
78
79dnl  re-use parameter space
C Three argument slots double as save areas: the code stores ebp, ebx and
C up into them and reloads those registers from them before returning.
C Note that SAVE_EBP is offset 12, the same slot as GPARAM_SRC2, so src2
C has to be loaded before ebp is saved.
80define(SAVE_EBP,`PARAM_SIZE')
81define(SAVE_EBX,`PARAM_SRC')
82define(SAVE_UP,`PARAM_DST')
83
C M is used as the scale factor of the lea instructions that build each
C shifted limb; presumably 1 << LSH (m4_lshift and LSH are defined outside
C this file).
84define(M, eval(m4_lshift(1,LSH)))
C Register names for the destination pointer and the running source pointer.
85define(`rp',  `%edi')
86define(`up',  `%esi')
87
C Start of the generated code: select the text section and align the first
C entry point to 8 bytes.
88ASM_START()
89	TEXT
90	ALIGN(8)
91
C M4_ip_function_c (dst, src, size, corb) -- in-place entry point taking a
C carry or borrow word.
C
C Splits corb into the two pieces consumed by the loop shared with
C M4_ip_function, then jumps into it:
C   ecx  the part added into the lowest shifted limb through lea
C   edx  bit LSH of corb, one further unit delivered through the carry flag
C
C edx has to be handed over as 0 or -1, not 0 or 1.  The loop turns it back
C into CF with "add edx,edx" (CF = bit 31) on every path except the
C single-limb one, which uses "shr edx" (CF = bit 0).  Only an all-ones
C value sets CF under both, so a plain 1 would be honoured for size 1 and
C dropped for every longer operand.
92PROLOGUE(M4_ip_function_c)
93deflit(`FRAME',0)
94	movl	PARAM_CORB, %ecx
95	movl	%ecx, %edx
96	shr	$LSH, %edx
97	andl	$1, %edx		C edx = bit LSH of corb
98	M4_opp	%edx, %ecx		C presumably removes that unit from ecx; M4_opp comes from the including file
	neg	%edx			C 1 becomes -1, the mask form the loop expects
99	jmp	L(start_nc)
100EPILOGUE()
101
C M4_ip_function (dst, src, size) -- apply M4_inst to dst[] with src[]
C shifted left, in place, two limbs per loop iteration.
C
C Returns in eax the bits shifted out of the top source limb plus the final
C carry flag left by M4_inst.  M4_inst, LSH and RSH are supplied by the
C including file.
C
C Register use after the prologue:
C   rp        destination limb pointer
C   up        source limb pointer
C   ebx       iterations left
C   eax, ecx  alternately the limb just loaded and, after shr RSH, the bits
C             it spills into the next limb
C   ebp       shifted limb about to be applied
C   edx       carry flag parked between iterations as 0 or -1
C
C lea and mov leave the flags alone, which is what lets CF survive from one
C M4_inst to the next inside an iteration.
102PROLOGUE(M4_ip_function)
103deflit(`FRAME',0)
104
105	xor	%ecx, %ecx		C nothing carried into the lowest limb
106	xor	%edx, %edx		C no carry in
C The _c entry point joins here with ecx and edx already set.
107L(start_nc):
108	push	rp			FRAME_pushl()
109	mov	PARAM_DST, rp
110	mov	up, SAVE_UP		C dst slot has been read, reuse it as save area
111	mov	PARAM_SRC, up
112	mov	%ebx, SAVE_EBX		C src slot has been read, reuse it as save area
113	mov	PARAM_SIZE, %ebx	C size
C The general entry point jumps here when dst == src1, with rp, up, ebx and
C the two save slots above already set up.
114L(inplace):
115	incl	%ebx			C size + 1
116	shr	%ebx			C (size+1)\2
117	mov	%ebp, SAVE_EBP
118	jnc	L(entry)		C size odd
119
C Even size: the lowest limb goes through the second half of the loop body,
C so rp is biased by -4 to make 4(rp) address it.
120	add	%edx, %edx		C size even
121	mov	%ecx, %ebp		C ebp = bits carried in
122	mov	(up), %ecx		C lowest source limb
123	lea	-4(rp), rp
124	lea	(%ebp,%ecx,M), %eax	C shifted limb
125	lea	4(up), up
126	jmp	L(enteven)
127
128	ALIGN(16)
129L(oop):
130	lea	(%ecx,%eax,M), %ebp	C first shifted limb of the pair
131	shr	$RSH, %eax		C bits spilling into the next limb
132	mov	4(up), %ecx
133	add	%edx, %edx		C CF = bit 31 of edx, restores the parked carry
134	lea	8(up), up
135	M4_inst	%ebp, (rp)
136	lea	(%eax,%ecx,M), %eax	C second shifted limb of the pair
137
138L(enteven):
139	M4_inst	%eax, 4(rp)
140	lea	8(rp), rp
141
142	sbb	%edx, %edx		C park CF as 0 or -1, the shr below clobbers it
143	shr	$RSH, %ecx		C bits spilling into the next limb
144
145L(entry):
146	mov	(up), %eax
147	decl	%ebx
148	jnz	L(oop)
149
C Last limb; also the only limb when size is 1.
150	lea	(%ecx,%eax,M), %ebp
151	shr	$RSH, %eax		C bits shifted out of the top limb
152	shr	%edx			C CF = bit 0 of edx
153	M4_inst	%ebp, (rp)
154	mov	SAVE_UP, up
155	adc	$0, %eax		C return value = shifted-out bits + CF
156	mov	SAVE_EBP, %ebp
157	mov	SAVE_EBX, %ebx
158	pop	rp			FRAME_popl()
159	ret
160EPILOGUE()
161
C M4_function_c (dst, src1, src2, size, corb) -- general entry point taking
C a carry or borrow word.
C
C Splits corb into the two pieces consumed by the code shared with
C M4_function, then jumps into it:
C   ecx  the part added into the lowest shifted limb through lea
C   edx  bit LSH of corb, one further unit delivered through the carry flag
C
C edx has to be handed over as 0 or -1, not 0 or 1.  The loops turn it back
C into CF with "add edx,edx" (CF = bit 31) on every path except the
C single-limb one, which uses "shr edx" (CF = bit 0).  Only an all-ones
C value sets CF under both, so a plain 1 would be honoured for size 1 and
C dropped for every longer operand.
162PROLOGUE(M4_function_c)
163deflit(`FRAME',0)
164	movl	GPARAM_CORB, %ecx
165	movl	%ecx, %edx
166	shr	$LSH, %edx
167	andl	$1, %edx		C edx = bit LSH of corb
168	M4_opp	%edx, %ecx		C presumably removes that unit from ecx; M4_opp comes from the including file
	neg	%edx			C 1 becomes -1, the mask form the loops expect
169	jmp	L(generic_nc)
170EPILOGUE()
171
C M4_function (dst, src1, src2, size) -- dst[] = src1[] M4_inst (src2[]
C shifted left), two limbs per loop iteration.
C
C Returns in eax the bits shifted out of the top src2 limb plus the final
C carry flag left by M4_inst.  When dst == src1 the work is handed to the
C in-place loop at L(inplace).
C
C Register use in the general loop:
C   rp        destination limb pointer
C   ebx       src1 limb pointer
C   up        src2 limb pointer, the operand being shifted
C   eax, ecx  alternately the src2 limb just loaded and, after shr RSH, the
C             bits it spills into the next limb
C   ebp       shifted limb about to be applied
C   edx       src1 limb being combined, and between iterations the carry
C             flag parked as 0 or -1
C The iteration count lives in the GPARAM_SIZE stack slot.
C
C lea and mov leave the flags alone, which is what lets CF survive the
C loads placed between restoring it and the next M4_inst.
172PROLOGUE(M4_function)
173deflit(`FRAME',0)
174
175	xor	%ecx, %ecx		C nothing carried into the lowest limb
176	xor	%edx, %edx		C no carry in
C The _c entry point joins here with ecx and edx already set.
177L(generic_nc):
178	push	rp			FRAME_pushl()
179	mov	PARAM_DST, rp
180	mov	up, SAVE_UP		C dst slot has been read, reuse it as save area
181	mov	PARAM_SRC, up		C src1
182	cmp	rp, up			C dst == src1 ?
183	mov	%ebx, SAVE_EBX		C mov keeps the flags of the cmp
184	jne	L(general)
185	mov	GPARAM_SIZE, %ebx	C size
186	mov	GPARAM_SRC2, up		C src2 becomes the source of the in-place loop
187	jmp	L(inplace)
188
189L(general):
190	mov	GPARAM_SIZE, %eax	C size
191	mov	%ebx, SAVE_EBX		C repeats the store already done before the branch
192	incl	%eax			C size + 1
193	mov	up, %ebx		C ebx = src1
194	mov	GPARAM_SRC2, up		C up = src2, read before its slot is reused below
195	shr	%eax			C (size+1)\2
196	mov	%ebp, SAVE_EBP		C overwrites the src2 argument slot
197	mov	%eax, GPARAM_SIZE	C iteration count is kept on the stack
198	jnc	L(entry2)		C size odd
199
C Even size: the lowest limb goes through the second half of the loop body,
C so rp and ebx are biased by -4 to make 4(rp) and 4(ebx) address it.
200	add	%edx, %edx		C size even
201	mov	%ecx, %ebp		C ebp = bits carried in
202	mov	(up), %ecx		C lowest src2 limb
203	lea	-4(rp), rp
204	lea	-4(%ebx), %ebx
205	lea	(%ebp,%ecx,M), %eax	C shifted limb
206	lea	4(up), up
207	jmp	L(enteven2)
208
209	ALIGN(16)
210L(oop2):
211	lea	(%ecx,%eax,M), %ebp	C first shifted limb of the pair
212	shr	$RSH, %eax		C bits spilling into the next limb
213	mov	4(up), %ecx
214	add	%edx, %edx		C CF = bit 31 of edx, restores the parked carry
215	lea	8(up), up
216	mov	(%ebx), %edx		C src1 limb, edx is free once CF is restored
217	M4_inst	%ebp, %edx
218	lea	(%eax,%ecx,M), %eax	C second shifted limb of the pair
219	mov	%edx, (rp)
220L(enteven2):
221	mov	4(%ebx), %edx		C src1 limb
222	lea	8(%ebx), %ebx
223	M4_inst	%eax, %edx
224	mov	%edx, 4(rp)
225	sbb	%edx, %edx		C park CF as 0 or -1, the shr below clobbers it
226	shr	$RSH, %ecx		C bits spilling into the next limb
227	lea	8(rp), rp
228L(entry2):
229	mov	(up), %eax
230	decl	GPARAM_SIZE
231	jnz	L(oop2)
232
C Last limb; also the only limb when size is 1.
233	lea	(%ecx,%eax,M), %ebp
234	shr	$RSH, %eax		C bits shifted out of the top limb
235	shr	%edx			C CF = bit 0 of edx
236	mov	(%ebx), %edx		C src1 limb
237	M4_inst	%ebp, %edx
238	mov	%edx, (rp)
239	mov	SAVE_UP, up
240	adc	$0, %eax		C return value = shifted-out bits + CF
241	mov	SAVE_EBP, %ebp
242	mov	SAVE_EBX, %ebx
243	pop	rp			FRAME_popl()
244	ret
245EPILOGUE()
246
C Counterpart of the ASM_START() that precedes the first entry point; both
C macros are defined outside this file.
247ASM_END()
248