1dnl  Intel Atom mpn_lshift -- mpn left shift.
2
3dnl  Copyright 2011 Free Software Foundation, Inc.
4
5dnl  Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
36C			unsigned cnt);
37
38C				  cycles/limb
39C				cnt!=1	cnt==1
40C P5
41C P6 model 0-8,10-12
42C P6 model 9  (Banias)
43C P6 model 13 (Dothan)
44C P4 model 0  (Willamette)
45C P4 model 1  (?)
46C P4 model 2  (Northwood)
47C P4 model 3  (Prescott)
48C P4 model 4  (Nocona)
49C Intel Atom			 5	 2.5
50C AMD K6
51C AMD K7
52C AMD K8
53C AMD K10
54
C Stack parameter slots for mpn_lshift (dst, src, size, cnt).  defframe is
C defined outside this file; its second argument is presumably the byte
C offset of the slot from the stack pointer at function entry, corrected by
C the current FRAME value -- TODO confirm against the macro definition.
55defframe(PARAM_CNT, 16)
56defframe(PARAM_SIZE,12)
57defframe(PARAM_SRC,  8)
58defframe(PARAM_DST,  4)
59
60dnl  re-use parameter space
C Each parameter is read exactly once in the prologue of mpn_lshift.  After
C that its slot is recycled: three slots hold saved registers (%esi, %ebx,
C %ebp) and one holds the iteration counter of the general shift loop.  A
C slot must therefore never be written before the parameter it aliases has
C been loaded.
61define(SAVE_UP,`PARAM_CNT')
62define(VAR_COUNT,`PARAM_SIZE')
63define(SAVE_EBX,`PARAM_SRC')
64define(SAVE_EBP,`PARAM_DST')
65
C Register aliases used by the code below: rp is the destination pointer,
C up is the source pointer, cnt is the shift count.  cnt has to live in
C %ecx because the variable shifts take their count from %cl.
66define(`rp',  `%edi')
67define(`up',  `%esi')
68define(`cnt',  `%ecx')
69
C mpn_lshift: shift the size-limb operand at src left by cnt bits, store the
C result at dst, and return in %eax the bits shifted out of the top limb.
C Limbs are 4 bytes wide here (every address step below is a multiple of 4).
C
C Two code paths:
C   L(normal)   any cnt.  Works from the most significant limb downwards,
C               producing two result limbs per loop iteration.
C   L(special)  cnt == 1 only.  Works from the least significant limb
C               upwards, doubling each limb with adc so that the carry flag
C               moves the top bit of one limb into the next, four limbs per
C               loop iteration.
C
C A source limb is loaded before size is examined, so size >= 1 is assumed.
C NOTE(review): the neg/shr pairing in L(normal) also appears to assume
C 1 <= cnt <= 31 -- confirm against the documented mpn_lshift contract.
C
C %esi and %edi are saved and restored on both paths.  %ebx and %ebp are
C saved and restored only on the normal path, the only one that uses them.
70ASM_START()
71	TEXT
72	ALIGN(8)
73deflit(`FRAME',0)
74PROLOGUE(mpn_lshift)
C Load the arguments.  %esi is stored into the cnt slot only after cnt has
C been read, and %edi is pushed before dst is read (FRAME_pushl presumably
C tells the frame macros that the stack pointer moved by 4 -- TODO confirm).
75	mov	PARAM_CNT, cnt
76	mov	PARAM_SIZE, %edx
77	mov	up, SAVE_UP
78	mov	PARAM_SRC, up
79	push	rp			FRAME_pushl()
80	mov	PARAM_DST, rp
81
82C We can use faster code for shift-by-1 under certain conditions.
C The shift-by-1 loop stores limbs in ascending address order, so it is
C entered only when dst is at or below src, or when dst starts at or beyond
C the end of src.  Any other overlap falls through to L(normal), which
C stores in descending address order.
83	cmp	$1,cnt
84	jne	L(normal)
85	cmpl	rp, up
86	jnc	L(special)		C jump if s_ptr >= res_ptr
87	leal	(up,%edx,4),%eax	C one past the last source limb
88	cmpl	%eax,rp
89	jnc	L(special)		C jump if res_ptr >= s_ptr + size
90
C General path.  Throughout, %cl alternates between cnt and -cnt (toggled by
C neg cnt).  The shifts use only the low 5 bits of %cl, so shl with cnt and
C shr with -cnt give the two halves of a limb that are or-ed into
C neighbouring result limbs.
91L(normal):
92	lea	-4(up,%edx,4), up	C up = address of top source limb
93	mov	%ebx, SAVE_EBX
94	lea	-4(rp,%edx,4), rp	C rp = address of top result limb
95
96	shr	%edx			C size / 2, CF = size is odd
97	mov	(up), %eax		C top source limb (mov keeps CF)
98	mov	%edx, VAR_COUNT		C loop counter lives in memory
99	jnc	L(evn)			C jump if size is even
100
C Odd size: deal with the top limb on its own first.
101	mov	%eax, %ebx
102	shl	%cl, %ebx		C low part of the top result limb
103	neg	cnt
104	shr	%cl, %eax		C bits shifted out = return value
105	test	%edx, %edx
106	jnz	L(gt1)			C jump if size > 1
107	mov	%ebx, (rp)		C size == 1: single result limb
108	jmp	L(quit)
109
C Odd size > 1: enter the loop at its midpoint.  The return value is parked
C on the stack because %eax is needed as a work register.
C NOTE(review): this push has no FRAME_pushl().  That looks intentional: the
C text that follows, L(evn), still runs with only %edi pushed, and the
C FRAME_pushl() just before L(top) accounts for the extra word on both
C routes into the loop.
110L(gt1):	mov	%ebp, SAVE_EBP
111	push	%eax
112	mov	-4(up), %eax		C second limb from the top
113	mov	%eax, %ebp		C keep an unshifted copy
114	shr	%cl, %eax		C its high part, for the limb above
115	jmp	L(lo1)
116
C Even size: prime the loop state with the top two source limbs.
117L(evn):	mov	%ebp, SAVE_EBP
118	neg	cnt
119	mov	%eax, %ebp		C top limb, unshifted
120	mov	-4(up), %edx
121	shr	%cl, %eax		C bits shifted out = return value
122	mov	%edx, %ebx		C second limb, unshifted
123	shr	%cl, %edx		C second limb, high part
124	neg	cnt			C back to +cnt
125	decl	VAR_COUNT		C sets ZF when size == 2
126	lea	4(rp), rp		C lea does not touch the flags
127	lea	-4(up), up
128	jz	L(end)			C size == 2: no loop iterations
129	push	%eax			FRAME_pushl()
130
C Main loop, two result limbs per pass.  On entry to L(top): %cl = cnt,
C %ebp = upper source limb (unshifted), %ebx = next lower source limb
C (unshifted), %edx = that lower limb shifted right by 32-cnt.
131	ALIGN(8)
132L(top):	shl	%cl, %ebp
133	or	%ebp, %edx		C finished result limb
134	shl	%cl, %ebx
135	neg	cnt
136	mov	-4(up), %eax		C fetch next lower source limb
137	mov	%eax, %ebp
138	mov	%edx, -4(rp)
139	shr	%cl, %eax
140	lea	-8(rp), rp
141L(lo1):	mov	-8(up), %edx		C fetch the source limb below that
142	or	%ebx, %eax		C finished result limb
143	mov	%edx, %ebx
144	shr	%cl, %edx
145	lea	-8(up), up
146	neg	cnt
147	mov	%eax, (rp)
148	decl	VAR_COUNT
149	jg	L(top)			C loop while the counter stays positive
150
151	pop	%eax			FRAME_popl()
C Wind down: combine the last pair.  The lowest result limb receives only
C the left-shifted lowest source limb, so zeros enter at the bottom.
152L(end):
153	shl	%cl, %ebp
154	shl	%cl, %ebx
155	or	%ebp, %edx
156	mov	SAVE_EBP, %ebp
157	mov	%edx, -4(rp)
158	mov	%ebx, -8(rp)
159
C Common exit of the general path; %eax holds the return value.
160L(quit):
161	mov	SAVE_UP, up
162	mov	SAVE_EBX, %ebx
163	pop	rp			FRAME_popl()
164	ret
165
C Shift-by-1 path.  Each limb is doubled with adc (add for the very first
C limb on one route), so CF carries the top bit of one limb into the bottom
C of the next.  Every instruction between two adc must leave CF alone; the
C code uses only mov, lea, dec and jumps there.
C
C %eax = (size + 3) / 4 is the loop counter.  Entering at L(goloop) with
C counter k processes 4k - 3 limbs; entering at L(sentry) processes 4k - 1.
C The dispatch below picks the entry point, after first handling one limb
C separately when size is even.
166L(special):
167deflit(`FRAME',4)
168	lea	3(%edx), %eax		C size + 3
169	dec	%edx			C size - 1
170	mov	(up), %ecx		C lowest source limb
171	shr	$2, %eax		C (size + 3) / 4
172	and	$3, %edx		C (size - 1) % 4, also clears CF
173	jz	L(goloop)		C jump if  size == 1 (mod 4)
174	shr	%edx			C CF = low bit of (size - 1) % 4
175	jnc	L(odd)			C jump if  size == 3 (mod 4), CF is 0
176
C size is even: shift the lowest limb here and fetch the next one.
177	add	%ecx, %ecx
178	lea	4(up), up
179	mov	%ecx, (rp)
180	mov	(up), %ecx
181	lea	4(rp), rp
182
183	dec	%edx			C dec leaves CF from the add intact
184	jnz	L(goloop)		C jump if  size == 2 (mod 4)
C Bias both pointers by -8 so the +8 and +12 offsets used at L(sentry)
C address the current limb and the one after it.
185L(odd):	lea	-8(up), up
186	lea	-8(rp), rp
187	jmp	L(sentry)		C reached if size == 0 or 3 (mod 4)
188
189L(sloop):
190	adc	%ecx, %ecx
191	mov	4(up), %edx
192	mov	%ecx, (rp)
193	adc	%edx, %edx
194	mov	8(up), %ecx
195	mov	%edx, 4(rp)
196L(sentry):
197	adc	%ecx, %ecx
198	mov	12(up), %edx
199	mov	%ecx, 8(rp)
200	adc	%edx, %edx
201	lea	16(up), up
202	mov	%edx, 12(rp)
203	lea	16(rp), rp
204	mov	(up), %ecx		C limb for the next pass or for L(squit)
205L(goloop):
206	decl	%eax			C dec leaves CF intact
207	jnz	L(sloop)
208
C Last limb.  %eax is zero here (the counter just ran out), so the final adc
C turns it into the bit shifted out of the top limb, the return value.
209L(squit):
210	adc	%ecx, %ecx
211	mov	%ecx, (rp)
212	adc	%eax, %eax
213
214	mov	SAVE_UP, up
215	pop	rp			FRAME_popl()
216	ret
217EPILOGUE()
218ASM_END()
219