1dnl  x86-64 mpn_lshift optimized for Pentium 4.
2
3dnl  Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 2.5
25C K10:		 ?
26C P4:		 3.29
27C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
28C P6-28 (Atom):	14.3
29
C INPUT PARAMETERS
define(`rp',`%rdi')		C destination limb pointer
define(`up',`%rsi')		C source limb pointer
define(`n',`%rdx')		C limb count (mpn convention: n >= 1 -- not checked here)
define(`cnt',`%cl')		C shift count; presumably 1 <= cnt <= 63 per mpn_lshift contract -- TODO confirm
35
ASM_START()
	TEXT
	ALIGN(32)
C mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} left by cnt bits, store the low n result limbs at {rp,n},
C and return the bits shifted out of the top limb.  Processes limbs from
C high addresses downward.  Uses MMX (mm0..mm5); ends with emms.
C NOTE(review): assumes 1 <= cnt <= 63 (the 64-cnt complement below is
C wrong for cnt = 0) and n >= 1 -- standard mpn conventions, not checked.
PROLOGUE(mpn_lshift)
	mov	-8(up,n,8), %rax	C rax = top source limb up[n-1]
	movd	%ecx, %mm4		C mm4 = cnt, the left-shift count
	neg	%ecx			C put rsh count in cl
	and	$63, %ecx		C ecx = 64-cnt (for 1 <= cnt <= 63)
	movd	%ecx, %mm5		C mm5 = complementary right-shift count

	lea	1(n), %r8d		C r8d = n+1, for mod-4 unroll dispatch

	shr	%cl, %rax		C function return value

	and	$3, %r8d		C dispatch on (n+1) mod 4
	je	L(rol)			C jump for n = 3, 7, 11, ...

C Peel limbs so that L(rol) is entered with n == 3 (mod 4), or with the
C single bottom limb left (handled at L(ast)).
	dec	%r8d
	jne	L(1)
C	n = 4, 8, 12, ...
	movq	-8(up,n,8), %mm2	C one-limb step: rp[n-1] =
	psllq	%mm4, %mm2		C   (up[n-1] << cnt)
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0		C   | (up[n-2] >> (64-cnt))
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	%r8d
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
	movq	-8(up,n,8), %mm2	C same one-limb step as above
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, -8(rp,n,8)
	dec	n
L(1x):
	cmp	$1, n
	je	L(ast)			C only the bottom limb remains
C Two-limb step: produce rp[n-1] and rp[n-2].
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -8(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	sub	$2, n

C Software-pipelined main loop, 4 limbs/iteration: mm2/mm3 carry the
C left-shifted halves of the next two result limbs across iterations.
L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(32)
L(top):
	C finish stuff from lsh block
	movq	16(up,n,8), %mm0
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	psrlq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	-8(up,n,8), %mm1
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	C start two new rsh
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block
	movq	8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	por	%mm2, %mm0
	psllq	%mm4, %mm3
	movq	-8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	-16(up,n,8), %mm3
	movq	%mm0, 8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start two new lsh
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(top)			C				      2
L(end):
C Drain the pipeline: complete the two result limbs still in flight in
C mm2/mm3 (n has gone negative here; the offsets compensate).
	movq	16(up,n,8), %mm0
	psrlq	%mm5, %mm0
	por	%mm0, %mm2
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 24(rp,n,8)
	movq	%mm3, 16(rp,n,8)

C Bottom limb: shifted left only, no lower limb to pull bits from.
L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C clear MMX state before returning to x87-using code
	ret
EPILOGUE()
149