1dnl  x86-64 mpn_rshift optimized for Pentium 4.
2
3dnl  Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 2.5
25C K10:		 ?
26C P4:		 3.29
27C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
28C P6-28 (Atom):	14.3
29
C INPUT PARAMETERS (System V AMD64 argument registers)
define(`rp',`%rdi')	C result pointer (arg 1)
define(`up',`%rsi')	C source pointer (arg 2)
define(`n',`%rdx')	C limb count (arg 3)
define(`cnt',`%cl')	C shift count, low byte of arg 4 (%rcx)
35
ASM_START()
	TEXT
	ALIGN(32)

C mp_limb_t mpn_rshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift {up,n} right by cnt bits, store the n-limb result at {rp,n}, and
C return the bits shifted out at the low end, positioned as the high bits
C of the return limb.  Each result limb is
C	rp[i] = (up[i] >> cnt) | (up[i+1] << (64-cnt))
C computed with MMX: mm4 holds cnt, mm5 holds 64-cnt.  The main loop is
C 4-way unrolled and software-pipelined; the entry code peels 0-3 limbs
C so the count left for the loop is a multiple of 4.
C NOTE(review): presumably 1 <= cnt <= 63, as usual for mpn shift
C routines (cnt = 0 would make the psllq-by-64 complement paths wrong)
C -- confirm against callers.
PROLOGUE(mpn_rshift)
	mov	(up), %rax		C low limb; supplies the return bits
	movd	%ecx, %mm4		C mm4 = cnt, the right-shift count
	neg	%ecx			C put lsh count in cl
	and	$63, %ecx		C cl = 64-cnt (mod 64)
	movd	%ecx, %mm5		C mm5 = 64-cnt, the left-shift count

	lea	-8(up,n,8), up		C up := &up[n-1] (top limb)
	lea	-8(rp,n,8), rp		C rp := &rp[n-1]
	lea	1(n), %r8d		C r8d = n+1, to classify n mod 4
	neg	n			C index runs from -n up toward 0

	shl	%cl, %rax		C function return value

	and	$3, %r8d		C dispatch on (n+1) mod 4
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	%r8d
	jne	L(1)
C	n = 4, 8, 12, ...		peel one limb, then enter main flow
	movq	8(up,n,8), %mm2		C mm2 = up[0]
	psrlq	%mm4, %mm2		C mm2 = up[0] >> cnt
	movq	16(up,n,8), %mm0	C mm0 = up[1]
	psllq	%mm5, %mm0		C mm0 = up[1] << (64-cnt)
	por	%mm0, %mm2		C combine the two halves
	movq	%mm2, 8(rp,n,8)		C rp[0] done
	inc	n
	jmp	L(rol)

L(1):	dec	%r8d
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...		peel one limb, then fall into L(1x)
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	%mm2, 8(rp,n,8)
	inc	n
L(1x):
	cmp	$-1, n			C only the top limb left?
	je	L(ast)
	C peel two limbs so the remaining count is a multiple of 4
	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3
	movq	16(up,n,8), %mm0
	movq	24(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, 8(rp,n,8)
	movq	%mm3, 16(rp,n,8)
	add	$2, n

C Pipeline preamble: start the rsh halves of the first two result limbs.
L(rol):	movq	8(up,n,8), %mm2
	psrlq	%mm4, %mm2
	movq	16(up,n,8), %mm3
	psrlq	%mm4, %mm3

	add	$4, n			C				      4
	jb	L(end)			C				      2
	ALIGN(32)
L(top):
	C combine the pending rsh halves (mm2,mm3) with their lsh parts
	movq	-16(up,n,8), %mm0
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	psllq	%mm5, %mm1
	movq	(up,n,8), %mm0
	por	%mm1, %mm3
	movq	8(up,n,8), %mm1
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)
	C start the lsh halves of the next two result limbs
	C (NOTE(review): the original comments here labelled the psllq
	C group "rsh" and the psrlq group below "lsh" -- swapped relative
	C to the opcodes; psllq is the left shift, psrlq the right shift)
	psllq	%mm5, %mm0
	psllq	%mm5, %mm1

	C combine them with the corresponding rsh parts
	movq	-8(up,n,8), %mm2
	movq	(up,n,8), %mm3
	psrlq	%mm4, %mm2
	por	%mm2, %mm0
	psrlq	%mm4, %mm3
	movq	8(up,n,8), %mm2
	por	%mm3, %mm1
	movq	16(up,n,8), %mm3
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, (rp,n,8)
	C start the rsh halves for the next iteration
	add	$4, n
	psrlq	%mm4, %mm2
	psrlq	%mm4, %mm3

	jae	L(top)			C				      2
L(end):
	C drain the pipeline: finish the last two in-flight limbs
	movq	-16(up,n,8), %mm0
	psllq	%mm5, %mm0
	por	%mm0, %mm2
	movq	-8(up,n,8), %mm1
	psllq	%mm5, %mm1
	por	%mm1, %mm3
	movq	%mm2, -24(rp,n,8)
	movq	%mm3, -16(rp,n,8)

C Top limb: a pure right shift; nothing shifts in from above.
L(ast):	movq	(up), %mm2
	psrlq	%mm4, %mm2
	movq	%mm2, (rp)
	emms				C clear MMX state for subsequent x87 use
	ret
EPILOGUE()
152