1dnl  AMD64 mpn_rshift -- mpn right shift.
2
3dnl  Copyright 2003, 2005, 2009 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 2.375
25C K10:		 2.375
26C P4:		 8
27C P6-15 (Core2): 2.11
28C P6-28 (Atom):	 5.75
29
30
31C INPUT PARAMETERS
C rp:  base of the result area; every store in the routine goes through it.
32define(`rp',	`%rdi')
C up:  base of the source operand; every load in the routine goes through it.
33define(`up',	`%rsi')
C n:   number of limbs; used as an index scaled by 8, so a limb is 8 bytes.
34define(`n',	`%rdx')
C cnt: shift count.  The code below names %rcx directly instead of using
C      this alias.
35define(`cnt',	`%rcx')
36
37ASM_START()
38	TEXT
39	ALIGN(32)
C mpn_rshift -- shift the n-limb operand at up right by the count in %rcx
C and store the n result limbs at rp.
C
C In:    rp = %rdi, up = %rsi, n = %rdx, shift count in %rcx (the shifts
C        only look at cl).
C Out:   %rax = up[0] shifted left by 64-count, i.e. the bits that drop off
C        the low end of the operand, left-justified in the limb.
C Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11, flags.  The stack is not
C        touched.
C
C Result limb i is (up[i] >> count) | (up[i+1] << (64-count)); the top limb
C is just up[n-1] >> count.  Only one count register is available, so cl is
C flipped between count and -count with neg.  A 64-bit shift uses the low six
C bits of cl, hence -count acts as 64-count.
C
C The main loop produces four limbs per pass.  The code in front of it
C emits 0, 1, 2 or 3 limbs so that the number of limbs still to do is 3 mod 4
C when the loop section is entered (n = 1 and n = 2 go straight to the
C last-limb code instead).
C
C NOTE(review): assumes n >= 1 and 1 <= count <= 63 -- confirm against the
C mpn documentation.  With count = 0 both shift amounts would be 0 and the
C two halves would simply be or-ed together.
40PROLOGUE(mpn_rshift)
41	neg	R32(%rcx)		C cl = -count, acts as lsh count 64-count
42	mov	(up), %rax
43	shl	R8(%rcx), %rax		C function return value
44	neg	R32(%rcx)		C cl = count again (rsh count)

C r8d = n+1; only its low two bits are used, to select the entry path.
46	lea	1(n), R32(%r8)

C Point up and rp at their last limb and negate n.  From here on
C 8(up,n,8) and 8(rp,n,8) address the next limb to read and to write, and n
C counts up towards zero.
48	lea	-8(up,n,8), up
49	lea	-8(rp,n,8), rp
50	neg	n

52	and	$3, R32(%r8)
53	je	L(rlx)			C jump for n = 3, 7, 11, ...

55	dec	R32(%r8)
56	jne	L(1)
57C	n = 4, 8, 12, ...
C	Emit one limb so that the remaining count is 3 mod 4.  r8 is free to be
C	reused as scratch from here on.
58	mov	8(up,n,8), %r10
59	shr	R8(%rcx), %r10
60	neg	R32(%rcx)		C put lsh count (64-count) in cl
61	mov	16(up,n,8), %r8
62	shl	R8(%rcx), %r8
63	or	%r8, %r10
64	mov	%r10, 8(rp,n,8)
65	inc	n
66	jmp	L(rll)

68L(1):	dec	R32(%r8)
69	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
70C	n = 2, 6, 10, 14, ...
C	Emit one limb so that the remaining count is 1 mod 4.
71	mov	8(up,n,8), %r10
72	shr	R8(%rcx), %r10
73	neg	R32(%rcx)		C put lsh count (64-count) in cl
74	mov	16(up,n,8), %r8
75	shl	R8(%rcx), %r8
76	or	%r8, %r10
77	mov	%r10, 8(rp,n,8)
78	inc	n
79	neg	R32(%rcx)		C put rsh count in cl
80L(1x):
C	Remaining count is 1 mod 4 and cl holds the rsh count.  If only the top
C	limb is left (n = -1) go straight to the last-limb code; otherwise emit
C	two limbs, which leaves a remaining count of 3 mod 4.
81	cmp	$-1, n
82	je	L(ast)
83	mov	8(up,n,8), %r10
84	shr	R8(%rcx), %r10
85	mov	16(up,n,8), %r11
86	shr	R8(%rcx), %r11
87	neg	R32(%rcx)		C put lsh count (64-count) in cl
88	mov	16(up,n,8), %r8
89	mov	24(up,n,8), %r9
90	shl	R8(%rcx), %r8
91	or	%r8, %r10
92	shl	R8(%rcx), %r9
93	or	%r9, %r11
94	mov	%r10, 8(rp,n,8)
95	mov	%r11, 16(rp,n,8)
96	add	$2, n

C Loop section entry: cl must hold the rsh count.  Start the right-shifted
C halves of the next two result limbs in r10 and r11.
98L(rll):	neg	R32(%rcx)		C put rsh count in cl
99L(rlx):	mov	8(up,n,8), %r10
100	shr	R8(%rcx), %r10
101	mov	16(up,n,8), %r11
102	shr	R8(%rcx), %r11

C Carry out of the add means only the last three limbs are left, so the loop
C body is skipped.
104	add	$4, n			C				      4
105	jb	L(end)			C				      2
106	ALIGN(16)
107L(top):
108	C finish the two results whose right-shifted halves are in r10, r11
109	neg	R32(%rcx)		C put lsh count (64-count) in cl
110	mov	-16(up,n,8), %r8
111	mov	-8(up,n,8), %r9
112	shl	R8(%rcx), %r8
113	or	%r8, %r10
114	shl	R8(%rcx), %r9
115	or	%r9, %r11
116	mov	%r10, -24(rp,n,8)
117	mov	%r11, -16(rp,n,8)
118	C start two new results with their left-shifted halves in r8, r9
119	mov	(up,n,8), %r8
120	mov	8(up,n,8), %r9
121	shl	R8(%rcx), %r8
122	shl	R8(%rcx), %r9

124	C finish those two results with their right-shifted halves
125	neg	R32(%rcx)		C put rsh count in cl
126	mov	-8(up,n,8), %r10
127	mov	0(up,n,8), %r11
128	shr	R8(%rcx), %r10
129	or	%r10, %r8
130	shr	R8(%rcx), %r11
131	or	%r11, %r9
132	mov	%r8, -8(rp,n,8)
133	mov	%r9, 0(rp,n,8)
134	C start two new results with their right-shifted halves in r10, r11
135	mov	8(up,n,8), %r10
136	mov	16(up,n,8), %r11
137	shr	R8(%rcx), %r10
138	shr	R8(%rcx), %r11

140	add	$4, n
141	jae	L(top)			C				      2
C Three limbs are left and up, rp point at the last one.  r10 and r11 hold
C the right-shifted halves of results n-3 and n-2; complete and store them.
142L(end):
143	neg	R32(%rcx)		C put lsh count (64-count) in cl
144	mov	-8(up), %r8
145	shl	R8(%rcx), %r8
146	or	%r8, %r10
147	mov	(up), %r9
148	shl	R8(%rcx), %r9
149	or	%r9, %r11
150	mov	%r10, -16(rp)
151	mov	%r11, -8(rp)

153	neg	R32(%rcx)		C put rsh count in cl
C Top limb: nothing is shifted in from above, so the vacated bits are zero.
154L(ast):	mov	(up), %r10
155	shr	R8(%rcx), %r10
156	mov	%r10, (rp)
157	ret
158EPILOGUE()
159