1dnl  x86-64 mpn_rshift optimized for "Core 2".
2
3dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 4.25
25C K10:		 4.25
26C P4:		14.7
27C P6 core2:	 1.27
28C P6 corei7:	 1.5
29
30
31C INPUT PARAMETERS
C Symbolic names for the argument registers used by the routine below.
32define(`rp',	`%rdi')	C result pointer: only stored through in the code below
33define(`up',	`%rsi')	C source pointer: only loaded through in the code below
34define(`n',	`%rdx')	C limb count: used for the n mod 4 dispatch and as loop counter
35define(`cnt',	`%cl')	C shift count; the instructions below name %cl directly
36
C ASM_START, TEXT and ALIGN are macros supplied by the included config.m4.
C NOTE(review): presumably they emit the file preamble, select the code
C section and align the entry point to 16 bytes -- confirm in config.m4.
37ASM_START()
38	TEXT
39	ALIGN(16)
C mpn_rshift -- shift the n-limb operand at up right by the bit count in %cl
C and store the n-limb result at rp.
C
C In:    rp (%rdi) = destination, up (%rsi) = source, n (%rdx) = limb count,
C        %cl = shift count
C Out:   %rax = the bits shifted out of up[0], moved to the top of the
C        register (up[0] is shifted through a zeroed %rax with shrd)
C Uses:  %rax, %r8-%r11 and the flags as scratch; rp, up and n are modified
C
C NOTE(review): nothing here checks the arguments.  The code assumes n >= 1
C (n = 0 would take the L(b00) path and load from up), and presumably
C 1 <= cnt <= 63 -- confirm against the callers.
C
C Structure: the main loop at L(top) is unrolled four ways, with the source
C limbs rotating through %r11, %r8, %r9 and %r10.  The entry code dispatches
C on n mod 4; each case preloads the first limbs, computes the return value,
C biases up and rp to match the fixed offsets used inside the loop, and then
C enters the loop at L(top), L(10), L(01) or L(00).  L(end) always finishes
C the last three limbs.  n = 1 and n = 2 are completed separately at L(le1)
C and L(le2).
40PROLOGUE(mpn_rshift)
41	mov	%edx, %eax		C only the low bits of n matter for the dispatch
42	and	$3, %eax		C eax = n mod 4, ZF set when it is 0
43	jne	L(nb00)
44L(b00):	C n = 4, 8, 12, ...
45	mov	(up), %r10		C r10 = up[0]
46	mov	8(up), %r11		C r11 = up[1]
47	xor	%eax, %eax		C zero rax so that shrd leaves only the shifted-out bits
48	shrd	%cl, %r10, %rax		C return value: low bits of up[0] at the top of rax
49	mov	16(up), %r8		C r8 = up[2]
50	lea	8(up), up		C bias the pointers so that at L(00) the load
51	lea	-24(rp), rp		C 16(up) is up[3] and the store 24(rp) is rp[0]
52	sub	$4, n			C pre-decrement the loop counter
53	jmp	L(00)
54
C Reached when n mod 4 is 1, 2 or 3; the n = 1, 5, 9, ... case is L(b01).
55L(nb00):C n = 1, 5, 9, ...
56	cmp	$2, %eax
57	jae	L(nb01)			C 2 or 3: decided at L(nb01) from these same flags
58L(b01):	mov	(up), %r9		C r9 = up[0]
59	xor	%eax, %eax
60	shrd	%cl, %r9, %rax		C return value
61	sub	$2, n			C borrows only when n = 1
62	jb	L(le1)
63	mov	8(up), %r10		C r10 = up[1]
64	mov	16(up), %r11		C r11 = up[2]
65	lea	16(up), up		C bias the pointers so that at L(01) the load
66	lea	-16(rp), rp		C 8(up) is up[3] and the store 16(rp) is rp[0]
67	jmp	L(01)
68L(le1):	shr	%cl, %r9		C n = 1: single limb, zeros shifted in at the top
69	mov	%r9, (rp)
70	ret
71
C Reached when n mod 4 is 2 or 3; the flags still hold the cmp against 2.
72L(nb01):C n = 2, 6, 10, ...
73	jne	L(b11)			C not equal to 2, so n mod 4 is 3
74L(b10):	mov	(up), %r8		C r8 = up[0]
75	mov	8(up), %r9		C r9 = up[1]
76	xor	%eax, %eax
77	shrd	%cl, %r8, %rax		C return value
78	sub	$3, n			C borrows only when n = 2
79	jb	L(le2)
80	mov	16(up), %r10		C r10 = up[2]
81	lea	24(up), up		C bias the pointers so that at L(10) the load
82	lea	-8(rp), rp		C (up) is up[3] and the store 8(rp) is rp[0]
83	jmp	L(10)
84L(le2):	shrd	%cl, %r9, %r8		C n = 2: low limb takes its upper bits from up[1]
85	mov	%r8, (rp)
86	shr	%cl, %r9		C top limb, zeros shifted in
87	mov	%r9, 8(rp)
88	ret
89
90	ALIGN(16)
91L(b11):	C n = 3, 7, 11, ...
92	mov	(up), %r11		C r11 = up[0]
93	mov	8(up), %r8		C r8 = up[1]
94	xor	%eax, %eax
95	shrd	%cl, %r11, %rax		C return value
96	mov	16(up), %r9		C r9 = up[2]
97	lea	32(up), up		C up biased by four limbs; rp is used as passed in
98	sub	$4, n			C borrows only when n = 3: just the tail is needed
99	jb	L(end)
100
C Main loop, four result limbs per iteration.  Each group of three
C instructions produces one limb: shrd merges a source limb with the low
C bits of its upper neighbour, a load fetches a later source limb into the
C register stored by the previous group, and the merged limb is stored.
101	ALIGN(16)
102L(top):	shrd	%cl, %r8, %r11
103	mov	-8(up), %r10
104	mov	%r11, (rp)
105L(10):	shrd	%cl, %r9, %r8
106	mov	(up), %r11
107	mov	%r8, 8(rp)
108L(01):	shrd	%cl, %r10, %r9
109	mov	8(up), %r8
110	mov	%r9, 16(rp)
111L(00):	shrd	%cl, %r11, %r10
112	mov	16(up), %r9
113	mov	%r10, 24(rp)
114	add	$32, up			C advance both pointers by four limbs
115	lea	32(rp), rp
116	sub	$4, n
117	jnc	L(top)			C loop until the counter borrows
118
C Tail: the last three source limbs are in r11, r8 and r9 (low to high).
119L(end):	shrd	%cl, %r8, %r11
120	mov	%r11, (rp)
121	shrd	%cl, %r9, %r8
122	mov	%r8, 8(rp)
123	shr	%cl, %r9		C most significant limb: zeros shifted in at the top
124	mov	%r9, 16(rp)
125	ret
126EPILOGUE()
127