dnl  x86-64 mpn_lshift optimized for "Core 2".

dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C K8,K9:	 4.25
C K10:		 4.25
C P4:		14.7
C P6 core2:	 1.27
C P6 corei7:	 1.5


C INPUT PARAMETERS
C Symbolic names for the registers that carry the arguments of mpn_lshift.
C The code below uses rp, up and n through these names; the shift count is
C written directly as %cl in the instructions.
define(`rp',	`%rdi')		C result limb pointer
define(`up',	`%rsi')		C source limb pointer
define(`n',	`%rdx')		C limb count
define(`cnt',	`%cl')		C shift count in bits

C ASM_START, TEXT and ALIGN are macros supplied through config.m4
C (presumably file setup, section selection and alignment -- confirm there).
ASM_START()
	TEXT
	ALIGN(16)
C mpn_lshift
C
C C prototype per the GMP manual:
C   mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned cnt)
C
C Shift the n-limb operand at up left by cnt bits and store the n-limb
C result at rp.  The bits shifted out of the most significant limb are
C returned in the low bits of %rax (the upper bits are zero).
C
C Registers on entry: rp = %rdi, up = %rsi, n = %rdx, cnt = %cl.
C Clobbers %rax, %rdx, %rsi, %rdi, %r8-%r11 and the flags; uses no stack.
C
C Limbs are handled from most significant to least significant.  The main
C loop is unrolled four ways and is entered at one of four labels chosen by
C n mod 4; every entry sequence first computes the return value and preloads
C the limbs its entry point expects.
C
C Nothing is validated here.  n must be at least 1: n = 0 would take the
C b00 path, which reads and writes limbs unconditionally.  The GMP manual
C gives 1 <= cnt <= 63 as the valid range for the shift count.
PROLOGUE(mpn_lshift)
	lea	-8(rp,n,8), rp		C rp -> most significant result limb
	lea	-8(up,n,8), up		C up -> most significant source limb

	mov	%edx, %eax		C %edx is the low half of n
	and	$3, %eax		C eax = n mod 4 selects the entry point
	jne	L(nb00)
L(b00):	C n = 4, 8, 12, ...
	mov	(up), %r10		C r10 = top limb
	mov	-8(up), %r11
	xor	%eax, %eax
	shld	%cl, %r10, %rax		C rax = bits shifted out of the top limb
	mov	-16(up), %r8
	lea	24(rp), rp		C bias rp so -24(rp) at L(00) is the top limb
	sub	$4, n
	jmp	L(00)

L(nb00):	C n mod 4 is 1, 2 or 3
	cmp	$2, %eax
	jae	L(nb01)
L(b01):	C n = 1, 5, 9, ...
	mov	(up), %r9		C r9 = top limb
	xor	%eax, %eax
	shld	%cl, %r9, %rax		C rax = bits shifted out of the top limb
	sub	$2, n
	jb	L(le1)			C borrow: n was 1
	mov	-8(up), %r10
	mov	-16(up), %r11
	lea	-8(up), up		C bias both pointers for the displacements
	lea	16(rp), rp		C used at L(01)
	jmp	L(01)
L(le1):	shl	%cl, %r9		C n = 1: single limb, zeros shifted in
	mov	%r9, (rp)
	ret

L(nb01):	C n mod 4 is 2 or 3; flags are still those of cmp $2, %eax
	jne	L(b11)
L(b10):	C n = 2, 6, 10, ...
	mov	(up), %r8		C r8 = top limb
	mov	-8(up), %r9
	xor	%eax, %eax
	shld	%cl, %r8, %rax		C rax = bits shifted out of the top limb
	sub	$3, n
	jb	L(le2)			C borrow: n was 2
	mov	-16(up), %r10
	lea	-16(up), up		C bias both pointers for the displacements
	lea	8(rp), rp		C used at L(10)
	jmp	L(10)
L(le2):	shld	%cl, %r9, %r8		C n = 2: high limb takes bits from the low limb
	mov	%r8, (rp)
	shl	%cl, %r9		C low limb: zeros shifted in
	mov	%r9, -8(rp)
	ret

	ALIGN(16)			C performance critical!
L(b11):	C n = 3, 7, 11, ...
	mov	(up), %r11		C r11 = top limb
	mov	-8(up), %r8
	xor	%eax, %eax
	shld	%cl, %r11, %rax		C rax = bits shifted out of the top limb
	mov	-16(up), %r9
	lea	-24(up), up		C the three preloaded limbs are consumed
	sub	$4, n
	jb	L(end)			C borrow: n was 3, only the tail is needed

C Main loop: four stages per iteration, one per entry label.  Each stage
C does  shld %cl, lo, hi  giving  hi = (hi << cnt) | (lo >> (64 - cnt)),
C stores hi as a result limb, and in between reloads the register that the
C previous stage stored with a source limb needed by later stages.
C NOTE(review): the instruction order looks hand-scheduled for the timings
C in the cycle table at the top of the file; do not reorder without
C measuring.
	ALIGN(16)
L(top):	shld	%cl, %r8, %r11
	mov	(up), %r10
	mov	%r11, (rp)
L(10):	shld	%cl, %r9, %r8
	mov	-8(up), %r11
	mov	%r8, -8(rp)
L(01):	shld	%cl, %r10, %r9
	mov	-16(up), %r8
	mov	%r9, -16(rp)
L(00):	shld	%cl, %r11, %r10
	mov	-24(up), %r9
	mov	%r10, -24(rp)
	add	$-32, up		C step both pointers down by four limbs
	lea	-32(rp), rp
	sub	$4, n			C n was biased by the entry sequence so that
	jnc	L(top)			C a borrow means only r11, r8, r9 remain

C Tail: the last three limbs are already in registers, r11 the highest and
C r9 the lowest.
L(end):	shld	%cl, %r8, %r11
	mov	%r11, (rp)
	shld	%cl, %r9, %r8
	mov	%r8, -8(rp)
	shl	%cl, %r9		C least significant limb: zeros shifted in
	mov	%r9, -16(rp)
	ret
EPILOGUE()