1dnl  x86-64 mpn_lshiftc optimized for "Core 2".
2
3dnl  Copyright 2007, 2009 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 ?
25C K10:		 ?
26C P4:		 ?
27C P6 core2:	 1.5
28C P6 corei7:	 1.75
29
30
31C INPUT PARAMETERS
C NOTE(review): these are the first four System V AMD64 integer argument
C registers; no other calling convention is handled in the visible code - confirm.
32define(`rp',	`%rdi')	C destination pointer: result limbs are stored through it
33define(`up',	`%rsi')	C source pointer: operand limbs are loaded through it
34define(`n',	`%rdx')	C limb count; the code also reads it as %edx directly
35define(`cnt',	`%cl')	C shift count; the code names %cl directly, not cnt
36
C ASM_START, TEXT and ALIGN are macros not defined in this file, presumably
C supplied through config.m4.
37ASM_START()
38	TEXT
39	ALIGN(16)
C mpn_lshiftc: shift the n-limb operand at up left by the count in %cl and
C store the bitwise complement of every result limb at rp.
C
C In:    rp in %rdi = destination, up in %rsi = source, n in %rdx = limb count,
C        %cl = shift count.
C Out:   %rax = bits shifted out of the most significant limb, not complemented.
C        The least significant limb gets zeros shifted in before the
C        complement, so its low bits are stored as ones.
C Uses:  %rax, %rdx, %rsi, %rdi, %r8-%r11 and the flags are overwritten; the
C        visible body makes no stack accesses.
C
C Limbs are processed from the most significant one downwards.  n mod 4 picks
C one of four entry sequences, each of which preloads source limbs into
C %r8-%r11, computes the return value and enters the 4-way unrolled loop at the
C matching stanza; n = 1, 2 and 3 are finished without entering the loop.
C n = 0 is not special-cased: it would take the b00 path and read below up.
C NOTE(review): presumably callers guarantee n >= 1 and 1 <= cnt <= 63 - confirm.
40PROLOGUE(mpn_lshiftc)
41	lea	-8(rp,n,8), rp		C rp = address of the most significant result limb
42	lea	-8(up,n,8), up		C up = address of the most significant source limb
43
44	mov	%edx, %eax		C low 32 bits of n are enough to get n mod 4
45	and	$3, %eax		C eax = n mod 4, ZF set when it is 0
46	jne	L(nb00)
47L(b00):	C n = 4, 8, 12, ...
48	mov	(up), %r10		C r10 = top source limb
49	mov	-8(up), %r11
50	xor	%eax, %eax		C clear rax so shld leaves only the shifted-out bits
51	shld	%cl, %r10, %rax		C rax = return value
52	mov	-16(up), %r8
53	lea	24(rp), rp		C bias rp so -24(rp) at L(00) is the top result limb
54	sub	$4, n			C pre-decrement; the sub at the loop bottom borrows once three limbs remain
55	jmp	L(00)
56
57L(nb00):C n mod 4 is 1, 2 or 3; falls through to b01 for n = 1, 5, 9, ...
58	cmp	$2, %eax
59	jae	L(nb01)			C n mod 4 >= 2
60L(b01):	mov	(up), %r9		C r9 = top source limb
61	xor	%eax, %eax
62	shld	%cl, %r9, %rax		C rax = return value
63	sub	$2, n
64	jb	L(le1)			C borrow only for n = 1
65	mov	-8(up), %r10
66	mov	-16(up), %r11
67	lea	-8(up), up		C bias up and rp for entering the loop at L(01)
68	lea	16(rp), rp
69	jmp	L(01)
70L(le1):	shl	%cl, %r9		C single limb: zeros are shifted in ...
71	not	%r9			C ... and become ones after the complement
72	mov	%r9, (rp)
73	ret
74
75L(nb01):C n mod 4 is 2 or 3; the flags still come from the cmp at L(nb00)
76	jne	L(b11)			C not equal to 2, so n mod 4 = 3
77L(b10):	mov	(up), %r8		C n = 2, 6, 10, ...; r8 = top source limb
78	mov	-8(up), %r9
79	xor	%eax, %eax
80	shld	%cl, %r8, %rax		C rax = return value
81	sub	$3, n
82	jb	L(le2)			C borrow only for n = 2
83	mov	-16(up), %r10
84	lea	-16(up), up		C bias up and rp for entering the loop at L(10)
85	lea	8(rp), rp
86	jmp	L(10)
87L(le2):	shld	%cl, %r9, %r8		C two limbs: high limb takes its low bits from r9
88	not	%r8
89	mov	%r8, (rp)
90	shl	%cl, %r9		C low limb: zeros shifted in, stored as ones
91	not	%r9
92	mov	%r9, -8(rp)
93	ret
94
95	ALIGN(16)			C performance critical!
96L(b11):	C n = 3, 7, 11, ...
97	mov	(up), %r11		C r11 = top source limb
98	mov	-8(up), %r8
99	xor	%eax, %eax
100	shld	%cl, %r11, %rax		C rax = return value
101	mov	-16(up), %r9
102	lea	-24(up), up		C up = address of the limb loaded first at L(top)
103	sub	$4, n
104	jb	L(end)			C borrow only for n = 3: no loop pass needed
105
C Main loop, four limbs per pass, entered at L(top), L(10), L(01) or L(00)
C according to n mod 4.  Each stanza completes one limb with shld, complements
C and stores it, and reloads the register stored by the preceding stanza with
C the next lower source limb.
106	ALIGN(16)
107L(top):	shld	%cl, %r8, %r11
108	mov	(up), %r10
109	not	%r11
110	mov	%r11, (rp)
111L(10):	shld	%cl, %r9, %r8
112	mov	-8(up), %r11
113	not	%r8
114	mov	%r8, -8(rp)
115L(01):	shld	%cl, %r10, %r9
116	mov	-16(up), %r8
117	not	%r9
118	mov	%r9, -16(rp)
119L(00):	shld	%cl, %r11, %r10
120	mov	-24(up), %r9
121	not	%r10
122	mov	%r10, -24(rp)
123	add	$-32, up		C step both pointers down by four limbs
124	lea	-32(rp), rp
125	sub	$4, n
126	jnc	L(top)			C no borrow: more than three limbs remain
127
C Tail: the last three limbs are in r11 (highest), r8 and r9 (lowest).
128L(end):	shld	%cl, %r8, %r11
129	not	%r11
130	mov	%r11, (rp)
131	shld	%cl, %r9, %r8
132	not	%r8
133	mov	%r8, -8(rp)
134	shl	%cl, %r9		C lowest limb: zeros shifted in, stored as ones
135	not	%r9
136	mov	%r9, -16(rp)
137	ret
138EPILOGUE()
139