1251875Speterdnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
2251875Speter
3251875Speterdnl  Copyright 2006 Free Software Foundation, Inc.
4251875Speter
5251875Speterdnl  This file is part of the GNU MP Library.
6251875Speter
7251875Speterdnl  The GNU MP Library is free software; you can redistribute it and/or modify
8251875Speterdnl  it under the terms of the GNU Lesser General Public License as published
9251875Speterdnl  by the Free Software Foundation; either version 3 of the License, or (at
10251875Speterdnl  your option) any later version.
11251875Speter
12251875Speterdnl  The GNU MP Library is distributed in the hope that it will be useful, but
13251875Speterdnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14251875Speterdnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15251875Speterdnl  License for more details.
16251875Speter
17251875Speterdnl  You should have received a copy of the GNU Lesser General Public License
18251875Speterdnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19251875Speter
20251875Speterinclude(`../config.m4')
21251875Speter
22251875Speter
23251875SpeterC	     cycles/limb
24251875SpeterC K8,K9:	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
25251875SpeterC K10:		 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
26251875SpeterC P4:		16.5
27251875SpeterC P6-15:	 4.35
28251875Speter
29251875SpeterC This was written quickly and not optimized at all, but it runs very well on
30251875SpeterC K8.  But perhaps one could get under 3 c/l.  Ideas:
31251875SpeterC   1) Use indexing to save the 3 LEA
32251875SpeterC   2) Write reasonable feed-in code
33251875SpeterC   3) Be more clever about register usage
34251875SpeterC   4) Unroll more; handling the CL negation and carry save/restore costs a lot now
35251875SpeterC   5) Reschedule
36251875Speter
37251875SpeterC INPUT PARAMETERS
C Symbolic names for the registers the five arguments arrive in.  Within this
C file rp addresses the stores while up and vp address the loads.  n is copied
C to %rax on entry because %rcx is then reused to hold the shift count in %cl.
C cnt is never referenced by name: the code reads it as %r8d, after which %r8
C is reused as a data register.
38251875Speterdefine(`rp',	`%rdi')
39251875Speterdefine(`up',	`%rsi')
40251875Speterdefine(`vp',	`%rdx')
41251875Speterdefine(`n',	`%rcx')
42251875Speterdefine(`cnt',	`%r8')
43251875Speter
44251875SpeterASM_START()
45251875Speter	TEXT
46251875Speter	ALIGN(16)
C mpn_lshsub_n -- subtract the n-limb operand at vp from the n-limb operand at
C up, shift the difference left by cnt bits, and store n result limbs at rp.
C The value returned in %rax is assembled at L(end) from the bits shifted out
C of the top limb and the final borrow of the subtraction.
C
C Register roles:
C   %rax      copy of n, counted down by 4 in the main loop; return value
C   %ebx      saved borrow of the subtraction chain, 0 or all ones
C   %cl       shift count, negated around each group of right shifts
C   %r15      high bits shifted out of the previous limb
C   %r11d     iteration counter in the feed-in loop
C   %r8-%r11  difference limbs, shifted left in place
C   %r12-%r14 unshifted copies of difference limbs, shifted right
C
C NOTE(review): the neg/shr pairing relies on the processor masking a 64-bit
C shift count to 6 bits, so -cnt acts as 64-cnt.  That presumably requires
C 1 <= cnt <= 63; with cnt = 0 the right shifts would also be by 0.  Confirm
C against the caller contract.
47251875SpeterPROLOGUE(mpn_lshsub_n)
48251875Speter
C Save the callee-saved registers that are used as scratch below.
49251875Speter	push	%r12
50251875Speter	push	%r13
51251875Speter	push	%r14
52251875Speter	push	%r15
53251875Speter	push	%rbx
54251875Speter
C Move n out of %rcx before %ecx is overwritten with the shift count.
55251875Speter	mov	n, %rax
56251875Speter	xor	%ebx, %ebx		C clear carry save register
57251875Speter	mov	%r8d, %ecx		C shift count
58251875Speter	xor	%r15d, %r15d		C limb carry
59251875Speter
C r11d = n mod 4, the number of limbs handled one at a time before the 4-way
C loop.  If it is zero, go straight to the 4-way loop setup.  Otherwise bias
C the counter by one so that the loop below can terminate on borrow.
60251875Speter	mov	%eax, %r11d
61251875Speter	and	$3, %r11d
62251875Speter	je	L(4)
63251875Speter	sub	$1, %r11d
64251875Speter
C Feed-in loop: one limb per iteration.  The pointer updates use lea and the
C loads use mov, neither of which disturbs the carry flag restored from %ebx.
65251875SpeterL(oopette):
66251875Speter	add	%ebx, %ebx		C restore carry flag
67251875Speter	mov	0(up), %r8
68251875Speter	lea	8(up), up
69251875Speter	sbb	0(vp), %r8
C Keep an unshifted copy of the difference limb for the right shift below.
70251875Speter	mov	%r8, %r12
71251875Speter	sbb	%ebx, %ebx		C save carry flag
C Result limb = difference shifted left, with the high bits of the previous
C difference limb OR-ed into the vacated low bits.
72251875Speter	shl	%cl, %r8
73251875Speter	or	%r15, %r8
C New %r15 = high bits of this difference limb, for the next result limb.
74251875Speter	mov	%r12, %r15
75251875Speter	lea	8(vp), vp
C Negate %cl for the right shift, then negate it back to the left shift count.
76251875Speter	neg	%cl
77251875Speter	shr	%cl, %r15
78251875Speter	neg	%cl
79251875Speter	mov	%r8, 0(rp)
80251875Speter	lea	8(rp), rp
C Loop until the biased counter borrows, i.e. after n mod 4 iterations.
81251875Speter	sub	$1, %r11d
82251875Speter	jnc	L(oopette)
83251875Speter
C %rax still holds the full n here; the feed-in loop did not change it.
C Subtracting 4 per pass borrows after n/4 (rounded down) passes, so a borrow
C right away means there are no groups of four limbs to process.
84251875SpeterL(4):
85251875Speter	sub	$4, %rax
86251875Speter	jc	L(end)
87251875Speter
88251875Speter	ALIGN(16)
C Main loop: four limbs per iteration.
89251875SpeterL(oop):
90251875Speter	add	%ebx, %ebx		C restore carry flag
91251875Speter
92251875Speter	mov	0(up), %r8
93251875Speter	mov	8(up), %r9
94251875Speter	mov	16(up), %r10
95251875Speter	mov	24(up), %r11
96251875Speter
C lea leaves the carry flag intact for the sbb chain that follows.
97251875Speter	lea	32(up), up
98251875Speter
C Borrow-propagating subtraction of four limbs.  The three lower difference
C limbs are copied to %r12-%r14 so that their high bits can be extracted later.
99251875Speter	sbb	0(vp), %r8
100251875Speter	mov	%r8, %r12
101251875Speter	sbb	8(vp), %r9
102251875Speter	mov	%r9, %r13
103251875Speter	sbb	16(vp), %r10
104251875Speter	mov	%r10, %r14
105251875Speter	sbb	24(vp), %r11
106251875Speter
107251875Speter	sbb	%ebx, %ebx		C save carry flag
108251875Speter
C Left-shift all four difference limbs.  The lowest one takes the high bits
C carried in %r15 from the previous limb.  %r15 is then reloaded with the top
C difference limb before that limb is shifted.
109251875Speter	shl	%cl, %r8
110251875Speter	shl	%cl, %r9
111251875Speter	shl	%cl, %r10
112251875Speter	or	%r15, %r8
113251875Speter	mov	%r11, %r15
114251875Speter	shl	%cl, %r11
115251875Speter
116251875Speter	lea	32(vp), vp
117251875Speter
C Switch %cl to the negated count for the right shifts.
118251875Speter	neg	%cl
119251875Speter
120251875Speter	shr	%cl, %r12
121251875Speter	shr	%cl, %r13
122251875Speter	shr	%cl, %r14
123251875Speter	shr	%cl, %r15		C used next loop
124251875Speter
C Merge the high bits of each limb into the low bits of the limb above it.
125251875Speter	or	%r12, %r9
126251875Speter	or	%r13, %r10
127251875Speter	or	%r14, %r11
128251875Speter
C Restore %cl to the left shift count.
129251875Speter	neg	%cl
130251875Speter
131251875Speter	mov	%r8, 0(rp)
132251875Speter	mov	%r9, 8(rp)
133251875Speter	mov	%r10, 16(rp)
134251875Speter	mov	%r11, 24(rp)
135251875Speter
136251875Speter	lea	32(rp), rp
137251875Speter
C Continue while at least four more limbs remain, i.e. until the count borrows.
138251875Speter	sub	$4, %rax
139251875Speter	jnc	L(oop)
C Build the return value: turn the saved borrow from 0 or -1 into 0 or 1,
C shift it left by the shift count, and add the bits left in %r15.
C NOTE(review): adc also adds the carry flag left by the shl.  For a count of
C 1..63 applied to a value of 0 or 1 the bit shifted out is 0, so this
C presumably behaves as a plain add -- confirm.
140251875SpeterL(end):
141251875Speter	neg	%ebx
142251875Speter	shl	%cl, %rbx
143251875Speter	adc	%r15, %rbx
144251875Speter	mov	%rbx, %rax
C Restore the saved registers in the reverse order of the pushes.
145251875Speter	pop	%rbx
146251875Speter	pop	%r15
147251875Speter	pop	%r14
148251875Speter	pop	%r13
149251875Speter	pop	%r12
150251875Speter
151251875Speter	ret
152251875SpeterEPILOGUE()
153251875Speter