dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).

dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
C Intel P4	16.5
C Intel core2	 4.35
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C This was written quickly and not optimized at all, but it runs very well on
C K8.  But perhaps one could get under 3 c/l.  Ideas:
C   1) Use indexing to save the 3 LEA
C   2) Write reasonable feed-in code
C   3) Be more clever about register usage
C   4) Unroll more, handling CL negation, carry save/restore cost much now
C   5) Reschedule

C INPUT PARAMETERS
C Symbolic names for the argument registers.  The code below stores result
C limbs through rp, loads operand limbs through up and vp, copies n into rax
C as the limb counter and copies cnt into rcx as the shift count.
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`vp',	`%rdx')
define(`n',	`%rcx')
define(`cnt',	`%r8')

C ABIs this file is declared to support.  The macro is defined in the
C included m4 files, not here.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
C mpn_lshsub_n -- {rp,n} = ({up,n} - {vp,n}) << cnt, processed from the least
C significant limb upwards, 64 bits per limb.
C
C Register roles inside the function:
C   rax      limbs still to process (n is copied here because rcx is needed
C            for the shift count)
C   rbx      borrow of the subtraction chain saved between blocks, 0 or -1
C   rcx      shift count; cl is negated temporarily to shift right by 64-cnt
C   r15      high bits shifted out of the previous difference limb, ORed into
C            the low end of the next result limb
C   r8-r11   difference limbs, shifted left in place
C   r12-r14  unshifted copies that supply the bits crossing a limb boundary
C
C Return value (rax): r15 + (final borrow << cnt), i.e. the bits shifted out
C of the most significant limb plus the scaled borrow.
C
C NOTE(review): the neg/shr pairs rely on the hardware reducing CL modulo 64,
C so cnt is assumed to be in the range 1..63 -- confirm against the callers.
PROLOGUE(mpn_lshsub_n)
C FUNC_ENTRY and IFDOS come from the included m4 files; presumably they adapt
C the DOS64 calling convention.  The IFDOS line loads r8d (cnt) from the
C stack slot at 56(%rsp).
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')

	push	%r12			C save the registers restored at L(end)
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C limb count leaves rcx before rcx is reused
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)		C n mod 4 limbs go through the 1-limb loop
	je	L(4)
	sub	$1, R32(%r11)		C bias so that jnc below runs n mod 4 times

C One limb per iteration, until the remaining count is a multiple of 4.
L(oopette):
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C lea does not disturb the carry flag
	sbb	0(vp), %r8		C r8 = u - v - borrow
	mov	%r8, %r12		C keep the unshifted difference
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	shl	R8(%rcx), %r8
	or	%r15, %r8		C merge bits shifted out of the previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C cl = -cnt, acts as 64-cnt for the shift
	shr	R8(%rcx), %r15		C bits that spill into the next limb
	neg	R8(%rcx)		C back to cnt
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

C rax still holds the full n, so the n mod 4 limbs already handled simply
C make the final subtraction below borrow a little earlier.
L(4):
	sub	$4, %rax
	jc	L(end)			C fewer than 4 limbs left

C Four limbs per iteration.
	ALIGN(16)
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

	sbb	0(vp), %r8
	mov	%r8, %r12		C unshifted copies of limbs 0..2
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8		C carry-in from the previous block
	mov	%r11, %r15		C unshifted copy of limb 3
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)		C cl = -cnt, acts as 64-cnt for the shifts

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	or	%r12, %r9		C each limb receives the spill of the one below
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C back to cnt

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
	neg	R32(%rbx)		C saved borrow: -1 becomes 1, 0 stays 0
	shl	R8(%rcx), %rbx		C scale the borrow by 2^cnt
C The adc also adds the carry flag left by the shl above; with rbx being 0
C or 1 and cnt in 1..63 that flag is clear.
	adc	%r15, %rbx		C add the bits shifted out of the top limb
	mov	%rbx, %rax		C return value
	pop	%rbx			C restore in reverse push order
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
	ret
EPILOGUE()