1dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
2
3dnl  Copyright 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C	     cycles/limb
24C K8,K9:	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
25C K10:		 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
26C P4:		16.5
27C P6-15:	 4.35
28
29C This was written quickly and not optimized at all, but it runs very well on
30C K8.  But perhaps one could get under 3 c/l.  Ideas:
31C   1) Use indexing to save the 3 LEA
32C   2) Write reasonable feed-in code
33C   3) Be more clever about register usage
34C   4) Unroll more; handling CL negation and carry save/restore costs a lot now
35C   5) Reschedule
36
37C INPUT PARAMETERS
C Symbolic names for the argument registers.  In the code visible in this file
C only rp, up, vp and n are used by name; the shift count is read directly as
C %r8d at function entry, so the cnt macro is never expanded.
38define(`rp',	`%rdi')	C destination limb pointer, written with mov and advanced with lea
39define(`up',	`%rsi')	C first source operand (limbs are loaded from here)
40define(`vp',	`%rdx')	C second source operand (subtracted with sbb)
41define(`n',	`%rcx')	C limb count; copied to rax before rcx is reused for the shift count
42define(`cnt',	`%r8')	C shift count; only the low 32 bits are read, then r8 becomes scratch
43
C mpn_lshsub_n -- subtract the limbs at vp from the limbs at up with borrow
C propagation, shift the difference left by cnt bits, and store n limbs at rp.
C
C In:      rp (%rdi), up (%rsi), vp (%rdx), n (%rcx), cnt (low 32 bits of %r8)
C Out:     %rax = (final borrow << cnt) + bits shifted out of the top limb
C Saved:   %r12-%r15 and %rbx are pushed on entry and popped before ret
C Changed: %rax, %rcx, %rdx, %rsi, %rdi, %r8-%r11 and the flags
C
C Register roles:
C   %rax   limb counter for the 4-way loop
C   %ebx   saved borrow, 0 or -1, produced by sbb %ebx,%ebx
C   %cl    shift count; negated temporarily to form the right-shift count
C   %r15   high bits shifted out of the previous limb, ORed into the next one
C   %r11d  iteration counter of the single-limb loop (reused as data later)
C
C NOTE(review): the neg/shr pairing relies on 64-bit shifts using only the low
C 6 bits of cl, so the routine presumably expects 1 <= cnt <= 63.  With cnt = 0
C the right shift would be by 0 and leave a whole limb in %r15.  Confirm
C against the caller contract.
44ASM_START()
45	TEXT
46	ALIGN(16)
47PROLOGUE(mpn_lshsub_n)
48
C Save the callee-saved registers used as scratch below.
49	push	%r12
50	push	%r13
51	push	%r14
52	push	%r15
53	push	%rbx
54
55	mov	n, %rax			C rax = limb count; rcx is about to be reused
56	xor	%ebx, %ebx		C clear carry save register
57	mov	%r8d, %ecx		C shift count
58	xor	%r15d, %r15d		C limb carry
59
C Peel n mod 4 limbs one at a time before entering the 4-way loop.
60	mov	%eax, %r11d
61	and	$3, %r11d		C r11d = n mod 4
62	je	L(4)			C nothing to peel
63	sub	$1, %r11d		C bias so that the jnc below loops n mod 4 times
64
65L(oopette):
66	add	%ebx, %ebx		C restore carry flag
67	mov	0(up), %r8
68	lea	8(up), up		C lea advances the pointer without touching CF
69	sbb	0(vp), %r8		C r8 = u limb - v limb - borrow
70	mov	%r8, %r12		C keep the unshifted difference for the right shift
71	sbb	%ebx, %ebx		C save carry flag
72	shl	%cl, %r8
73	or	%r15, %r8		C merge the high bits of the previous limb
74	mov	%r12, %r15
75	lea	8(vp), vp
76	neg	%cl			C cl = -cnt, acts as 64 - cnt for a 64-bit shift
77	shr	%cl, %r15		C r15 = bits that fell off the top of this limb
78	neg	%cl			C back to cnt
79	mov	%r8, 0(rp)
80	lea	8(rp), rp
81	sub	$1, %r11d
82	jnc	L(oopette)
83
C rax still holds the full n, including the limbs peeled above.  Subtracting 4
C per pass makes the loop below run floor(n/4) times; the n mod 4 residue only
C affects when the final sub borrows.
84L(4):
85	sub	$4, %rax
86	jc	L(end)			C fewer than 4 limbs left
87
88	ALIGN(16)
89L(oop):
90	add	%ebx, %ebx		C restore carry flag
91
C Load four limbs of u.  mov and lea leave CF intact for the sbb chain.
92	mov	0(up), %r8
93	mov	8(up), %r9
94	mov	16(up), %r10
95	mov	24(up), %r11
96
97	lea	32(up), up
98
C Subtract four limbs of v with borrow, keeping unshifted copies of the three
C low differences in r12-r14 for the right shifts below.
99	sbb	0(vp), %r8
100	mov	%r8, %r12
101	sbb	8(vp), %r9
102	mov	%r9, %r13
103	sbb	16(vp), %r10
104	mov	%r10, %r14
105	sbb	24(vp), %r11
106
107	sbb	%ebx, %ebx		C save carry flag
108
C Left shifts.  The old r15 must be merged into r8 before r15 is overwritten
C with the unshifted top difference.
109	shl	%cl, %r8
110	shl	%cl, %r9
111	shl	%cl, %r10
112	or	%r15, %r8		C high bits carried in from the previous pass
113	mov	%r11, %r15		C unshifted copy of the top difference limb
114	shl	%cl, %r11
115
116	lea	32(vp), vp
117
118	neg	%cl			C cl = -cnt, acts as 64 - cnt for a 64-bit shift
119
120	shr	%cl, %r12
121	shr	%cl, %r13
122	shr	%cl, %r14
123	shr	%cl, %r15		C used next loop
124
C Each limb receives the bits shifted out of the limb below it.
125	or	%r12, %r9
126	or	%r13, %r10
127	or	%r14, %r11
128
129	neg	%cl			C back to cnt
130
131	mov	%r8, 0(rp)
132	mov	%r9, 8(rp)
133	mov	%r10, 16(rp)
134	mov	%r11, 24(rp)
135
136	lea	32(rp), rp
137
138	sub	$4, %rax
139	jnc	L(oop)
C Build the return value from the saved borrow and the pending high bits.
C NOTE(review): adc also adds whatever CF the shl leaves.  For a borrow word
C of 0 or 1 and a count in 1..63 that bit should be 0 -- confirm.
140L(end):
141	neg	%ebx			C 0 or -1 becomes 0 or 1; 32-bit write zeroes the upper half of rbx
142	shl	%cl, %rbx		C move the borrow to bit position cnt
143	adc	%r15, %rbx		C add the bits shifted out of the top limb
144	mov	%rbx, %rax		C return value
C Restore callee-saved registers in reverse push order.
145	pop	%rbx
146	pop	%r15
147	pop	%r14
148	pop	%r13
149	pop	%r12
150
151	ret
152EPILOGUE()
153