1dnl  AMD64 mpn_lshsub_n.  R = 2^k(U - V).
2
3dnl  Copyright 2006, 2011, 2012 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C	     cycles/limb
35C AMD K8,K9	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
36C AMD K10	 3.15	(mpn_sub_n + mpn_lshift costs about 4 c/l)
37C Intel P4	16.5
38C Intel core2	 4.35
39C Intel corei	 ?
40C Intel atom	 ?
41C VIA nano	 ?
42
43C This was written quickly and not optimized at all, but it runs very well on
44C K8.  But perhaps one could get under 3 c/l.  Ideas:
45C   1) Use indexing to save the 3 LEA
46C   2) Write reasonable feed-in code
47C   3) Be more clever about register usage
48C   4) Unroll more; handling CL negation and carry save/restore costs a lot now
49C   5) Reschedule
50
51C INPUT PARAMETERS
C rp: pointer the result limbs are stored through, advanced 8 bytes per limb
52define(`rp',	`%rdi')
C up: pointer to the minuend limbs, loaded first and then subtracted from
53define(`up',	`%rsi')
C vp: pointer to the subtrahend limbs, applied with sbb against the up limbs
54define(`vp',	`%rdx')
C n: limb count; copied to rax on entry because rcx is reused for the shift
C count
55define(`n',	`%rcx')
C cnt: shift count; only its low 32 bits are read, and they are moved to rcx so
C that cl can drive the shifts
56define(`cnt',	`%r8')
57
58ABI_SUPPORT(DOS64)
59ABI_SUPPORT(STD64)
60
61ASM_START()
62	TEXT
63	ALIGN(16)
C mpn_lshsub_n -- subtract two n-limb operands and shift the difference left.
C
C For each limb, low to high:
C     d     = up[i] - vp[i] - borrow
C     rp[i] = (d << cnt) | (previous d >> (64-cnt))
C
C Return value in rax: (final borrow << cnt) + (last d >> (64-cnt)).
C
C NOTE(review): the right-shift count is produced by negating cl, which gives
C 64-cnt only when cnt is in 1..63 -- confirm callers never pass cnt = 0.
C
C Register roles inside the function:
C   rax       copy of n; reduced by 4 per main-loop pass, loop exits on borrow.
C             The n mod 4 residue is left in place; it cannot change how many
C             times 4 can be subtracted before a borrow occurs.
C   rbx       saved borrow between passes, 0 or -1 as a 32-bit value
C   cl        shift count, briefly negated around the right shifts
C   r15       bits shifted out of the top of the previous limb
C   r8-r11    difference limbs (r11 is also the feed-in loop counter)
C   r12-r14   unshifted copies of difference limbs, source of carried-out bits
64PROLOGUE(mpn_lshsub_n)
65	FUNC_ENTRY(4)
C NOTE(review): FUNC_ENTRY and IFDOS are defined outside this file.  The line
C below presumably loads the fifth argument (cnt) from the stack under the
C DOS64 ABI -- confirm against the m4 definitions.
66IFDOS(`	mov	56(%rsp), %r8d	')
67
	C Save the callee-saved registers used below; popped in reverse at the end.
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	push	%rbx

	mov	n, %rax			C keep the limb count; rcx is about to be reused
	xor	R32(%rbx), R32(%rbx)	C clear carry save register
	mov	R32(%r8), R32(%rcx)	C shift count
	xor	R32(%r15), R32(%r15)	C limb carry

	C r11 = n mod 4 = limbs to handle one at a time before the 4-way loop.
	C If that is zero skip the feed-in loop, otherwise bias the counter by -1
	C so that the loop can terminate on the borrow from its own decrement.
	mov	R32(%rax), R32(%r11)
	and	$3, R32(%r11)
	je	L(4)
	sub	$1, R32(%r11)

C Feed-in loop: one limb per pass.
L(oopette):
	C rbx is 0 or -1; doubling it carries out exactly when it is -1.
	add	R32(%rbx), R32(%rbx)	C restore carry flag
	mov	0(up), %r8
	lea	8(up), up		C lea leaves CF intact for the sbb below
	sbb	0(vp), %r8		C difference limb, borrow propagated through CF
	mov	%r8, %r12		C unshifted copy, needed for the carried-out bits
	sbb	R32(%rbx), R32(%rbx)	C save carry flag
	shl	R8(%rcx), %r8
	or	%r15, %r8		C merge bits carried out of the previous limb
	mov	%r12, %r15
	lea	8(vp), vp
	neg	R8(%rcx)		C cl = -cnt; the 64-bit shift uses it modulo 64
	shr	R8(%rcx), %r15		C bits that fell off the top of this limb
	neg	R8(%rcx)		C cl = cnt again
	mov	%r8, 0(rp)
	lea	8(rp), rp
	sub	$1, R32(%r11)
	jnc	L(oopette)

L(4):
	C rax still holds the full n here; a borrow means fewer than 4 limbs in
	C total, so there is nothing for the unrolled loop to do.
	sub	$4, %rax
	jc	L(end)

	ALIGN(16)
C Main loop: four limbs per pass.
L(oop):
	add	R32(%rbx), R32(%rbx)	C restore carry flag

	mov	0(up), %r8
	mov	8(up), %r9
	mov	16(up), %r10
	mov	24(up), %r11

	lea	32(up), up

	C Borrow chain across the four limbs.  The interleaved mov instructions
	C take unshifted copies and do not disturb CF.
	sbb	0(vp), %r8
	mov	%r8, %r12
	sbb	8(vp), %r9
	mov	%r9, %r13
	sbb	16(vp), %r10
	mov	%r10, %r14
	sbb	24(vp), %r11

	sbb	R32(%rbx), R32(%rbx)	C save carry flag

	C Left-shift each difference limb.  r8 takes the bits carried in from the
	C previous pass; r15 then receives the unshifted top limb of this pass.
	shl	R8(%rcx), %r8
	shl	R8(%rcx), %r9
	shl	R8(%rcx), %r10
	or	%r15, %r8
	mov	%r11, %r15
	shl	R8(%rcx), %r11

	lea	32(vp), vp

	neg	R8(%rcx)		C cl = -cnt; the 64-bit shifts use it modulo 64

	shr	R8(%rcx), %r12
	shr	R8(%rcx), %r13
	shr	R8(%rcx), %r14
	shr	R8(%rcx), %r15		C used next loop

	C Each limb receives the bits shifted out of the limb below it.
	or	%r12, %r9
	or	%r13, %r10
	or	%r14, %r11

	neg	R8(%rcx)		C cl = cnt again

	mov	%r8, 0(rp)
	mov	%r9, 8(rp)
	mov	%r10, 16(rp)
	mov	%r11, 24(rp)

	lea	32(rp), rp

	sub	$4, %rax
	jnc	L(oop)
L(end):
	C Build the return value from the saved borrow and the bits shifted out of
	C the most significant limb.
	C NOTE(review): adc also adds the CF left by the shl; for a 0 or 1 operand
	C and cnt in 1..63 that CF is clear, so this behaves as a plain add.
	neg	R32(%rbx)		C rbx = final borrow as 0 or 1
	shl	R8(%rcx), %rbx		C place the borrow just above the shifted-out bits
	adc	%r15, %rbx
	mov	%rbx, %rax
	pop	%rbx
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12

	FUNC_EXIT()
	ret
172EPILOGUE()
173