1dnl  AMD64 mpn_addlsh_n and mpn_rsblsh_n.  R = V2^k +- U.
2dnl  ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
3dnl  subtracts the shifted operand from the unshifted operand.)
4
5dnl  Copyright 2006 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24
25C	     cycles/limb
26C K8,K9:	 3.25	(mpn_lshift + mpn_add_n costs about 4.1 c/l)
27C K10:		 3.25	(mpn_lshift + mpn_add_n costs about 4.1 c/l)
28C P4:		14
29C P6-15:	 4
30
31C This was written quickly and not optimized at all.  Surely one could get
32C closer to 3 c/l or perhaps even under 3 c/l.  Ideas:
33C   1) Use indexing to save the 3 LEA
34C   2) Write reasonable feed-in code
35C   3) Be more clever about register usage
36C   4) Unroll more; CL negation and carry save/restore cost a lot now
37C   5) Reschedule
38
39C INPUT PARAMETERS
C Symbolic names for the five argument registers.  The function body stores
C result limbs through rp, reads limbs through up and vp, copies n into rax
C before reusing rcx, and takes the shift count from r8.
40define(`rp',	`%rdi')	C result limbs are stored here
41define(`up',	`%rsi')	C limbs used unshifted, as the memory operand of ADDSUBC
42define(`vp',	`%rdx')	C limbs that are shifted left before being combined
43define(`n',	`%rcx')	C limb count; copied to rax, then rcx holds the shift count
44define(`cnt',	`%r8')	C shift count; the body reads r8d directly, not this name
45
C Exactly one of the two OPERATION_ symbols is expected to be defined by the
C build (NOTE(review): set outside this file -- confirm).  It selects the
C carry-propagating instruction and the entry point name.
C   addlsh_n: ADDSUBC is adc, so each result limb is shifted-v + u + carry
C   rsblsh_n: ADDSUBC is sbb, so each result limb is shifted-v - u - borrow
46ifdef(`OPERATION_addlsh_n',`
47  define(ADDSUBC,       `adc')
48  define(func, mpn_addlsh_n)
49')
50ifdef(`OPERATION_rsblsh_n',`
51  define(ADDSUBC,       `sbb')
52  define(func, mpn_rsblsh_n)
53')
54
C NOTE(review): MULFUNC_PROLOGUE is defined elsewhere; it presumably lists the
C entry points this one source file can be built as -- confirm.
55MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
56
57ASM_START()
58	TEXT
59	ALIGN(16)
C func(rp, up, vp, n, cnt): for each of the n limbs, shift the vp limb left
C by cnt bits (pulling in the bits shifted out of the previous vp limb), then
C combine it with the matching up limb using ADDSUBC (adc or sbb, selected by
C the OPERATION_ ifdefs earlier in this file) and store the result at rp.
C
C Register roles inside the function:
C   rax      limb count, decremented by 4 per pass of the unrolled loop
C   ebx      saved carry flag, kept as 0 or -1
C   cl       shift count (negated temporarily for the right shifts)
C   r15      bits shifted out of the most recent vp limb
C   r8-r11   vp limbs shifted left, then combined with up and stored
C   r12-r14  copies of vp limbs used for the right shifts
C   r11d     also the iteration counter of the one-limb tail loop
C
C Returns in rax the bits shifted out of the last vp limb, plus the final
C carry (adc) or minus the final borrow (sbb).
C
C NOTE(review): the neg/shr pairing presumably requires 1 <= cnt <= 63; with a
C count of 0 the right shift would leave the whole limb in r15 -- confirm
C against callers.
60PROLOGUE(func)
61
C Save the callee-saved registers that are used as scratch below.
62	push	%r12
63	push	%r13
64	push	%r14
65	push	%r15
66	push	%rbx
67
68	mov	n, %rax			C rax = limb count; rcx is reused just below
69	xor	%ebx, %ebx		C clear carry save register
70	mov	%r8d, %ecx		C shift count; only cl is used by the shifts
71	xor	%r15d, %r15d		C limb carry
72
C Handle n mod 4 limbs one at a time so the unrolled loop sees a multiple of 4.
73	mov	%eax, %r11d
74	and	$3, %r11d		C r11d = n mod 4
75	je	L(4)			C no leftover limbs, skip the one-limb loop
76	sub	$1, %r11d		C bias so the sub/jnc below runs n mod 4 times
77
C One-limb loop.
78L(oopette):
79	mov	0(vp), %r8
80	mov	%r8, %r12		C keep an unshifted copy for the right shift
81	shl	%cl, %r8
82	or	%r15, %r8		C bring in bits shifted out of the previous limb
83	neg	%cl			C cl = -cnt; shr uses it modulo 64, i.e. 64-cnt
84	mov	%r12, %r15
85	shr	%cl, %r15		C r15 = bits that fell off the top of this limb
86	neg	%cl			C back to cnt for the next left shift
87	add	%ebx, %ebx		C restore carry flag (ebx is 0 or -1)
88	ADDSUBC	0(up), %r8
89	mov	%r8, 0(rp)
90	sbb	%ebx, %ebx		C save carry flag, the sub below overwrites it
91	lea	8(up), up
92	lea	8(vp), vp
93	lea	8(rp), rp
94	sub	$1, %r11d
95	jnc	L(oopette)		C loop until the counter borrows
96
C rax still holds the full n here; subtracting 4 before and after each pass
C makes the unrolled loop run floor(n/4) times.
97L(4):
98	sub	$4, %rax
99	jc	L(end)			C fewer than 4 limbs in total, nothing more to do
100
C Four-limb unrolled loop.
101L(oop):
102	mov	0(vp), %r8
103	mov	%r8, %r12		C copy of limb 0, supplies low bits of limb 1
104	mov	8(vp), %r9
105	mov	%r9, %r13		C copy of limb 1, supplies low bits of limb 2
106	mov	16(vp), %r10
107	mov	%r10, %r14		C copy of limb 2, supplies low bits of limb 3
108	mov	24(vp), %r11
109
110	shl	%cl, %r8
111	shl	%cl, %r9
112	shl	%cl, %r10
113	or	%r15, %r8		C bits carried in from the previous limb
114	mov	%r11, %r15		C r15 is free now; copy of limb 3 for later
115	shl	%cl, %r11
116
117	neg	%cl			C cl = -cnt; shr uses it modulo 64, i.e. 64-cnt
118
119	shr	%cl, %r12
120	shr	%cl, %r13
121	shr	%cl, %r14
122	shr	%cl, %r15		C used next loop
123
C Merge each limb with the bits shifted out of the limb below it.
124	or	%r12, %r9
125	or	%r13, %r10
126	or	%r14, %r11
127
128	neg	%cl			C back to cnt for the next left shifts
129
130	add	%ebx, %ebx		C restore carry flag
131
132	ADDSUBC	0(up), %r8
133	ADDSUBC	8(up), %r9
134	ADDSUBC	16(up), %r10
135	ADDSUBC	24(up), %r11
136
137	mov	%r8, 0(rp)
138	mov	%r9, 8(rp)
139	mov	%r10, 16(rp)
140	mov	%r11, 24(rp)
141
142	sbb	%ebx, %ebx		C save carry flag
143
144	lea	32(up), up
145	lea	32(vp), vp
146	lea	32(rp), rp
147
148	sub	$4, %rax
149	jnc	L(oop)			C loop until the limb counter borrows
C Fold the saved carry or borrow into the bits shifted out of the last limb
C and return the result.
150L(end):
151	add	%ebx, %ebx		C restore carry flag
152	ADDSUBC	$0, %r15		C r15 + carry (adc) or r15 - borrow (sbb)
153	mov	%r15, %rax		C return value
C Restore saved registers in the reverse of the push order.
154	pop	%rbx
155	pop	%r15
156	pop	%r14
157	pop	%r13
158	pop	%r12
159
160	ret
161EPILOGUE()
162