dnl  AMD64 mpn_rsh1add_n -- rp[] = (up[] + vp[]) >> 1
dnl  AMD64 mpn_rsh1sub_n -- rp[] = (up[] - vp[]) >> 1

dnl  Copyright 2003, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C AMD K10	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C Intel P4	12.75
C Intel core2	 3.75
C Intel NMH	 4.4
C Intel SBR	 ?
C Intel atom	 ?
C VIA nano	 3.25

C TODO
C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.

C INPUT PARAMETERS
C   rp  result area
C   up  first source area
C   vp  second source area
C   n   limb count
define(`rp', `%rdi')
define(`up', `%rsi')
define(`vp', `%rdx')
define(`n',`  %rcx')

C One source file implements both operations: depending on which
C OPERATION_* symbol the build defines, ADDSUB/ADCSBB expand to
C add/adc or sub/sbb, and the entry points get the matching names.
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
68
ASM_START()
	TEXT
	ALIGN(16)

C func_nc: same as func_n below, but with an explicit carry/borrow-in
C passed as the 5th argument (%r8 under STD64; stack slot under DOS64).
C The carry-in is folded into the lowest limb here, then control joins
C the shared code at L(ent) in func_n.

PROLOGUE(func_nc)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
	push	%rbx			C callee-saved; used as carry-save limb

	xor	R32(%rax), R32(%rax)	C rax = 0; also clears CF
	neg	%r8			C set C flag from parameter
	mov	(up), %rbx
	ADCSBB	(vp), %rbx		C lowest limb, with carry-in applied
	jmp	L(ent)
EPILOGUE()
83
	ALIGN(16)

C func_n: rp[] = (up[] op vp[]) >> 1, where op is add or sub per the
C OPERATION_* selection above.  Returns (in %rax) the bit shifted out,
C i.e. the low bit of the un-shifted sum/difference.
C
C Method: run the limbwise ADCSBB chain, then fold the final carry/borrow
C back in with a right-rotate (rcr) chain over the group's limbs.
C Between groups, the most significant processed limb is kept in %rbx
C with the group's carry saved in its top bit; the `add %rbx, %rbx' at
C the head of the next group restores that carry into CF and re-exposes
C the limb for the next rcr pass.  The main loop is unrolled 4x; the
C entry cases below handle n mod 4 limbs first.

PROLOGUE(func_n)
	FUNC_ENTRY(4)
	push	%rbx			C callee-saved; used as carry-save limb

	xor	R32(%rax), R32(%rax)	C rax = 0; also clears CF
	mov	(up), %rbx
	ADDSUB	(vp), %rbx		C lowest limb, no carry-in
L(ent):
	rcr	%rbx			C rotate, save acy
	adc	R32(%rax), R32(%rax)	C return value = bit shifted out

	mov	R32(n), R32(%r11)
	and	$3, R32(%r11)		C n mod 4 selects the entry case

	cmp	$1, R32(%r11)
	je	L(do)			C jump if n = 1 5 9 ...

L(n1):	cmp	$2, R32(%r11)
	jne	L(n2)			C jump unless n = 2 6 10 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r10
	ADCSBB	8(vp), %r10
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	rcr	%r10			C rotate, save acy
	rcr	%rbx			C finish result limb 0
	mov	%rbx, -8(rp)
	jmp	L(cj1)

L(n2):	cmp	$3, R32(%r11)
	jne	L(n3)			C jump unless n = 3 7 11 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r9
	mov	16(up), %r10
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	lea	16(up), up
	lea	16(vp), vp
	lea	16(rp), rp
	rcr	%r10			C rotate, save acy
	rcr	%r9
	rcr	%rbx			C finish result limb 0
	mov	%rbx, -16(rp)
	jmp	L(cj2)

L(n3):	dec	n			C come here for n = 4 8 12 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	ADCSBB	24(vp), %r10
	lea	24(up), up
	lea	24(vp), vp
	lea	24(rp), rp
	rcr	%r10			C rotate, save acy
	rcr	%r9
	rcr	%r8
	rcr	%rbx			C finish result limb 0
	mov	%rbx, -24(rp)
	mov	%r8, -16(rp)
L(cj2):	mov	%r9, -8(rp)
L(cj1):	mov	%r10, %rbx		C top limb (carry in msb) for next group

L(do):
	shr	$2, n			C remaining quads		4
	je	L(end)			C				2
	ALIGN(16)
L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy

	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	mov	32(up), %r11
	ADCSBB	24(vp), %r10
	ADCSBB	32(vp), %r11

	lea	32(up), up
	lea	32(vp), vp

	rcr	%r11			C rotate, save acy
	rcr	%r10
	rcr	%r9
	rcr	%r8

	rcr	%rbx			C finish previous group's top limb
	mov	%rbx, (rp)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	mov	%r10, 24(rp)
	mov	%r11, %rbx		C carry-save limb for next iteration

	lea	32(rp), rp
	dec	n
	jne	L(top)

L(end):	mov	%rbx, (rp)		C store final (most significant) limb
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
190