dnl  AMD64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1

dnl  Copyright 2003, 2005, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.14	(mpn_add_n + mpn_rshift need 4.125)
C K10:		 2.14	(mpn_add_n + mpn_rshift need 4.125)
C P4:		12.75
C P6-15:	 3.75

C TODO
C  * Rewrite to use indexed addressing, like addlsh1.asm and sublsh1.asm.
C  * Try to approach the cache bandwidth limit of 1.5 c/l; it should be possible.

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`vp',`%rdx')
define(`n',`%rcx')
define(`n32',`%ecx')

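C The registers above follow the System V AMD64 calling convention for the
C first four integer arguments; n32 is simply the 32-bit view of n.  The _nc
C entry points take a fifth argument, the carry-in limb, which arrives in %r8.
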
ifdef(`OPERATION_rsh1add_n', `
	define(ADDSUB,	      add)
	define(ADCSBB,	      adc)
	define(func_n,	      mpn_rsh1add_n)
	define(func_nc,	      mpn_rsh1add_nc)')
ifdef(`OPERATION_rsh1sub_n', `
	define(ADDSUB,	      sub)
	define(ADCSBB,	      sbb)
	define(func_n,	      mpn_rsh1sub_n)
	define(func_nc,	      mpn_rsh1sub_nc)')
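
C Depending on which OPERATION_* symbol is defined at build time, ADDSUB and
C ADCSBB expand to add/adc or to sub/sbb, so this single source file provides
C both the add and the subtract variants together with their _nc forms.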

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1add_nc mpn_rsh1sub_n mpn_rsh1sub_nc)

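C For reference, a C sketch of the rsh1add semantics (an illustration only,
C not part of the GMP build; the function name is hypothetical and nails
C builds are not handled).  The subtract flavour is the same with the
C additions replaced by borrowing subtractions.
C
C	#include <gmp.h>
C
C	/* rp[] = (up[] + vp[]) >> 1; returns the bit shifted out at the bottom. */
C	static mp_limb_t
C	ref_rsh1add_n (mp_limb_t *rp, const mp_limb_t *up,
C		       const mp_limb_t *vp, mp_size_t n)
C	{
C	  mp_limb_t s, cy, prev, retval;
C	  mp_size_t i;
C
C	  s = up[0] + vp[0];
C	  cy = s < up[0];		/* carry out of limb 0 */
C	  retval = s & 1;		/* low bit, shifted out and returned */
C	  prev = s;
C	  for (i = 1; i < n; i++)
C	    {
C	      s = up[i] + vp[i] + cy;
C	      cy = (s < up[i]) | (cy & (s == up[i]));
C	      rp[i - 1] = (prev >> 1) | (s << (GMP_NUMB_BITS - 1));
C	      prev = s;
C	    }
C	  rp[n - 1] = (prev >> 1) | (cy << (GMP_NUMB_BITS - 1));
C	  return retval;
C	}
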
ASM_START()
	TEXT

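C The _nc entry point: the caller passes an explicit carry-in limb (0 or 1)
C in %r8.  neg %r8 turns it into the CPU carry flag, the first limb pair is
C combined with ADCSBB, and control then joins the common code at L(ent).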
	ALIGN(16)
PROLOGUE(func_nc)
	push	%rbx

	xor	%eax, %eax
	neg	%r8			C set C flag from parameter
	mov	(up), %rbx
	ADCSBB	(vp), %rbx
	jmp	L(ent)
EPILOGUE()

	ALIGN(16)
PROLOGUE(func_n)
	push	%rbx

	xor	%eax, %eax
	mov	(up), %rbx
	ADDSUB	(vp), %rbx
L(ent):
	rcr	%rbx			C rotate, save acy
	adc	%eax, %eax		C return value

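C At this point the low bit of the first sum limb has become the return value
C (rcr moved it into CF, adc copied it into %eax), and %rbx holds that limb
C shifted right by one with the addition carry parked in its top bit.  The
C code below peels off n mod 4 limbs so the main loop can always do four
C limbs per iteration.
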
	mov	n32, R32(%r11)
	and	$3, R32(%r11)

	cmp	$1, R32(%r11)
	je	L(do)			C jump if n = 1 5 9 ...

L(n1):	cmp	$2, R32(%r11)
	jne	L(n2)			C jump unless n = 2 6 10 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r10
	ADCSBB	8(vp), %r10
	lea	8(up), up
	lea	8(vp), vp
	lea	8(rp), rp
	rcr	%r10
	rcr	%rbx
	mov	%rbx, -8(rp)
	jmp	L(cj1)

L(n2):	cmp	$3, R32(%r11)
	jne	L(n3)			C jump unless n = 3 7 11 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r9
	mov	16(up), %r10
	ADCSBB	8(vp), %r9
	ADCSBB	16(vp), %r10
	lea	16(up), up
	lea	16(vp), vp
	lea	16(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%rbx
	mov	%rbx, -16(rp)
	jmp	L(cj2)

L(n3):	dec	n			C come here for n = 4 8 12 ...
	add	%rbx, %rbx		C rotate carry limb, restore acy
	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	ADCSBB	24(vp), %r10
	lea	24(up), up
	lea	24(vp), vp
	lea	24(rp), rp
	rcr	%r10
	rcr	%r9
	rcr	%r8
	rcr	%rbx
	mov	%rbx, -24(rp)
	mov	%r8, -16(rp)
L(cj2):	mov	%r9, -8(rp)
L(cj1):	mov	%r10, %rbx

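C The main loop below handles four limbs per iteration; the shr by 2 turns
C the remaining limb count into an iteration count.  On entry to each
C iteration %rbx holds the most recently produced limb, already shifted right
C by one, with the saved carry in its top bit; add %rbx,%rbx re-exposes that
C carry in CF for the next ADCSBB chain, and the trailing rcr chain performs
C the 1-bit right shift while parking the new carry in the top of %r11.
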
L(do):
	shr	$2, n			C				4
	je	L(end)			C				2
	ALIGN(16)
L(top):	add	%rbx, %rbx		C rotate carry limb, restore acy

	mov	8(up), %r8
	mov	16(up), %r9
	ADCSBB	8(vp), %r8
	ADCSBB	16(vp), %r9
	mov	24(up), %r10
	mov	32(up), %r11
	ADCSBB	24(vp), %r10
	ADCSBB	32(vp), %r11

	lea	32(up), up
	lea	32(vp), vp

	rcr	%r11			C rotate, save acy
	rcr	%r10
	rcr	%r9
	rcr	%r8

	rcr	%rbx
	mov	%rbx, (rp)
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
	mov	%r10, 24(rp)
	mov	%r11, %rbx

	lea	32(rp), rp
	dec	n
	jne	L(top)

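C The top result limb left in %rbx already has the final carry (for the add
C variant) or borrow (for the subtract variant) in its most significant bit,
C so a plain store completes rp[n-1].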
L(end):	mov	%rbx, (rp)
	pop	%rbx
	ret
EPILOGUE()