dnl  aorrlshC_n.asm revision 1.1.1.1
dnl  Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]

dnl  Contributed to the GNU project by Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_signed_limb_t carry);

C				cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 6
C AMD K6
C AMD K7
C AMD K8
C AMD K10

dnl  Stack slots of the five arguments, in the order of the prototypes:
dnl  dst, src1, src2, size, carry.  PARAM_SRC is loaded into up and
dnl  PARAM_DBLD into vp, the operand that gets shifted.  PARAM_CORB is only
dnl  read by the M4_function_c entry point.
defframe(PARAM_CORB,	20)
defframe(PARAM_SIZE,	16)
defframe(PARAM_DBLD,	12)
defframe(PARAM_SRC,	 8)
defframe(PARAM_DST,	 4)

dnl  re-use parameter space
dnl  Each argument slot becomes scratch storage once its value has been
dnl  loaded into a register: the loop counter lives in the size slot and the
dnl  incoming values of ebp, vp and up are parked in the other three so they
dnl  can be restored before returning.
define(VAR_COUNT,`PARAM_SIZE')
define(SAVE_EBP,`PARAM_DBLD')
define(SAVE_VP,`PARAM_SRC')
define(SAVE_UP,`PARAM_DST')

dnl  M = 2^LSH.  It is used as the index scale of lea, so it has to be a
dnl  scale the x86 addressing mode accepts (1, 2, 4 or 8).
dnl
dnl  LSH, RSH, M4_inst, M4_opp, M4_function and M4_function_c are used by
dnl  this file but not defined in it; they have to be defined before it is
dnl  processed.  NOTE(review): presumably a small per-shift wrapper file
dnl  defines them and then includes this one, with RSH = 32-LSH -- confirm.
define(M, eval(m4_lshift(1,LSH)))
define(`rp',  `%edi')
define(`up',  `%esi')
define(`vp',  `%ebx')

ASM_START()
	TEXT
	ALIGN(8)

C Carry-in entry point (M4_function_c).
C
C Splits the PARAM_CORB argument into the two values the shared body
C expects and then jumps into M4_function at L(start_nc):
C
C   edx = (PARAM_CORB >> LSH) & 1
C		bit 0 is later moved into CF (shr %edx) and consumed by the
C		first M4_inst of the shared body
C   eax = PARAM_CORB combined with that bit by M4_opp
C		added to the first shifted limb by lea (%eax,%ecx,M)
C
C Nothing is pushed here, so the PARAM_* offsets seen at L(start_nc) are
C the same as for a direct call of M4_function.
C
C NOTE(review): M4_opp and M4_inst come from the including file; presumably
C sub with adc for the add variant and add with sbb for the rsb variant --
C confirm against the wrapper.
PROLOGUE(M4_function_c)
deflit(`FRAME',0)
	movl	PARAM_CORB, %eax
	movl	%eax, %edx		C edx = copy of the carry argument
	shr	$LSH, %edx
	andl	$1, %edx		C edx = bit LSH of the carry argument
	M4_opp	%edx, %eax		C eax = part merged into the first limb
	jmp	L(start_nc)		C continue in the shared body
EPILOGUE()

C Main entry point (M4_function).  Its body is shared with M4_function_c,
C which jumps in at L(start_nc).
C
C State expected at L(start_nc):
C   eax		value added to the first shifted limb (0 for this entry)
C   edx		bit 0 = carry/borrow fed to the first M4_inst (0 for this entry)
C
C For each of the size limbs, from low to high, the code forms
C	t = vp[i]*M + (vp[i-1] >> RSH)		using eax for i = 0
C and stores  t M4_inst up[i]  to rp[i], chaining the carry flag from one
C limb to the next.  The value returned in eax is (last vp limb >> RSH)
C M4_inst 0, which folds in the final carry flag.
C
C Register use:
C   rp (edi)	destination pointer
C   up (esi)	pointer to the operand combined by M4_inst (PARAM_SRC)
C   vp (ebx)	pointer to the operand being shifted (PARAM_DBLD)
C   eax, ecx	current vp limb and the high bits left over from the
C		previous one; the two swap roles on alternate limbs
C   ebp, edx	result limbs; between limbs edx bit 0 also holds the saved
C		carry flag
C edi is saved with push/pop; esi, ebx and ebp are saved in the argument
C slots, see SAVE_UP, SAVE_VP and SAVE_EBP.
C
C Flag handling: mov and lea leave the flags alone, so CF produced by
C "shr %ecx" reaches the jnc and CF produced by "shr %edx" reaches the
C following M4_inst.  "shr $RSH" destroys CF, which is why the carry is
C parked in edx by "adc %edx, %edx" and recovered by "shr %edx".
C
C Limb counting: VAR_COUNT = (size+1)\2.  Each pass of L(oop) handles two
C limbs and the code after the loop handles the final limb.  When size is
C even one extra limb is handled first by entering at L(enteven).
C
C NOTE(review): size = 0 is not handled; it takes the even path with
C VAR_COUNT = 0 and the decl/jnz then does not terminate the loop.  Callers
C presumably guarantee size >= 1 -- confirm.
PROLOGUE(M4_function)
deflit(`FRAME',0)

	xor	%eax, %eax		C nothing to add to the first limb
	xor	%edx, %edx		C no carry in
L(start_nc):
	push	rp			FRAME_pushl()

	mov	PARAM_SIZE, %ecx	C size
	mov	PARAM_DST, rp
	mov	up, SAVE_UP
	incl	%ecx			C size + 1
	mov	PARAM_SRC, up
	mov	vp, SAVE_VP
	shr	%ecx			C (size+1)\2
	mov	PARAM_DBLD, vp
	mov	%ebp, SAVE_EBP
	mov	%ecx, VAR_COUNT
	jnc	L(entry)		C size odd

	shr	%edx			C size even
	mov	(vp), %ecx		C first limb of vp
	lea	4(vp), vp
	lea	(%eax,%ecx,M), %edx	C first result limb before M4_inst
	mov	%ecx, %eax		C keep limb, high bits taken at enteven
	lea	-4(up), up		C bias up and rp so that the 4(up) and
	lea	-4(rp), rp		C 4(rp) at enteven address limb 0
	jmp	L(enteven)

	ALIGN(16)
L(oop):
	lea	(%eax,%ecx,M), %ebp	C shifted limb plus bits from below
	shr	$RSH, %ecx		C high bits for the next limb
	mov	4(vp), %eax		C next limb of vp
	shr	%edx			C restore the saved carry into CF
	lea	8(vp), vp
	M4_inst	(up), %ebp
	lea	(%ecx,%eax,M), %edx	C second limb of the pair
	mov	%ebp, (rp)
L(enteven):
	M4_inst	4(up), %edx
	lea	8(up), up
	mov	%edx, 4(rp)
	adc	%edx, %edx		C save CF in bit 0 of edx
	shr	$RSH, %eax		C high bits for the next limb, kills CF
	lea	8(rp), rp
L(entry):
	mov	(vp), %ecx
	decl	VAR_COUNT
	jnz	L(oop)

	lea	(%eax,%ecx,M), %ebp	C last limb
	shr	$RSH, %ecx		C bits shifted out of the top
	shr	%edx			C restore the saved carry into CF
	mov	SAVE_VP, vp
	M4_inst	(up), %ebp
	mov	%ecx, %eax
	mov	SAVE_UP, up
	M4_inst	$0, %eax		C fold the final carry into the result
	mov	%ebp, (rp)
	mov	SAVE_EBP, %ebp
	pop	rp			FRAME_popl()
	ret
EPILOGUE()

ASM_END()