aorrlshC_n.asm revision 1.1.1.2
1dnl  Intel Atom mpn_addlshC_n/mpn_rsblshC_n -- rp[] = (vp[] << C) +- up[]
2
3dnl  Contributed to the GNU project by Marco Bodrato.
4
5dnl  Copyright 2011 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
36C                          mp_size_t size);
37C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
38C                           mp_size_t size, mp_limb_t carry);
39C mp_limb_t mpn_rsblshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
40C                          mp_size_t size);
41C mp_limb_t mpn_rsblshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
42C                           mp_size_t size, mp_signed_limb_t carry);
43
44C				cycles/limb
45C P5
46C P6 model 0-8,10-12
47C P6 model 9  (Banias)
48C P6 model 13 (Dothan)
49C P4 model 0  (Willamette)
50C P4 model 1  (?)
51C P4 model 2  (Northwood)
52C P4 model 3  (Prescott)
53C P4 model 4  (Nocona)
54C Intel Atom			 6
55C AMD K6
56C AMD K7
57C AMD K8
58C AMD K10
59
C Stack offsets of the incoming arguments, in the order of the prototypes
C above: dst, src1, src2, size and, for the carry-in entry point only, carry.
60defframe(PARAM_CORB,	20)	C carry argument, read only by the carry-in entry
61defframe(PARAM_SIZE,	16)	C size, loaded into ecx
62defframe(PARAM_DBLD,	12)	C src2, loaded into vp (the operand scaled by M)
63defframe(PARAM_SRC,	 8)	C src1, loaded into up
64defframe(PARAM_DST,	 4)	C dst, loaded into rp
65
66dnl  re-use parameter space
C Each argument slot is recycled as scratch storage. The function body always
C reads the argument out of a slot before storing the saved value into it.
67define(VAR_COUNT,`PARAM_SIZE')	C loop counter kept in memory
68define(SAVE_EBP,`PARAM_DBLD')	C saved ebp
69define(SAVE_VP,`PARAM_SRC')	C saved vp register (ebx)
70define(SAVE_UP,`PARAM_DST')	C saved up register (esi)
71
C M = 1 << LSH. It is used as the index scale factor of the lea instructions
C below, which is how the left shift is performed, so only values that x86
C addressing accepts as a scale (1, 2, 4, 8) can assemble. LSH is not defined
C in this file.
72define(M, eval(m4_lshift(1,LSH)))
73define(`rp',  `%edi')	C destination pointer
74define(`up',  `%esi')	C pointer to the unshifted source operand
75define(`vp',  `%ebx')	C pointer to the shifted source operand
76
C ASM_START, TEXT and ALIGN are not defined in this file.
77ASM_START()
78	TEXT
79	ALIGN(8)
80
C Carry-in entry point: the only code that reads the fifth argument,
C PARAM_CORB. It prepares the two values the shared body at L(start_nc)
C consumes and then jumps into that body:
C   eax = value added to the first scaled limb by lea (%eax,%ecx,M)
C   edx = word whose low bit is moved into the carry flag by shr %edx
C M4_function_c, M4_opp and LSH are supplied from outside this file.
C NOTE(review): M4_opp presumably removes or compensates bit LSH of the carry
C argument in eax (sub for the add variant, add for the rsb variant) - confirm
C against the wrapper files that include this one.
81PROLOGUE(M4_function_c)
82deflit(`FRAME',0)
83	movl	PARAM_CORB, %eax	C eax = carry argument
84	movl	%eax, %edx
85	shr	$LSH, %edx
86	andl	$1, %edx		C edx = bit LSH of the carry argument
87	M4_opp	%edx, %eax		C apply M4_opp with that bit to eax
88	jmp	L(start_nc)		C continue in the shared body of M4_function
89EPILOGUE()
90
C Main entry point and shared body. Working upwards from element 0, each
C limb of vp[] is scaled by M and merged with the bits carried over from the
C previous limb, combined with the matching limb of up[] through M4_inst, and
C stored to rp[]. The value left in eax at ret is the bits shifted out of the
C last limb after a final M4_inst with zero.
C
C Register use:
C   rp (edi), up (esi), vp (ebx)   running pointers
C   eax, ecx   alternate between the limb just loaded and the bits carried
C              over from the previous limb
C   ebp, edx   result limbs being built. edx also parks the carry flag in
C              its bit 0 between passes, since the shr by RSH clobbers flags
C   VAR_COUNT  remaining loop passes, kept in a stack slot
C
C The loop handles two limbs per pass and the tail handles one more. An even
C size first processes a single limb through L(enteven), an odd size starts
C at L(entry).
C
C NOTE(review): M4_inst, LSH and RSH are defined outside this file. The carry
C handling implies M4_inst is a carry-consuming add or subtract (adc or sbb)
C and RSH is presumably 32-LSH - confirm against the including wrappers.
91PROLOGUE(M4_function)
92deflit(`FRAME',0)
93
94	xor	%eax, %eax		C nothing to merge into the first limb
95	xor	%edx, %edx		C bit 0 clear, so the initial carry flag is 0
96L(start_nc):
97	push	rp			FRAME_pushl()
98
99	mov	PARAM_SIZE, %ecx	C size
100	mov	PARAM_DST, rp
101	mov	up, SAVE_UP		C same slot as PARAM_DST, which was just read
102	incl	%ecx			C size + 1
103	mov	PARAM_SRC, up
104	mov	vp, SAVE_VP		C same slot as PARAM_SRC, which was just read
105	shr	%ecx			C (size+1)\2, CF = low bit of size+1
106	mov	PARAM_DBLD, vp
107	mov	%ebp, SAVE_EBP		C same slot as PARAM_DBLD, which was just read
108	mov	%ecx, VAR_COUNT		C same slot as PARAM_SIZE, already in ecx
109	jnc	L(entry)		C size odd, the movs since shr left CF intact
110
111	shr	%edx			C size even, CF = incoming carry from edx bit 0
112	mov	(vp), %ecx		C first limb of vp
113	lea	4(vp), vp
114	lea	(%eax,%ecx,M), %edx	C edx = eax + first limb * M, lea keeps CF
115	mov	%ecx, %eax		C keep the limb, shifted by RSH at L(enteven)
116	lea	-4(up), up		C bias up and rp by one limb so that the
117	lea	-4(rp), rp		C 4(up) and 4(rp) accesses hit element 0
118	jmp	L(enteven)
119
120	ALIGN(16)
121L(oop):
122	lea	(%eax,%ecx,M), %ebp	C ebp = carried-over bits + limb * M
123	shr	$RSH, %ecx		C bits of this limb carried into the next one
124	mov	4(vp), %eax		C second limb of this pass
125	shr	%edx			C CF = carry parked in edx bit 0
126	lea	8(vp), vp
127	M4_inst	(up), %ebp		C first result limb, consumes and sets CF
128	lea	(%ecx,%eax,M), %edx	C edx = carried-over bits + second limb * M
129	mov	%ebp, (rp)
130L(enteven):
131	M4_inst	4(up), %edx		C second result limb, consumes and sets CF
132	lea	8(up), up
133	mov	%edx, 4(rp)
134	adc	%edx, %edx		C park CF in edx bit 0 before shr clobbers it
135	shr	$RSH, %eax		C bits of the second limb carried onwards
136	lea	8(rp), rp
137L(entry):
138	mov	(vp), %ecx		C next limb, or the last one when the loop ends
139	decl	VAR_COUNT
140	jnz	L(oop)
141
C Tail: one remaining limb, then build the return value.
142	lea	(%eax,%ecx,M), %ebp	C ebp = carried-over bits + last limb * M
143	shr	$RSH, %ecx		C bits shifted out of the last limb
144	shr	%edx			C CF = carry parked in edx bit 0
145	mov	SAVE_VP, vp		C restore ebx, mov leaves CF alone
146	M4_inst	(up), %ebp		C last result limb
147	mov	%ecx, %eax
148	mov	SAVE_UP, up		C restore esi, mov leaves CF alone
149	M4_inst	$0, %eax		C fold the final carry into the return value
150	mov	%ebp, (rp)
151	mov	SAVE_EBP, %ebp
152	pop	rp			FRAME_popl()
153	ret				C return value is in eax
154EPILOGUE()
155
156ASM_END()
157