dnl  Intel Pentium-4 mpn_rsh1add_n -- mpn (x+y)/2

dnl  Copyright 2001-2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C        cycles/limb (approx)
C      dst!=src1,2  dst==src1  dst==src2
C P4:      4.5         6.5        6.5


C mp_limb_t mpn_rsh1add_n (mp_ptr wp, mp_srcptr xp, mp_srcptr yp,
C                          mp_size_t size);
C
C Store (x+y)/2 at wp, where x and y are the size-limb numbers at xp and yp
C (32-bit limbs, least significant limb first).  The carry out of the
C addition is not lost: it ends up as the top bit of wp[size-1].  The bit
C shifted out at the bottom, (xp[0]+yp[0]) & 1, is the return value in eax.
C
C size must be at least 1: xp[0] and yp[0] are read unconditionally and the
C loop counter is set up as -(size-1).
C
C Registers: eax, ecx, edx, mm0, mm1, mm2 and the flags are overwritten.
C ebx and esi are used as well, but are saved on entry and restored before
C returning.  emms is executed before the return.
C
C The slightly strange combination of indexing and pointer incrementing
C that's used seems to work best.  Not sure why, but for instance leal
C incrementing on %esi is a 1 or 2 cycle slowdown.
C
C The dependent chain is paddq combining the carry and next (shifted) part,
C plus psrlq to move the new carry down.  That, and just 4 mmx instructions
C in total, makes 4 c/l the target speed, which is almost achieved for
C separate src/dst but when src==dst the write combining anomalies slow it
C down.

defframe(PARAM_SIZE, 16)
defframe(PARAM_YP,   12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

dnl  Re-use parameter space for the register saves.  Each slot is written
dnl  only after the parameter it originally held has been loaded.
define(SAVE_EBX,`PARAM_XP')
define(SAVE_ESI,`PARAM_YP')

	TEXT
	ALIGN(8)

PROLOGUE(mpn_rsh1add_n)
deflit(`FRAME',0)

	movl	PARAM_XP, %edx		C edx = xp
	movl	%ebx, SAVE_EBX		C xp already fetched, slot is free

	movl	PARAM_YP, %ebx		C ebx = yp
	movl	%esi, SAVE_ESI		C yp already fetched, slot is free

	movl	PARAM_WP, %esi		C esi = wp

	movd	(%edx), %mm0		C xp[0]

	movd	(%ebx), %mm1		C yp[0]
	movl	PARAM_SIZE, %ecx

	movl	(%edx), %eax		C xp[0]

	addl	(%ebx), %eax		C xp[0]+yp[0], only bit 0 is wanted

	paddq	%mm1, %mm0		C xp[0]+yp[0], full 33 bit sum
	leal	(%esi,%ecx,4), %esi	C wp end
	negl	%ecx			C -size

	psrlq	$1, %mm0		C (xp[0]+yp[0])/2
	and	$1, %eax		C return value, rsh1 bit of xp[0]+yp[0]
	addl	$1, %ecx		C -(size-1)
	jz	L(done)			C size==1, only the top limb to store


L(top):
	C eax	return value
	C ebx	&yp[i], advanced one limb per iteration
	C ecx	counter, limbs, -(size-1) to -1 inclusive
	C edx	&xp[i], advanced one limb per iteration
	C esi	wp end
	C mm0	carry (32 bits), the part of dst[i] known so far

	movd	4(%edx), %mm1	C xp[i+1]
	movd	4(%ebx), %mm2	C yp[i+1]
	leal	4(%edx), %edx
	leal	4(%ebx), %ebx
	paddq	%mm2, %mm1		C xp[i+1]+yp[i+1]
	psllq	$31, %mm1		C low bit at 31, further 32 above

	paddq	%mm1, %mm0		C 31 and carry from prev add
	movd	%mm0, -4(%esi,%ecx,4)	C low ready to store dst[i]

	psrlq	$32, %mm0		C high becomes new carry

	addl	$1, %ecx
	jnz	L(top)


L(done):
	C mm0 holds the top limb, including the carry out of the addition
	movd	%mm0, -4(%esi)		C dst[size-1]
	movl	SAVE_EBX, %ebx

	movl	SAVE_ESI, %esi
	emms				C leave the mmx state clean
	ret

EPILOGUE()
