com.asm revision 1.1.1.1
1dnl  Intel Pentium mpn_com -- mpn ones complement.
2
3dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C P5: 1.75 cycles/limb
24
25
26NAILS_SUPPORT(0-31)
27
28
29C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
30C
31C This code is similar to mpn_copyi; basically there are just some "xorl
32C $GMP_NUMB_MASK" instructions inserted.
33C
34C Alternatives:
35C
36C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
37C have the same alignment mod 8, but it doesn't seem worth the trouble for
38C just that case (there'd need to be some plain integer code available too
39C for the unaligned case).
40
41defframe(PARAM_SIZE,12)	C 3rd argument: size, counted in limbs
42defframe(PARAM_SRC, 8)	C 2nd argument: src
43defframe(PARAM_DST, 4)	C 1st argument: dst
44
45	TEXT
46	ALIGN(8)
47PROLOGUE(mpn_com)
48deflit(`FRAME',0)
	C Overview: every destination limb is the corresponding source limb
	C XORed with GMP_NUMB_MASK.  eax is set to &src[size] and ecx to a
	C negative limb counter, so (%eax,%ecx,4) walks forward through src as
	C ecx counts up towards zero.  The main loop handles 8 limbs per pass,
	C then the tail handles 4, 2 and 1 remaining limbs as needed.
	C
	C esi and edi are saved on entry and restored on both exit paths.
	C NOTE(review): FRAME_pushl() presumably keeps the PARAM_* offsets
	C valid after each pushl (PARAM_DST is read after the pushes) --
	C confirm against the m4 definitions.
49
50	movl	PARAM_SRC, %eax		C eax = src
51	movl	PARAM_SIZE, %ecx	C ecx = size
52
53	pushl	%esi	FRAME_pushl()	C save esi, used as scratch below
54	pushl	%edi	FRAME_pushl()	C save edi, used as scratch below
55
56	leal	(%eax,%ecx,4), %eax	C eax = &src[size], 4 bytes per limb
57	xorl	$-1, %ecx		C -size-1
58
59	movl	PARAM_DST, %edx		C edx = dst
60	addl	$8, %ecx		C -size+7
61
62	jns	L(end)			C size <= 7: no full group of 8, go to the tail
63
64	movl	(%edx), %esi		C fetch destination cache line
65	nop				C NOTE(review): presumably padding for Pentium pairing -- confirm
66
67L(top):
68	C eax	&src[size]
69	C ebx
70	C ecx	counter, limbs, negative
71	C edx	dst, incrementing
72	C esi	scratch
73	C edi	scratch
74	C ebp
75
76	movl	28(%edx), %esi		C destination prefetch; value discarded, esi is reloaded below
77	addl	$32, %edx		C step dst past this group of 8; stores use negative offsets
78
79	movl	-28(%eax,%ecx,4), %esi	C limbs 0 and 1 of this group
80	movl	-24(%eax,%ecx,4), %edi
81	xorl	$GMP_NUMB_MASK, %esi
82	xorl	$GMP_NUMB_MASK, %edi
83	movl	%esi, -32(%edx)
84	movl	%edi, -28(%edx)
85
86	movl	-20(%eax,%ecx,4), %esi	C limbs 2 and 3
87	movl	-16(%eax,%ecx,4), %edi
88	xorl	$GMP_NUMB_MASK, %esi
89	xorl	$GMP_NUMB_MASK, %edi
90	movl	%esi, -24(%edx)
91	movl	%edi, -20(%edx)
92
93	movl	-12(%eax,%ecx,4), %esi	C limbs 4 and 5
94	movl	-8(%eax,%ecx,4), %edi
95	xorl	$GMP_NUMB_MASK, %esi
96	xorl	$GMP_NUMB_MASK, %edi
97	movl	%esi, -16(%edx)
98	movl	%edi, -12(%edx)
99
100	movl	-4(%eax,%ecx,4), %esi	C limbs 6 and 7
101	movl	(%eax,%ecx,4), %edi
102	xorl	$GMP_NUMB_MASK, %esi
103	xorl	$GMP_NUMB_MASK, %edi
104	movl	%esi, -8(%edx)
105	movl	%edi, -4(%edx)
106
107	addl	$8, %ecx		C 8 limbs done
108	js	L(top)			C still negative: at least 8 more limbs remain
109
110
111L(end):
112	C eax	&src[size]
113	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
114	C edx	dst, next location to store
115
116	subl	$4, %ecx		C negative iff at least 4 limbs remain
117	nop
118
119	jns	L(no4)			C 3 or fewer limbs remain: skip the group of 4
120
121	movl	-12(%eax,%ecx,4), %esi	C first two of the next 4 limbs
122	movl	-8(%eax,%ecx,4), %edi
123	xorl	$GMP_NUMB_MASK, %esi
124	xorl	$GMP_NUMB_MASK, %edi
125	movl	%esi, (%edx)
126	movl	%edi, 4(%edx)
127
128	movl	-4(%eax,%ecx,4), %esi	C last two of the next 4 limbs
129	movl	(%eax,%ecx,4), %edi
130	xorl	$GMP_NUMB_MASK, %esi
131	xorl	$GMP_NUMB_MASK, %edi
132	movl	%esi, 8(%edx)
133	movl	%edi, 12(%edx)
134
135	addl	$16, %edx		C advance dst by 4 limbs
136	addl	$4, %ecx		C undo the subl above
137L(no4):
	C ecx	0 to 3, representing respectively 3 to 0 limbs remaining
	C	(holds on both the branch-taken and the fall-through path)
138
139	subl	$2, %ecx		C negative iff at least 2 limbs remain
140	nop
141
142	jns	L(no2)			C 1 or 0 limbs remain: skip the group of 2
143
144	movl	-4(%eax,%ecx,4), %esi	C next 2 limbs
145	movl	(%eax,%ecx,4), %edi
146	xorl	$GMP_NUMB_MASK, %esi
147	xorl	$GMP_NUMB_MASK, %edi
148	movl	%esi, (%edx)
149	movl	%edi, 4(%edx)
150
151	addl	$8, %edx		C advance dst by 2 limbs
152	addl	$2, %ecx		C undo the subl above; also sets the flags tested at L(no2)
153L(no2):
	C ecx	0 or 1, representing respectively 1 or 0 limbs remaining
	C The zero flag here comes from the subl $2 (branch taken) or the
	C addl $2 (fall through); it is set iff exactly one limb remains.
154
155	popl	%edi			C restore edi; popl does not modify the flags
156	jnz	L(done)			C ecx != 0: nothing left to do
157
158	movl	-4(%eax), %ecx		C final limb, src[size-1]
159
160	xorl	$GMP_NUMB_MASK, %ecx
161	popl	%esi			C restore esi
162
163	movl	%ecx, (%edx)		C store the final limb
164	ret
165
166L(done):
167	popl	%esi			C restore esi
168	ret
169
170EPILOGUE()
171