1dnl  Intel Pentium mpn_com -- mpn ones complement.
2
3dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C P5: 1.75 cycles/limb
35
36
37NAILS_SUPPORT(0-31)
38
39
40C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
41C
42C This code is similar to mpn_copyi; basically there are just some "xorl
43C $GMP_NUMB_MASK"s inserted.
44C
45C Alternatives:
46C
47C On P55 some MMX code could be 1.25 c/l (8 limb unrolled) if src and dst
48C have the same alignment mod 8, but it doesn't seem worth the trouble for
49C just that case (some plain integer code would still be needed for the
50C unaligned case).
51
C NOTE(review): every line of this file carries a decimal number fused to its
C start; that looks like a listing artifact rather than source text -- confirm
C and strip before building.
C
C mpn_com: for each of the size limbs at src, store (limb XOR GMP_NUMB_MASK)
C at the corresponding position of dst.
C
C The three arguments are read through the PARAM_* names declared just below.
C defframe is defined outside this file; presumably its second argument is
C the byte offset of the parameter from the stack pointer at entry -- TODO
C confirm.
C
C Registers: esi and edi are pushed on entry and popped again before each
C ret; eax, ecx and edx are overwritten.  No result register is set up (the
C prototype comment above declares the function void).
C
C Layout: an 8-limb unrolled loop runs while at least 8 limbs remain, then
C the 0..7 left-over limbs are handled as optional groups of 4, 2 and 1.
52defframe(PARAM_SIZE,12)
53defframe(PARAM_SRC, 8)
54defframe(PARAM_DST, 4)
55
56	TEXT
57	ALIGN(8)
58PROLOGUE(mpn_com)
59deflit(`FRAME',0)
60
61	movl	PARAM_SRC, %eax
62	movl	PARAM_SIZE, %ecx
63
	C esi and edi are the two scratch registers used below.  FRAME_pushl() is
	C defined elsewhere; presumably it records each push so that the later
	C PARAM_DST reference still resolves correctly -- TODO confirm.
64	pushl	%esi	FRAME_pushl()
65	pushl	%edi	FRAME_pushl()
66
	C eax = &src[size].  Every source load below indexes backwards from this
	C end pointer with a negative count in ecx.
67	leal	(%eax,%ecx,4), %eax
68	xorl	$-1, %ecx		C -size-1
69
70	movl	PARAM_DST, %edx
71	addl	$8, %ecx		C -size+7
72
	C Sign clear means -size+7 >= 0, i.e. size <= 7: there is no full group
	C of 8 limbs, so go straight to the tail code.
73	jns	L(end)
74
75	movl	(%edx), %esi		C fetch destination cache line
	C NOTE(review): this nop (and those in the tail) presumably exist for P5
	C instruction pairing / alignment -- not verifiable from this file.
76	nop
77
78L(top):
79	C eax	&src[size]
80	C ebx
81	C ecx	counter, limbs, negative
82	C edx	dst, incrementing
83	C esi	scratch
84	C edi	scratch
85	C ebp

	C Each pass handles 8 limbs, two at a time through esi/edi.  dst is
	C advanced by 32 bytes first, so the stores use -32..-4(%edx).  The loads
	C at -28..0(%eax,%ecx,4) are src[size+ecx-7] .. src[size+ecx].
	C
	C The value loaded by the first movl is discarded (esi is reloaded three
	C instructions later); it only touches the destination memory.
86
87	movl	28(%edx), %esi		C destination prefetch
88	addl	$32, %edx
89
90	movl	-28(%eax,%ecx,4), %esi
91	movl	-24(%eax,%ecx,4), %edi
92	xorl	$GMP_NUMB_MASK, %esi
93	xorl	$GMP_NUMB_MASK, %edi
94	movl	%esi, -32(%edx)
95	movl	%edi, -28(%edx)
96
97	movl	-20(%eax,%ecx,4), %esi
98	movl	-16(%eax,%ecx,4), %edi
99	xorl	$GMP_NUMB_MASK, %esi
100	xorl	$GMP_NUMB_MASK, %edi
101	movl	%esi, -24(%edx)
102	movl	%edi, -20(%edx)
103
104	movl	-12(%eax,%ecx,4), %esi
105	movl	-8(%eax,%ecx,4), %edi
106	xorl	$GMP_NUMB_MASK, %esi
107	xorl	$GMP_NUMB_MASK, %edi
108	movl	%esi, -16(%edx)
109	movl	%edi, -12(%edx)
110
111	movl	-4(%eax,%ecx,4), %esi
112	movl	(%eax,%ecx,4), %edi
113	xorl	$GMP_NUMB_MASK, %esi
114	xorl	$GMP_NUMB_MASK, %edi
115	movl	%esi, -8(%edx)
116	movl	%edi, -4(%edx)
117
	C Step the counter past the 8 limbs just done; keep looping while it is
	C still negative (at least 8 more limbs to go).
118	addl	$8, %ecx
119	js	L(top)
120
121
122L(end):
123	C eax	&src[size]
124	C ecx	0 to 7, representing respectively 7 to 0 limbs remaining
125	C edx	dst, next location to store
126
	C Group of 4.  After the subtract, a non-negative ecx means at most 3
	C limbs remain, so the group is skipped with ecx already re-biased to
	C 0..3 (= 3..0 limbs remaining).  Otherwise ecx is -4..-1 and the loads
	C at -12..0(%eax,%ecx,4) are the next 4 unprocessed source limbs.
127	subl	$4, %ecx
128	nop
129
130	jns	L(no4)
131
132	movl	-12(%eax,%ecx,4), %esi
133	movl	-8(%eax,%ecx,4), %edi
134	xorl	$GMP_NUMB_MASK, %esi
135	xorl	$GMP_NUMB_MASK, %edi
136	movl	%esi, (%edx)
137	movl	%edi, 4(%edx)
138
139	movl	-4(%eax,%ecx,4), %esi
140	movl	(%eax,%ecx,4), %edi
141	xorl	$GMP_NUMB_MASK, %esi
142	xorl	$GMP_NUMB_MASK, %edi
143	movl	%esi, 8(%edx)
144	movl	%edi, 12(%edx)
145
	C Advance dst past the 4 limbs stored and bring ecx back to 0..3, the
	C same encoding the skip path arrives with.
146	addl	$16, %edx
147	addl	$4, %ecx
148L(no4):
149
	C Group of 2, same scheme: here ecx is 0..3 for 3..0 limbs remaining.
	C Non-negative after the subtract means at most 1 limb remains.
150	subl	$2, %ecx
151	nop
152
153	jns	L(no2)
154
155	movl	-4(%eax,%ecx,4), %esi
156	movl	(%eax,%ecx,4), %edi
157	xorl	$GMP_NUMB_MASK, %esi
158	xorl	$GMP_NUMB_MASK, %edi
159	movl	%esi, (%edx)
160	movl	%edi, 4(%edx)
161
162	addl	$8, %edx
163	addl	$2, %ecx
164L(no2):
165
	C Here ecx is 0 or 1, meaning 1 or 0 limbs remaining.  popl does not
	C modify the flags, so the jnz still tests the result of the subl $2
	C (skip path) or addl $2 (fall-through path) above: non-zero means
	C nothing is left to do.
166	popl	%edi
167	jnz	L(done)
168
	C One limb left: -4(%eax) is src[size-1].  ecx is no longer needed as a
	C counter, so it doubles as the scratch register while esi is restored.
169	movl	-4(%eax), %ecx
170
171	xorl	$GMP_NUMB_MASK, %ecx
172	popl	%esi
173
174	movl	%ecx, (%edx)
175	ret
176
177L(done):
178	popl	%esi
179	ret
180
181EPILOGUE()
182