dnl  com.asm revision 1.1.1.1
dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.

dnl  Copyright 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.0 cycles/limb


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Each of the size limbs at src is XORed with a mask and stored to the
C same index at dst.  Limbs are 4 bytes.  The mask is all ones, shifted
C right by GMP_NAIL_BITS when nails are in use (ie. GMP_NUMB_MASK).
C
C Register use: %eax src pointer, %edx dst pointer, %ecx limb counter,
C %mm0 data, %mm7 XOR mask.  Nothing is saved or restored, and emms is
C executed before returning.  There is no check for size == 0; the load
C of the src high limb at entry is unconditional.
C
C The loop form below is necessary for the claimed speed.  It needs to be
C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
C the movq's and achieve the necessary size.
C
C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
C (offset by the size, as per the loop addressing) and one high limb
C processed separately to get alignment.
C
C The padding for the nails case is unattractive, but shouldn't cost any
C cycles.  Explicit .byte's guarantee the desired instructions, at a point
C where we're probably stalled waiting for loads anyway.
C
C Enhancements:
C
C The combination load/pxor/store might be able to be unrolled to approach
C 0.5 c/l if desired.

C Parameter offsets, in the argument order of the prototype above.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	pcmpeqd	%mm7, %mm7			C mm7 = all ones

	C Test bit 2 of dst+4*size, the address just past the top of dst.
	C Only the resulting ZF matters, %eax is reloaded with src below.
	C ZF is consumed by the jz further down; the psrld, movl, movd and
	C lea padding in between do not write the flags.
	C NOTE(review): this assumes dst is 4 byte aligned, so that bit 2
	C alone decides whether the address is 0mod8 -- confirm.
	leal	(%edx,%ecx,4), %eax
	andl	$4, %eax
ifelse(GMP_NAIL_BITS,0,,
`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK

	movl	PARAM_SRC, %eax
	movd	-4(%eax,%ecx,4), %mm0		C src high limb

ifelse(GMP_NAIL_BITS,0,,
`	C padding for alignment below
	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
')

	jz	L(aligned)			C ZF from the andl above

	C dst+4*size is 4mod8: handle the high limb on its own so that the
	C 8 byte stores in the loop land on 0mod8 addresses.
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
	decl	%ecx
	jz	L(done)				C that was the only limb
L(aligned):

	C Bias both pointers by one limb and the counter by minus one, so
	C that after the subl $2 in the loop (%eax,%ecx,4) and (%edx,%ecx,4)
	C address the top two remaining limbs with no displacement.
	addl	$4, %eax
	addl	$4, %edx
	decl	%ecx
	jz	L(one)				C exactly one limb remains

	C offset 0x30 for no nails, or 0x40 for nails
	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter
	C edx	dst

	subl	$2, %ecx
	movq	(%eax,%ecx,4), %mm0
	pxor	%mm7, %mm0
	movq	%mm0, (%edx,%ecx,4)
	jg	L(top)				C flags still from the subl

	C Here ecx is 0 with one limb left to do, or -1 with none left.
	jnz	L(done)				C if size even

L(one):
	C eax and edx are one limb past the base, so -4 is limb 0.
	movd	-4(%eax), %mm0			C src low limb
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx)			C dst low limb

L(done):
	emms					C leave MMX state before returning

	ret

EPILOGUE()
115