1dnl  Alpha ev67 mpn_hamdist -- mpn hamming distance.
2
3dnl  Copyright 2003, 2005 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C ev67: 2.5 cycles/limb
24
25
26C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
27C
28C The hope was for 2.0 c/l here, but that isn't achieved.  We're limited by
29C renaming register shortage.  Since we need 5 instructions per limb, further
30C unrolling could approach 1.5 c/l.
31C
32C The main loop processes two limbs from each operand on each iteration.  An
33C odd size is handled by processing xp[0]^yp[0] at the start.  If the size
34C is even, that result is discarded and the main loop reprocesses limb 0.
35C
36
37ASM_START()
38PROLOGUE(mpn_hamdist)
	C Hamming distance of {xp,size} and {yp,size}: the sum over all limbs
	C of ctpop(xp[i] ^ yp[i]), built up in r0.
	C
	C Each pass of the loop loads and xors two limbs from each operand
	C while taking ctpop of the two xors formed on the previous pass (or
	C by the setup code below).  The two xors left pending when the loop
	C exits are counted by the tail code.
	C
	C NOTE(review): size==0 takes the L(one) path and counts xp[0]^yp[0];
	C presumably callers guarantee size >= 1 -- confirm against the mpn
	C interface contract.
39
40	C r16	xp
41	C r17	yp
42	C r18	size
43
44	ldq	r1, 0(r16)		C L0  xp[0]
45	ldq	r2, 0(r17)		C L1  yp[0]
46	and	r18, 1, r8		C U1  1 if size odd (r8 is reused as a ctpop temporary once in the loop)
47	srl	r18, 1, r18		C U0  size, limb pairs
48
49	clr	r0			C L0  initial total
50	s8addq	r8, r17, r17		C U1  yp++ if size odd
51	s8addq	r8, r16, r16		C L1  xp++ if size odd
52	clr	r6			C U0  dummy initial xor 1
53
54	xor	r1, r2, r5		C L   initial xor 0
55	beq	r18, L(one)		C U   no limb pairs (size==1): just count r5
56
	C For even size the pointers were not advanced above, so the loop
	C loads limb 0 again; zero r5 so that limb is not counted twice.
	C This is the last use of r8 as the odd-size flag.
57	cmoveq	r8, r31, r5		C L   discard first limb if size even
58	unop				C U   no-op filler
59
60
61	ALIGN(16)
62L(top):
63	C r0	total accumulating
64	C r5	xor 0, ctpop pending (result goes to r7)
65	C r6	xor 1, ctpop pending (result goes to r8)
66	C r16	xp, incrementing
67	C r17	yp, incrementing
68	C r18	size, limb pairs, decrementing
69
70	ldq	r1, 0(r16)		C L   xp[0] of this pair
71	ldq	r2, 0(r17)		C L   yp[0] of this pair
72	ctpop	r5, r7			C U0  count previous xor 0
73	lda	r16, 16(r16)		C U   xp += 2 limbs
74
75	ldq	r3, -8(r16)		C L   xp[1] of this pair (r16 already advanced)
76	ldq	r4, 8(r17)		C L   yp[1] of this pair (r17 not yet advanced)
77	ctpop	r6, r8			C U0  count previous xor 1
78	lda	r17, 16(r17)		C U   yp += 2 limbs
79
80	ldl	r31, 256(r16)		C L	prefetch
81	ldl	r31, 256(r17)		C L	prefetch
82	xor	r1, r2, r5		C U   new xor 0, counted next pass or in the tail
83	lda	r18, -1(r18)		C U   one limb pair done
84
85	xor	r3, r4, r6		C U   new xor 1, counted next pass or in the tail
86	addq	r0, r7, r0		C L   total += ctpop of previous xor 0
87	addq	r0, r8, r0		C L   total += ctpop of previous xor 1
88	bne	r18, L(top)		C U
89
90
	C Tail: count the two xors still pending from the final pass.
91	ctpop	r6, r8			C U0
92	addq	r0, r8, r0		C L
93L(one):
	C Also entered directly from the setup code when there are no limb
	C pairs; r5 then holds xp[0]^yp[0] and r0 is zero.
94	ctpop	r5, r7			C U0
95	addq	r0, r7, r0		C L
96
97	ret	r31, (r26), 1		C L0
98
99EPILOGUE()
100ASM_END()
101