1219820Sjeffdnl  Alpha ev67 mpn_hamdist -- mpn hamming distance.
2219820Sjeff
3219820Sjeffdnl  Copyright 2003, 2005 Free Software Foundation, Inc.
4219820Sjeff
5219820Sjeffdnl  This file is part of the GNU MP Library.
6219820Sjeffdnl
7219820Sjeffdnl  The GNU MP Library is free software; you can redistribute it and/or
8219820Sjeffdnl  modify it under the terms of the GNU Lesser General Public License as
9219820Sjeffdnl  published by the Free Software Foundation; either version 3 of the
10219820Sjeffdnl  License, or (at your option) any later version.
11219820Sjeffdnl
12219820Sjeffdnl  The GNU MP Library is distributed in the hope that it will be useful,
13219820Sjeffdnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14219820Sjeffdnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15219820Sjeffdnl  Lesser General Public License for more details.
16219820Sjeffdnl
17219820Sjeffdnl  You should have received a copy of the GNU Lesser General Public License
18219820Sjeffdnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19219820Sjeff
20219820Sjeffinclude(`../config.m4')
21219820Sjeff
22219820Sjeff
23219820SjeffC ev67: 2.5 cycles/limb
24219820Sjeff
25219820Sjeff
26219820SjeffC unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
27219820SjeffC
28219820SjeffC The hope was for 2.0 c/l here, but that isn't achieved.  We're limited by
29219820SjeffC renaming register shortage.  Since we need 5 instructions per limb, further
30219820SjeffC unrolling could approach 1.25 c/l (5 instructions over 4 issue slots).
31219820SjeffC
32219820SjeffC The main loop processes two limbs from each operand on each iteration.  An
33219820SjeffC odd size is handled by processing xp[0]^yp[0] at the start.  If the size
34219820SjeffC is even that result is discarded, and is repeated by the main loop.
35219820SjeffC
36219820Sjeff
37219820SjeffASM_START()
38219820SjeffPROLOGUE(mpn_hamdist)
39219820Sjeff	C Returns in r0 the sum over all limbs of popcount(xp[i] ^ yp[i]).
40219820Sjeff	C r16	xp
41219820Sjeff	C r17	yp
42219820Sjeff	C r18	size
43219820Sjeff	C size==0 is not special-cased: it takes the same path as size==1.
44219820Sjeff	ldq	r1, 0(r16)		C L0  xp[0]
45219820Sjeff	ldq	r2, 0(r17)		C L1  yp[0]
46219820Sjeff	and	r18, 1, r8		C U1  1 if size odd
47219820Sjeff	srl	r18, 1, r18		C U0  size, limb pairs
48219820Sjeff
49219820Sjeff	clr	r0			C L0  initial total
50219820Sjeff	s8addq	r8, r17, r17		C U1  yp++ if size odd
51219820Sjeff	s8addq	r8, r16, r16		C L1  xp++ if size odd
52219820Sjeff	clr	r6			C U0  dummy initial xor 1
53219820Sjeff
54219820Sjeff	xor	r1, r2, r5		C L   initial xor 0
55219820Sjeff	beq	r18, L(one)		C U   no limb pairs, ie. size==1: count r5 only
56219820Sjeff
57219820Sjeff	cmoveq	r8, r31, r5		C L   discard first limb if size even
58219820Sjeff	unop				C U   no-op
59219820Sjeff
60219820Sjeff
61219820Sjeff	ALIGN(16)
62219820SjeffL(top):
63219820Sjeff	C r0	total accumulating
64219820Sjeff	C r5,r7	xor 0 awaiting popcount, and its popcount
65219820Sjeff	C r6,r8	xor 1 awaiting popcount, and its popcount
66219820Sjeff	C r16	xp, incrementing
67219820Sjeff	C r17	yp, incrementing
68219820Sjeff	C r18	size, limb pairs, decrementing
69219820Sjeff	C Each xor is popcounted one pass after it is formed.
70219820Sjeff	ldq	r1, 0(r16)		C L   xp[0]
71219820Sjeff	ldq	r2, 0(r17)		C L   yp[0]
72219820Sjeff	ctpop	r5, r7			C U0  popcount of pending xor 0
73219820Sjeff	lda	r16, 16(r16)		C U   xp += 2 limbs
74219820Sjeff
75219820Sjeff	ldq	r3, -8(r16)		C L   xp[1], r16 already advanced
76219820Sjeff	ldq	r4, 8(r17)		C L   yp[1]
77219820Sjeff	ctpop	r6, r8			C U0  popcount of pending xor 1
78	lda	r17, 16(r17)		C U   yp += 2 limbs
79
80	ldl	r31, 256(r16)		C L	prefetch
81	ldl	r31, 256(r17)		C L	prefetch
82	xor	r1, r2, r5		C U   new xor 0
83	lda	r18, -1(r18)		C U   one limb pair done
84
85	xor	r3, r4, r6		C U   new xor 1
86	addq	r0, r7, r0		C L   total += popcount of previous xor 0
87	addq	r0, r8, r0		C L   total += popcount of previous xor 1
88	bne	r18, L(top)		C U   loop while limb pairs remain
89
90
91	ctpop	r6, r8			C U0  popcount of final xor 1
92	addq	r0, r8, r0		C L   add it to the total
93L(one):
94	ctpop	r5, r7			C U0  popcount of final xor 0
95	addq	r0, r7, r0		C L   add it to the total
96
97	ret	r31, (r26), 1		C L0  return, total in r0
98
99EPILOGUE()
100ASM_END()
101