dnl  Alpha ev67 mpn_hamdist -- mpn hamming distance.

dnl  Copyright 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C ev67: 2.5 cycles/limb


C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
C
C Returns in r0 the sum over all limbs of popcount(xp[i] ^ yp[i]), i.e. the
C number of bit positions in which the two size-limb operands differ.
C
C size must be at least 1: xp[0] and yp[0] are loaded unconditionally, and a
C zero size would take the single-limb exit and count that limb.
C
C The hope was for 2.0 c/l here, but that isn't achieved.  We're limited by
C renaming register shortage.  Since we need 5 instructions per limb, further
C unrolling could approach 1.25 c/l (5 instructions over a 4-wide issue).
C
C The main loop processes two limbs from each operand on each iteration.  An
C odd size is handled by processing xp[0]^yp[0] at the start.  If the size
C is even that result is discarded, and is repeated by the main loop.
C
C The loop is software pipelined: each pass counts the two xors produced by
C the previous pass (or by the entry code) while it loads and xors the next
C pair of limbs.  Two xors are therefore still uncounted when the loop exits
C and are added in by the tail code.
C
C The L/U (and L0/L1/U0/U1) tags in the right-hand column are the issue
C slots each instruction was scheduled for; the instruction order is
C significant for performance and should not be changed casually.
C

ASM_START()
PROLOGUE(mpn_hamdist)

	C r16	xp
	C r17	yp
	C r18	size

	ldq	r1, 0(r16)		C L0  xp[0]
	ldq	r2, 0(r17)		C L1  yp[0]
	and	r18, 1, r8		C U1  1 if size odd
	srl	r18, 1, r18		C U0  size, limb pairs

	clr	r0			C L0  initial total
	s8addq	r8, r17, r17		C U1  yp++ if size odd
	s8addq	r8, r16, r16		C L1  xp++ if size odd
	clr	r6			C U0  dummy initial xor 1

	xor	r1, r2, r5		C L   initial xor 0
	beq	r18, L(one)		C U   if size==1

	C size >= 2 from here on.  r8 is still the odd flag; the loop below
	C reuses r8 as a popcount temporary once this cmoveq has consumed it.
	cmoveq	r8, r31, r5		C L   discard first limb if size even
	unop				C U


	ALIGN(16)
L(top):
	C r0	total accumulating
	C r5	xor 0, produced by the previous pass, not yet counted
	C r6	xor 1, produced by the previous pass, not yet counted
	C r7	popcount of xor 0 (temporary)
	C r8	popcount of xor 1 (temporary)
	C r16	xp, incrementing
	C r17	yp, incrementing
	C r18	size, limb pairs, decrementing

	ldq	r1, 0(r16)		C L	xp[0] of this pair
	ldq	r2, 0(r17)		C L	yp[0] of this pair
	ctpop	r5, r7			C U0	count previous xor 0
	lda	r16, 16(r16)		C U	xp += 2 limbs

	ldq	r3, -8(r16)		C L	xp[1], xp already advanced
	ldq	r4, 8(r17)		C L	yp[1], yp not yet advanced
	ctpop	r6, r8			C U0	count previous xor 1
	lda	r17, 16(r17)		C U	yp += 2 limbs

	ldl	r31, 256(r16)		C L	prefetch
	ldl	r31, 256(r17)		C L	prefetch
	xor	r1, r2, r5		C U	new xor 0
	lda	r18, -1(r18)		C U	one pair fewer to go

	xor	r3, r4, r6		C U	new xor 1
	addq	r0, r7, r0		C L
	addq	r0, r8, r0		C L
	bne	r18, L(top)		C U


	C Drain the pipeline: the last pair of xors is still uncounted.
	ctpop	r6, r8			C U0
	addq	r0, r8, r0		C L
L(one):
	C Also the size==1 entry point, with r5 = xp[0]^yp[0] and r0 = 0.
	ctpop	r5, r7			C U0
	addq	r0, r7, r0		C L

	ret	r31, (r26), 1		C L0	return total in r0

EPILOGUE()
ASM_END()
