/*
 * Copyright (c) 2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * http://code.google.com/p/smhasher/
 *
 * Copyright (c) 2009-2011 Austin Appleby.
 *
 * MurmurHash3 was written by Austin Appleby, and is placed in the public
 * domain. The author hereby disclaims copyright to this source code.
 */

/*
 * http://burtleburtle.net/bob/hash/
 *
 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
 *
 * You can use this free for any purpose. It's in the public domain.
 * It has no warranty.
 */

#include <stdbool.h>
#include <sys/types.h>
#include <machine/endian.h>
#include <net/flowhash.h>

static inline u_int32_t getblock32(const u_int32_t *, int);
static inline u_int64_t getblock64(const u_int64_t *, int);
static inline u_int32_t mh3_fmix32(u_int32_t);
static inline u_int64_t mh3_fmix64(u_int64_t);

#define	ALIGNED16(v)	((((uintptr_t)(v)) & 1) == 0)
#define	ALIGNED32(v)	((((uintptr_t)(v)) & 3) == 0)
#define	ALIGNED64(v)	((((uintptr_t)(v)) & 7) == 0)

#define	ROTL32(x, r)	(((x) << (r)) | ((x) >> (32 - (r))))
#define	ROTL64(x, r)	(((x) << (r)) | ((x) >> (64 - (r))))

/*
 * The following hash algorithms are selected based on performance:
 *
 * Intel 32-bit:	MurmurHash3_x86_32
 * Intel 64-bit:	MurmurHash3_x64_128
 * ARM, et al:		JHash
 */
#if defined(__i386__)
net_flowhash_fn_t *net_flowhash = net_flowhash_mh3_x86_32;
#elif defined(__x86_64__)
net_flowhash_fn_t *net_flowhash = net_flowhash_mh3_x64_128;
#else /* !__i386__ && !__x86_64__ */
net_flowhash_fn_t *net_flowhash = net_flowhash_jhash;
#endif /* !__i386__ && !__x86_64__ */

#if defined(__i386__) || defined(__x86_64__)
static inline u_int32_t
getblock32(const u_int32_t *p, int i)
{
	return (p[i]);
}

static inline u_int64_t
getblock64(const u_int64_t *p, int i)
{
	return (p[i]);
}
#else /* !__i386__ && !__x86_64 */
static inline u_int32_t
getblock32(const u_int32_t *p, int i)
{
	const u_int8_t *bytes = (u_int8_t *)(void *)(uintptr_t)(p + i);
	u_int32_t value;

	if (ALIGNED32(p)) {
		value = p[i];
	} else {
#if BYTE_ORDER == BIG_ENDIAN
		value =
		    (((u_int32_t)bytes[0]) << 24) |
		    (((u_int32_t)bytes[1]) << 16) |
		    (((u_int32_t)bytes[2]) << 8) |
		    ((u_int32_t)bytes[3]);
#else /* LITTLE_ENDIAN */
		value =
		    (((u_int32_t)bytes[3]) << 24) |
		    (((u_int32_t)bytes[2]) << 16) |
		    (((u_int32_t)bytes[1]) << 8) |
		    ((u_int32_t)bytes[0]);
#endif /* LITTLE_ENDIAN */
	}
	return (value);
}
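
/*
 * Worked example (annotation, not in the original): for the byte sequence
 * { 0x11, 0x22, 0x33, 0x44 } at an unaligned address, the little-endian
 * arm above assembles
 *
 *	(0x44 << 24) | (0x33 << 16) | (0x22 << 8) | 0x11 == 0x44332211
 *
 * which is the value an aligned 32-bit load of those bytes would return
 * on a little-endian machine; the big-endian arm yields 0x11223344 for
 * the same reason.  getblock64() below applies the same idea to 8-byte
 * blocks.
 */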

static inline u_int64_t
getblock64(const u_int64_t *p, int i)
{
	const u_int8_t *bytes = (const u_int8_t *)(void *)(uintptr_t)(p + i);
	u_int64_t value;

	if (ALIGNED64(p)) {
		value = p[i];
	} else {
#if BYTE_ORDER == BIG_ENDIAN
		value =
		    (((u_int64_t)bytes[0]) << 56) |
		    (((u_int64_t)bytes[1]) << 48) |
		    (((u_int64_t)bytes[2]) << 40) |
		    (((u_int64_t)bytes[3]) << 32) |
		    (((u_int64_t)bytes[4]) << 24) |
		    (((u_int64_t)bytes[5]) << 16) |
		    (((u_int64_t)bytes[6]) << 8) |
		    ((u_int64_t)bytes[7]);
#else /* LITTLE_ENDIAN */
		value =
		    (((u_int64_t)bytes[7]) << 56) |
		    (((u_int64_t)bytes[6]) << 48) |
		    (((u_int64_t)bytes[5]) << 40) |
		    (((u_int64_t)bytes[4]) << 32) |
		    (((u_int64_t)bytes[3]) << 24) |
		    (((u_int64_t)bytes[2]) << 16) |
		    (((u_int64_t)bytes[1]) << 8) |
		    ((u_int64_t)bytes[0]);
#endif /* LITTLE_ENDIAN */
	}
	return (value);
}
#endif /* !__i386__ && !__x86_64 */

static inline u_int32_t
mh3_fmix32(u_int32_t h)
{
	h ^= h >> 16;
	h *= 0x85ebca6b;
	h ^= h >> 13;
	h *= 0xc2b2ae35;
	h ^= h >> 16;

	return (h);
}

static inline u_int64_t
mh3_fmix64(u_int64_t k)
{
	k ^= k >> 33;
	k *= 0xff51afd7ed558ccdLLU;
	k ^= k >> 33;
	k *= 0xc4ceb9fe1a85ec53LLU;
	k ^= k >> 33;

	return (k);
}

/*
 * MurmurHash3_x86_32
 */
#define	MH3_X86_32_C1	0xcc9e2d51
#define	MH3_X86_32_C2	0x1b873593

u_int32_t
net_flowhash_mh3_x86_32(const void *key, u_int32_t len, const u_int32_t seed)
{
	const u_int8_t *data = (const u_int8_t *)key;
	const u_int32_t nblocks = len / 4;
	const u_int32_t *blocks;
	const u_int8_t *tail;
	u_int32_t h1 = seed, k1;
	int i;

	/* body */
	blocks = (const u_int32_t *)(const void *)(data + nblocks * 4);

	/* blocks points past the last full block; index backwards from it */
	for (i = -nblocks; i; i++) {
		k1 = getblock32(blocks, i);

		k1 *= MH3_X86_32_C1;
		k1 = ROTL32(k1, 15);
		k1 *= MH3_X86_32_C2;

		h1 ^= k1;
		h1 = ROTL32(h1, 13);
		h1 = h1 * 5 + 0xe6546b64;
	}

	/* tail */
	tail = (const u_int8_t *)(const void *)(data + nblocks * 4);
	k1 = 0;

	switch (len & 3) {
	case 3:
		k1 ^= tail[2] << 16;
		/* FALLTHRU */
	case 2:
		k1 ^= tail[1] << 8;
		/* FALLTHRU */
	case 1:
		k1 ^= tail[0];
		k1 *= MH3_X86_32_C1;
		k1 = ROTL32(k1, 15);
		k1 *= MH3_X86_32_C2;
		h1 ^= k1;
	};

	/* finalization */
	h1 ^= len;

	h1 = mh3_fmix32(h1);

	return (h1);
}
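
/*
 * Worked example (annotation, not in the original): for a 6-byte key,
 * net_flowhash_mh3_x86_32() above processes nblocks = 6 / 4 = 1 full
 * 4-byte block in the body loop; the remainder len & 3 is 2, selecting
 * "case 2" in the tail switch, which falls through to "case 1" so that
 * the two trailing bytes are folded into k1 before the final mix.  A
 * zero-length key skips both the loop and the switch, leaving the result
 * to depend on the seed alone.
 */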

/*
 * MurmurHash3_x64_128
 */
#define	MH3_X64_128_C1	0x87c37b91114253d5LLU
#define	MH3_X64_128_C2	0x4cf5ad432745937fLLU

u_int32_t
net_flowhash_mh3_x64_128(const void *key, u_int32_t len, const u_int32_t seed)
{
	const u_int8_t *data = (const u_int8_t *)key;
	const u_int32_t nblocks = len / 16;
	const u_int64_t *blocks;
	const u_int8_t *tail;
	u_int64_t h1 = seed, k1;
	u_int64_t h2 = seed, k2;
	u_int32_t i;

	/* body */
	blocks = (const u_int64_t *)(const void *)data;

	for (i = 0; i < nblocks; i++) {
		k1 = getblock64(blocks, i * 2 + 0);
		k2 = getblock64(blocks, i * 2 + 1);

		k1 *= MH3_X64_128_C1;
		k1 = ROTL64(k1, 31);
		k1 *= MH3_X64_128_C2;
		h1 ^= k1;

		h1 = ROTL64(h1, 27);
		h1 += h2;
		h1 = h1 * 5 + 0x52dce729;

		k2 *= MH3_X64_128_C2;
		k2 = ROTL64(k2, 33);
		k2 *= MH3_X64_128_C1;
		h2 ^= k2;

		h2 = ROTL64(h2, 31);
		h2 += h1;
		h2 = h2 * 5 + 0x38495ab5;
	}

	/* tail */
	tail = (const u_int8_t *)(const void *)(data + nblocks * 16);
	k1 = 0;
	k2 = 0;

	switch (len & 15) {
	case 15:
		k2 ^= ((u_int64_t)tail[14]) << 48;
		/* FALLTHRU */
	case 14:
		k2 ^= ((u_int64_t)tail[13]) << 40;
		/* FALLTHRU */
	case 13:
		k2 ^= ((u_int64_t)tail[12]) << 32;
		/* FALLTHRU */
	case 12:
		k2 ^= ((u_int64_t)tail[11]) << 24;
		/* FALLTHRU */
	case 11:
		k2 ^= ((u_int64_t)tail[10]) << 16;
		/* FALLTHRU */
	case 10:
		k2 ^= ((u_int64_t)tail[9]) << 8;
		/* FALLTHRU */
	case 9:
		k2 ^= ((u_int64_t)tail[8]) << 0;
		k2 *= MH3_X64_128_C2;
		k2 = ROTL64(k2, 33);
		k2 *= MH3_X64_128_C1;
		h2 ^= k2;
		/* FALLTHRU */
	case 8:
		k1 ^= ((u_int64_t)tail[7]) << 56;
		/* FALLTHRU */
	case 7:
		k1 ^= ((u_int64_t)tail[6]) << 48;
		/* FALLTHRU */
	case 6:
		k1 ^= ((u_int64_t)tail[5]) << 40;
		/* FALLTHRU */
	case 5:
		k1 ^= ((u_int64_t)tail[4]) << 32;
		/* FALLTHRU */
	case 4:
		k1 ^= ((u_int64_t)tail[3]) << 24;
		/* FALLTHRU */
	case 3:
		k1 ^= ((u_int64_t)tail[2]) << 16;
		/* FALLTHRU */
	case 2:
		k1 ^= ((u_int64_t)tail[1]) << 8;
		/* FALLTHRU */
	case 1:
		k1 ^= ((u_int64_t)tail[0]) << 0;
		k1 *= MH3_X64_128_C1;
		k1 = ROTL64(k1, 31);
		k1 *= MH3_X64_128_C2;
		h1 ^= k1;
	};

	/* finalization */
	h1 ^= len;
	h2 ^= len;

	h1 += h2;
	h2 += h1;

	h1 = mh3_fmix64(h1);
	h2 = mh3_fmix64(h2);

	h1 += h2;
	h2 += h1;

	/* throw away all but the lowest 32 bits */
	return (h1 & 0xffffffff);
}

#define	JHASH_INIT	0xdeadbeef

#define	JHASH_MIX(a, b, c) {			\
	a -= c;  a ^= ROTL32(c, 4);   c += b;	\
	b -= a;  b ^= ROTL32(a, 6);   a += c;	\
	c -= b;  c ^= ROTL32(b, 8);   b += a;	\
	a -= c;  a ^= ROTL32(c, 16);  c += b;	\
	b -= a;  b ^= ROTL32(a, 19);  a += c;	\
	c -= b;  c ^= ROTL32(b, 4);   b += a;	\
}

#define	JHASH_FINAL(a, b, c) {			\
	c ^= b;  c -= ROTL32(b, 14);		\
	a ^= c;  a -= ROTL32(c, 11);		\
	b ^= a;  b -= ROTL32(a, 25);		\
	c ^= b;  c -= ROTL32(b, 16);		\
	a ^= c;  a -= ROTL32(c, 4);		\
	b ^= a;  b -= ROTL32(a, 14);		\
	c ^= b;  c -= ROTL32(b, 24);		\
}
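
/*
 * Illustrative sketch (not part of the original interface): for a key
 * that is exactly three 32-bit words, net_flowhash_jhash() below never
 * enters its 12-byte block loop; it seeds (a, b, c) with
 * JHASH_INIT + len + seed, adds one word to each, and runs JHASH_FINAL.
 * This hypothetical helper does the same directly and should match
 * net_flowhash_jhash() on a 32-bit-aligned 12-byte key:
 */
static inline u_int32_t
net_flowhash_jhash_3words_example(u_int32_t w0, u_int32_t w1, u_int32_t w2,
    const u_int32_t seed)
{
	u_int32_t a, b, c;

	a = b = c = JHASH_INIT + 12 + seed;	/* len is 12 bytes */
	a += w0;
	b += w1;
	c += w2;
	JHASH_FINAL(a, b, c);

	return (c);
}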

#if BYTE_ORDER == BIG_ENDIAN
/*
 * hashbig()
 */
u_int32_t
net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed)
{
	u_int32_t a, b, c;

	/* Set up the internal state */
	a = b = c = JHASH_INIT + len + seed;

	if (ALIGNED32(key)) {
		/* read 32-bit chunks */
		const u_int32_t *k = (const u_int32_t *)key;

		/*
		 * all but last block:
		 * aligned reads and affect 32 bits of (a,b,c)
		 */
		while (len > 12) {
			a += k[0];
			b += k[1];
			c += k[2];
			JHASH_MIX(a, b, c);
			len -= 12;
			k += 3;
		}

		/*
		 * handle the last (probably partial) block
		 *
		 * "k[2] << 8" actually reads beyond the end of the string,
		 * but then shifts out the part it's not allowed to read.
		 * Because the string is aligned, the illegal read is in
		 * the same word as the rest of the string.  The masking
		 * trick does make the hash noticeably faster for short
		 * strings (like English words).
		 */
		switch (len) {
		case 12:
			c += k[2];
			b += k[1];
			a += k[0];
			break;
		case 11:
			c += k[2] & 0xffffff00;
			b += k[1];
			a += k[0];
			break;
		case 10:
			c += k[2] & 0xffff0000;
			b += k[1];
			a += k[0];
			break;
		case 9:
			c += k[2] & 0xff000000;
			b += k[1];
			a += k[0];
			break;
		case 8:
			b += k[1];
			a += k[0];
			break;
		case 7:
			b += k[1] & 0xffffff00;
			a += k[0];
			break;
		case 6:
			b += k[1] & 0xffff0000;
			a += k[0];
			break;
		case 5:
			b += k[1] & 0xff000000;
			a += k[0];
			break;
		case 4:
			a += k[0];
			break;
		case 3:
			a += k[0] & 0xffffff00;
			break;
		case 2:
			a += k[0] & 0xffff0000;
			break;
		case 1:
			a += k[0] & 0xff000000;
			break;
		case 0:
			/* zero length requires no mixing */
			return (c);
		}

		JHASH_FINAL(a, b, c);

		return (c);
	}

	/* need to read the key one byte at a time */
	const u_int8_t *k = (const u_int8_t *)key;

	/* all but the last block: affect some 32 bits of (a,b,c) */
	while (len > 12) {
		a += ((u_int32_t)k[0]) << 24;
		a += ((u_int32_t)k[1]) << 16;
		a += ((u_int32_t)k[2]) << 8;
		a += ((u_int32_t)k[3]);
		b += ((u_int32_t)k[4]) << 24;
		b += ((u_int32_t)k[5]) << 16;
		b += ((u_int32_t)k[6]) << 8;
		b += ((u_int32_t)k[7]);
		c += ((u_int32_t)k[8]) << 24;
		c += ((u_int32_t)k[9]) << 16;
		c += ((u_int32_t)k[10]) << 8;
		c += ((u_int32_t)k[11]);
		JHASH_MIX(a, b, c);
		len -= 12;
		k += 12;
	}

	/* last block: affect all 32 bits of (c) */
	switch (len) {
	case 12:
		c += k[11];
		/* FALLTHRU */
	case 11:
		c += ((u_int32_t)k[10]) << 8;
		/* FALLTHRU */
	case 10:
		c += ((u_int32_t)k[9]) << 16;
		/* FALLTHRU */
	case 9:
		c += ((u_int32_t)k[8]) << 24;
		/* FALLTHRU */
	case 8:
		b += k[7];
		/* FALLTHRU */
	case 7:
		b += ((u_int32_t)k[6]) << 8;
		/* FALLTHRU */
	case 6:
		b += ((u_int32_t)k[5]) << 16;
		/* FALLTHRU */
	case 5:
		b += ((u_int32_t)k[4]) << 24;
		/* FALLTHRU */
	case 4:
		a += k[3];
		/* FALLTHRU */
	case 3:
		a += ((u_int32_t)k[2]) << 8;
		/* FALLTHRU */
	case 2:
		a += ((u_int32_t)k[1]) << 16;
		/* FALLTHRU */
	case 1:
		a += ((u_int32_t)k[0]) << 24;
		break;
	case 0:
		/* zero length requires no mixing */
		return (c);
	}

	JHASH_FINAL(a, b, c);

	return (c);
}
#else /* LITTLE_ENDIAN */
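
/*
 * Read-width selection below (annotation, not in the original): a key at
 * address 0x1000 is 32-bit aligned and is read in 32-bit chunks; one at
 * 0x1002 is only 16-bit aligned and is read in 16-bit chunks; one at
 * 0x1001 must be read a byte at a time.  On i386/x86_64, where unaligned
 * 32-bit loads are cheap, the odd-address case uses 32-bit reads as well,
 * so only the 16-bit-aligned case takes the 16-bit path.
 */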

/*
 * hashlittle()
 */
u_int32_t
net_flowhash_jhash(const void *key, u_int32_t len, const u_int32_t seed)
{
	u_int32_t a, b, c;

	/* Set up the internal state */
	a = b = c = JHASH_INIT + len + seed;

#if defined(__i386__) || defined(__x86_64__)
	/*
	 * On i386/x86_64, it is faster to read 32-bit chunks if the key
	 * is aligned 32-bit OR not 16-bit, and perform 16-bit reads if it
	 * is aligned 16-bit.
	 */
	if (ALIGNED32(key) || !ALIGNED16(key)) {
#else /* !defined(__i386__) && !defined(__x86_64__) */
	if (ALIGNED32(key)) {
#endif /* !defined(__i386__) && !defined(__x86_64__) */
		/* read 32-bit chunks */
		const u_int32_t *k = (const u_int32_t *)key;

		/*
		 * all but last block:
		 * aligned reads and affect 32 bits of (a,b,c)
		 */
		while (len > 12) {
			a += k[0];
			b += k[1];
			c += k[2];
			JHASH_MIX(a, b, c);
			len -= 12;
			k += 3;
		}

		/*
		 * handle the last (probably partial) block
		 *
		 * "k[2] & 0xffffff" actually reads beyond the end of the
		 * string, but then masks off the part it's not allowed
		 * to read.  Because the string is aligned, the masked-off
		 * tail is in the same word as the rest of the string.
		 * The masking trick does make the hash noticeably faster
		 * for short strings (like English words).
		 */
		switch (len) {
		case 12:
			c += k[2];
			b += k[1];
			a += k[0];
			break;
		case 11:
			c += k[2] & 0xffffff;
			b += k[1];
			a += k[0];
			break;
		case 10:
			c += k[2] & 0xffff;
			b += k[1];
			a += k[0];
			break;
		case 9:
			c += k[2] & 0xff;
			b += k[1];
			a += k[0];
			break;
		case 8:
			b += k[1];
			a += k[0];
			break;
		case 7:
			b += k[1] & 0xffffff;
			a += k[0];
			break;
		case 6:
			b += k[1] & 0xffff;
			a += k[0];
			break;
		case 5:
			b += k[1] & 0xff;
			a += k[0];
			break;
		case 4:
			a += k[0];
			break;
		case 3:
			a += k[0] & 0xffffff;
			break;
		case 2:
			a += k[0] & 0xffff;
			break;
		case 1:
			a += k[0] & 0xff;
			break;
		case 0:
			/* zero length requires no mixing */
			return (c);
		}

		JHASH_FINAL(a, b, c);

		return (c);
	}
#if !defined(__i386__) && !defined(__x86_64__)
	else if (ALIGNED16(key)) {
#endif /* !defined(__i386__) && !defined(__x86_64__) */
		/* read 16-bit chunks */
		const u_int16_t *k = (const u_int16_t *)key;
		const u_int8_t *k8;

		/* all but last block: aligned reads and different mixing */
		while (len > 12) {
			a += k[0] + (((u_int32_t)k[1]) << 16);
			b += k[2] + (((u_int32_t)k[3]) << 16);
			c += k[4] + (((u_int32_t)k[5]) << 16);
			JHASH_MIX(a, b, c);
			len -= 12;
			k += 6;
		}

		/* handle the last (probably partial) block */
		k8 = (const u_int8_t *)k;
		switch (len) {
		case 12:
			c += k[4] + (((u_int32_t)k[5]) << 16);
			b += k[2] + (((u_int32_t)k[3]) << 16);
			a += k[0] + (((u_int32_t)k[1]) << 16);
			break;
		case 11:
			c += ((u_int32_t)k8[10]) << 16;
			/* FALLTHRU */
		case 10:
			c += k[4];
			b += k[2] + (((u_int32_t)k[3]) << 16);
			a += k[0] + (((u_int32_t)k[1]) << 16);
			break;
		case 9:
			c += k8[8];
			/* FALLTHRU */
		case 8:
			b += k[2] + (((u_int32_t)k[3]) << 16);
			a += k[0] + (((u_int32_t)k[1]) << 16);
			break;
		case 7:
			b += ((u_int32_t)k8[6]) << 16;
			/* FALLTHRU */
		case 6:
			b += k[2];
			a += k[0] + (((u_int32_t)k[1]) << 16);
			break;
		case 5:
			b += k8[4];
			/* FALLTHRU */
		case 4:
			a += k[0] + (((u_int32_t)k[1]) << 16);
			break;
		case 3:
			a += ((u_int32_t)k8[2]) << 16;
			/* FALLTHRU */
		case 2:
			a += k[0];
			break;
		case 1:
			a += k8[0];
			break;
		case 0:
			/* zero length requires no mixing */
			return (c);
		}

		JHASH_FINAL(a, b, c);

		return (c);
#if !defined(__i386__) && !defined(__x86_64__)
	}

	/* need to read the key one byte at a time */
	const u_int8_t *k = (const u_int8_t *)key;

	/* all but the last block: affect some 32 bits of (a,b,c) */
	while (len > 12) {
		a += k[0];
		a += ((u_int32_t)k[1]) << 8;
		a += ((u_int32_t)k[2]) << 16;
		a += ((u_int32_t)k[3]) << 24;
		b += k[4];
		b += ((u_int32_t)k[5]) << 8;
		b += ((u_int32_t)k[6]) << 16;
		b += ((u_int32_t)k[7]) << 24;
		c += k[8];
		c += ((u_int32_t)k[9]) << 8;
		c += ((u_int32_t)k[10]) << 16;
		c += ((u_int32_t)k[11]) << 24;
		JHASH_MIX(a, b, c);
		len -= 12;
		k += 12;
	}

	/* last block: affect all 32 bits of (c) */
	switch (len) {
	case 12:
		c += ((u_int32_t)k[11]) << 24;
		/* FALLTHRU */
	case 11:
		c += ((u_int32_t)k[10]) << 16;
		/* FALLTHRU */
	case 10:
		c += ((u_int32_t)k[9]) << 8;
		/* FALLTHRU */
	case 9:
		c += k[8];
		/* FALLTHRU */
	case 8:
		b += ((u_int32_t)k[7]) << 24;
		/* FALLTHRU */
	case 7:
		b += ((u_int32_t)k[6]) << 16;
		/* FALLTHRU */
	case 6:
		b += ((u_int32_t)k[5]) << 8;
		/* FALLTHRU */
	case 5:
		b += k[4];
		/* FALLTHRU */
	case 4:
		a += ((u_int32_t)k[3]) << 24;
		/* FALLTHRU */
	case 3:
		a += ((u_int32_t)k[2]) << 16;
		/* FALLTHRU */
	case 2:
		a += ((u_int32_t)k[1]) << 8;
		/* FALLTHRU */
	case 1:
		a += k[0];
		break;
	case 0:
		/* zero length requires no mixing */
		return (c);
	}

	JHASH_FINAL(a, b, c);

	return (c);
#endif /* !defined(__i386__) && !defined(__x86_64__) */
}
#endif /* LITTLE_ENDIAN */
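
/*
 * Usage sketch (not part of the original file): callers hash through the
 * net_flowhash indirection so that each platform gets the variant selected
 * at the top of this file; the resulting 32-bit value is implementation-
 * specific and differs between the variants.  The flow-tuple layout and
 * helper name below are hypothetical, for illustration only.
 */
struct net_flowhash_example_tuple {
	u_int32_t ft_src_addr;
	u_int32_t ft_dst_addr;
	u_int16_t ft_src_port;
	u_int16_t ft_dst_port;
};

static inline u_int32_t
net_flowhash_example(const struct net_flowhash_example_tuple *ft,
    const u_int32_t seed)
{
	/* dispatch through the function pointer chosen at compile time */
	return (net_flowhash(ft, sizeof (*ft), seed));
}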