/* mum.h -- FreeBSD vendor import of the MUM hash, upstream revision 301333.  */
/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>

   Permission is hereby granted, free of charge, to any person
   obtaining a copy of this software and associated documentation
   files (the "Software"), to deal in the Software without
   restriction, including without limitation the rights to use, copy,
   modify, merge, publish, distribute, sublicense, and/or sell copies
   of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be
   included in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
   SOFTWARE.
*/
23301333Sbapt
/* This file implements MUM (MUltiply and Mix) hashing.  We randomize
   input data by 64x64-bit multiplication and mixing hi- and low-parts
   of the multiplication result by using an addition and then mix it
   into the current state.  We use prime numbers randomly generated
   with the equal probability of their bit values for the
   multiplication.  When all primes are used once, the state is
   randomized and the same prime numbers are used again for data
   randomization.

   The MUM hashing passes all SMHasher tests.  A Pseudo Random Number
   Generator based on MUM also passes the NIST Statistical Test Suite
   for Random and Pseudorandom Number Generators for Cryptographic
   Applications (version 2.2.1) with 1000 bitstreams each containing
   1M bits.  MUM hashing is also faster than Spooky64 and City64 on
   small strings (at least up to 512 bits) on Haswell and Power7.  The
   MUM bulk speed (speed on very long data) is bigger than Spooky and
   City on Power7.  On Haswell the bulk speed is bigger than the
   Spooky one and close to the City speed.  */
42301333Sbapt
43301333Sbapt#ifndef __MUM_HASH__
44301333Sbapt#define __MUM_HASH__
45301333Sbapt
46301333Sbapt#include <stddef.h>
47301333Sbapt#include <stdlib.h>
48301333Sbapt#include <string.h>
49301333Sbapt#include <limits.h>
50301333Sbapt
51301333Sbapt#ifdef _MSC_VER
52301333Sbapttypedef unsigned __int16 uint16_t;
53301333Sbapttypedef unsigned __int32 uint32_t;
54301333Sbapttypedef unsigned __int64 uint64_t;
55301333Sbapt#else
56301333Sbapt#include <stdint.h>
57301333Sbapt#endif
58301333Sbapt
/* Macro saying to use 128-bit integers implemented by GCC for some
   targets.  */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
   otherwise int.  */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

/* GCC >= 4.9 supports the target attributes and CPU-detection
   builtins used further below.  */
#if defined(__GNUC__) && ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_FRESH_GCC
#endif
75301333Sbapt
/* Wrappers for GCC-specific attributes; expand to nothing for other
   compilers (including clang, which does not support
   __attribute__((optimize))).  */
#if defined(__GNUC__) && !defined(__llvm__)
#define _MUM_ATTRIBUTE_UNUSED  __attribute__((unused))
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_OPTIMIZE(opts)
#define _MUM_TARGET(opts)
#endif
85301333Sbapt
86301333Sbapt
/* Here are different primes randomly generated with the equal
   probability of their bit values.  They are used to randomize input
   values.  NOTE(review): these are mutable file-scope statics in a
   header, so every translation unit gets its own copy --
   mum_hash_randomize affects only the calling TU; confirm that is
   intended.  */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

static uint64_t _mum_primes [] = {
  0x9ebdcae10d981691, 0x32b9b9b97a27ac7d, 0x29b5584d83d35bbd, 0x4b04e0e61401255f,
  0x25e8f7b1f1c9d027, 0x80d4c8c000f3e881, 0xbd1255431904b9dd, 0x8a3bd4485eee6d81,
  0x3bc721b2aad05197, 0x71b1a19b907d6e33, 0x525e6c1084a8534b, 0x9e4c2cd340c1299f,
  0xde3add92e94caa37, 0x7e14eadb1f65311d, 0x3f5aa40f89812853, 0x33b15a3b587d15c9,
};
104301333Sbapt
/* Multiply 64-bit V and P and return sum of high and low parts of the
   128-bit result.  */
static inline uint64_t
_mum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _MUM_USE_INT128
#if defined(__aarch64__)
  /* AARCH64 needs 2 insns to calculate 128-bit result of the
     multiplication.  If we use a generic code we actually call a
     function doing 128x128->128 bit multiplication.  The function is
     very slow.  */
  lo = v * p;  /* fixed: was "lo = v * p, hi;" -- the stray ", hi"
                  read the still-uninitialized HI for no effect */
  asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
#else
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#endif
#else
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64
     bit multiplications.  */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh =  hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl =  lv * lp;
  uint64_t t, carry = 0;

  /* We could ignore a carry bit here if we did not care about the
     same hash for 32-bit and 64-bit targets.  */
  t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  /* We could use XOR here too but, for some reasons, on Haswell and
     Power7 using an addition improves hashing performance by 10% for
     small strings.  */
  return hi + lo;
}
151301333Sbapt
/* Byte-swap helpers used by _mum_le/_mum_le32 on big-endian targets.
   Fixed: the MSVC and Apple branches previously defined
   _mum_bswap_32/_mum_bswap_64 (with an extra underscore) which never
   matched the _mum_bswap32/_mum_bswap64 use sites, and the MSVC
   branch named nonexistent intrinsics (_byteswap_uint32_t,
   _byteswap_uint64_t); the real MSVC intrinsics are _byteswap_ulong
   and _byteswap_uint64.  */
#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_ulong (x)
#define _mum_bswap64(x) _byteswap_uint64 (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32 (x)
#define _mum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32 (x)
#define _mum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap32 (x)
#define _mum_bswap64(x) bswap64 (x)
#endif
167301333Sbapt
/* Return V as a little-endian 64-bit value: the identity on
   little-endian targets (or whenever MUM_TARGET_INDEPENDENT_HASH is
   not defined, trading target independence for speed), a byte swap on
   big-endian ones.  */
static inline uint64_t
_mum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap64 (v);
#else
#error "Unknown endianess"
#endif
}
178301333Sbapt
/* 32-bit variant of _mum_le: identity on little-endian targets (or
   when MUM_TARGET_INDEPENDENT_HASH is not defined), byte swap on
   big-endian ones.  */
static inline uint32_t
_mum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap32 (v);
#else
#error "Unknown endianess"
#endif
}
189301333Sbapt
/* Macro defining how many times the most nested loop in
   _mum_hash_aligned will be unrolled by the compiler (although it can
   make an own decision:).  Use only a constant here to help a
   compiler to unroll a major loop.

   The macro value affects the result hash for strings > 128 bit.  The
   unroll factor greatly affects the hashing speed.  We prefer the
   speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
215301333Sbapt
/* Mix LEN bytes at KEY into state START and return the new state.
   KEY must be suitably aligned for 64-bit reads.  The data is
   consumed in _MUM_UNROLL_FACTOR 64-bit words at a time, then word by
   word, and finally the 1..7 tail bytes are gathered into one 64-bit
   value (low byte first) and mixed in with the tail prime.
   NOTE(review): the (uint64_t *) casts below read the byte buffer as
   64-bit words, which formally violates strict aliasing -- verify the
   project builds with flags that tolerate this (upstream relies on
   it).  */
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t result = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64;
  int i;
  size_t n;

  result = _mum (result, _mum_block_start_prime);
  while  (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized when we have vector insns for
       64x64->128-bit multiplication.  AVX2 currently only have a
       vector insn for 4 32x32->64-bit multiplication.  */
    for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
    len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    /* We will use the same prime numbers on the next iterations --
       randomize the state.  */
    result = _mum (result, _mum_unroll_prime);
  }
  /* Process the remaining whole 64-bit words.  */
  n = len / sizeof (uint64_t);
  for (i = 0; i < (int)n; i++)
    result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
  len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
  /* Gather the 1..7 tail bytes into U64, low byte first.  */
  switch (len) {
  case 7:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    u64 |= (uint64_t) str[6] << 48;
    return result ^ _mum (u64, _mum_tail_prime);
  case 6:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    return result ^ _mum (u64, _mum_tail_prime);
  case 5:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 4:
    u64 = _mum_le32 (*(uint32_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 3:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    u64 |= (uint64_t) str[2] << 16;
    return result ^ _mum (u64, _mum_tail_prime);
  case 2:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    return result ^ _mum (u64, _mum_tail_prime);
  case 1:
    u64 = str[0];
    return result ^ _mum (u64, _mum_tail_prime);
  }
  return result;
}
275301333Sbapt
276301333Sbapt/* Final randomization of H.  */
277301333Sbaptstatic inline uint64_t
278301333Sbapt_mum_final (uint64_t h) {
279301333Sbapt  h ^= _mum (h, _mum_finish_prime1);
280301333Sbapt  h ^= _mum (h, _mum_finish_prime2);
281301333Sbapt  return h;
282301333Sbapt}
283301333Sbapt
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)

/* We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
   it is possible.  Although on modern Intel processors MULQ takes
   3-cycles vs. 4 for MULX, MULX permits more freedom in insn
   scheduling as it uses less fixed registers.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
  uint64_t state = _mum_hash_aligned (seed + len, key, len);

  return _mum_final (state);
}
#endif
295301333Sbapt
/* Nonzero when the target can read a 64-bit word at any byte address.
   May be overridden by pre-defining _MUM_UNALIGNED_ACCESS.  */
#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
    || defined(__s390__) || defined(__m32c__) || defined(cris)     \
    || defined(__CR16__) || defined(__vax__) || defined(__m68k__)  \
    || defined(__aarch64__)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of
   the unaligned data to an aligned block of given size and then
   process it, repeating processing the data by the block.  */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif
317301333Sbapt
/* Hash LEN bytes at KEY with SEED.  If the target allows unaligned
   64-bit reads or KEY is 8-byte aligned, the data is hashed in place;
   otherwise it is copied in chunks of at most _MUM_BLOCK_LEN bytes
   into an aligned stack buffer first.  NOTE(review): for data longer
   than _MUM_BLOCK_LEN the block path mixes the state per chunk, so it
   yields a different hash than the single in-place pass would --
   presumably acceptable since alignment is a property of the caller's
   buffer; confirm.  */
static inline uint64_t
#if defined(__x86_64__)
_MUM_TARGET("inline-all-stringops")
#endif
_mum_hash_default (const void *key, size_t len, uint64_t seed) {
  uint64_t result;
  const unsigned char *str = (const unsigned char *) key;
  size_t block_len;
  uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];

  result = seed + len;
  if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
    result = _mum_hash_aligned (result, key, len);
  else {
    while (len != 0) {
      /* Copy the next chunk into the 64-bit-aligned buffer.  */
      block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
      memmove (buf, str, block_len);
      result = _mum_hash_aligned (result, buf, block_len);
      len -= block_len;
      str += block_len;
    }
  }
  return _mum_final (result);
}
342301333Sbapt
/* Build a pseudo-random 64-bit factor from eight successive rand ()
   bytes.  Used by mum_hash_randomize to replace the default
   multiplication primes.  */
static inline uint64_t
_mum_next_factor (void) {
  uint64_t factor = 0;
  int byte_num;

  for (byte_num = 0; byte_num < 8; byte_num++)
    factor = (factor << 8) | (uint64_t) (rand () % 256);
  return factor;
}
352301333Sbapt
353301333Sbapt/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */
354301333Sbapt
355301333Sbapt/* Set random multiplicators depending on SEED.  */
356301333Sbaptstatic inline void
357301333Sbaptmum_hash_randomize (uint64_t seed) {
358301333Sbapt  int i;
359301333Sbapt
360301333Sbapt  srand (seed);
361301333Sbapt  _mum_hash_step_prime = _mum_next_factor ();
362301333Sbapt  _mum_key_step_prime = _mum_next_factor ();
363301333Sbapt  _mum_finish_prime1 = _mum_next_factor ();
364301333Sbapt  _mum_finish_prime2 = _mum_next_factor ();
365301333Sbapt  _mum_block_start_prime = _mum_next_factor ();
366301333Sbapt  _mum_unroll_prime = _mum_next_factor ();
367301333Sbapt  _mum_tail_prime = _mum_next_factor ();
368301333Sbapt  for (i = 0; i < (int)(sizeof (_mum_primes) / sizeof (uint64_t)); i++)
369301333Sbapt    _mum_primes[i] = _mum_next_factor ();
370301333Sbapt}
371301333Sbapt
/* Start hashing data with SEED.  Return the initial state, which is
   simply the seed itself.  */
static inline uint64_t
mum_hash_init (uint64_t seed) {
  uint64_t state = seed;

  return state;
}
377301333Sbapt
378301333Sbapt/* Process data KEY with the state H and return the updated state.  */
379301333Sbaptstatic inline uint64_t
380301333Sbaptmum_hash_step (uint64_t h, uint64_t key)
381301333Sbapt{
382301333Sbapt  return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
383301333Sbapt}
384301333Sbapt
/* Return the result of hashing using the current state H by applying
   the final randomization rounds.  */
static inline uint64_t
mum_hash_finish (uint64_t h) {
  uint64_t hash = _mum_final (h);

  return hash;
}
390301333Sbapt
/* Fast hashing of a single 64-bit KEY with SEED.  The hash is always
   the same for the same key on any target.  NOTE(review): the result
   is returned as size_t, so it is narrowed on targets where size_t is
   32-bit -- confirm callers only need size_t width.  */
static inline size_t
mum_hash64 (uint64_t key, uint64_t seed) {
  uint64_t state = mum_hash_init (seed);

  state = mum_hash_step (state, key);
  return mum_hash_finish (state);
}
397301333Sbapt
398301333Sbapt/* Hash data KEY of length LEN and SEED.  The hash depends on the
399301333Sbapt   target endianess and the unroll factor.  */
400301333Sbaptstatic inline uint64_t
401301333Sbaptmum_hash (const void *key, size_t len, uint64_t seed) {
402301333Sbapt#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
403301333Sbapt  static int avx2_support = 0;
404301333Sbapt
405301333Sbapt  if (avx2_support > 0)
406301333Sbapt    return _mum_hash_avx2 (key, len, seed);
407301333Sbapt  else if (! avx2_support) {
408301333Sbapt    __builtin_cpu_init ();
409301333Sbapt    avx2_support =  __builtin_cpu_supports ("avx2") ? 1 : -1;
410301333Sbapt    if (avx2_support > 0)
411301333Sbapt      return _mum_hash_avx2 (key, len, seed);
412301333Sbapt  }
413301333Sbapt#endif
414301333Sbapt  return _mum_hash_default (key, len, seed);
415301333Sbapt}
416301333Sbapt
417301333Sbapt#endif
418