1/* Copyright (c) 2016 Vladimir Makarov <vmakarov@gcc.gnu.org>
2
3   Permission is hereby granted, free of charge, to any person
4   obtaining a copy of this software and associated documentation
5   files (the "Software"), to deal in the Software without
6   restriction, including without limitation the rights to use, copy,
7   modify, merge, publish, distribute, sublicense, and/or sell copies
8   of the Software, and to permit persons to whom the Software is
9   furnished to do so, subject to the following conditions:
10
11   The above copyright notice and this permission notice shall be
12   included in all copies or substantial portions of the Software.
13
14   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21   SOFTWARE.
22*/
23
24/* This file implements MUM (MUltiply and Mix) hashing.  We randomize
25   input data by 64x64-bit multiplication and mixing hi- and low-parts
26   of the multiplication result by using an addition and then mix it
27   into the current state.  We use prime numbers randomly generated
28   with the equal probability of their bit values for the
29   multiplication.  When all primes are used once, the state is
30   randomized and the same prime numbers are used again for data
31   randomization.
32
33   The MUM hashing passes all SMHasher tests.  Pseudo Random Number
34   Generator based on MUM also passes NIST Statistical Test Suite for
35   Random and Pseudorandom Number Generators for Cryptographic
36   Applications (version 2.2.1) with 1000 bitstreams each containing
   1M bits.  MUM hashing is also faster than Spooky64 and City64 on small
   strings (at least up to 512 bits) on Haswell and Power7.  The MUM bulk
39   speed (speed on very long data) is bigger than Spooky and City on
40   Power7.  On Haswell the bulk speed is bigger than Spooky one and
41   close to City speed.  */
42
43#ifndef __MUM_HASH__
44#define __MUM_HASH__
45
46#include <stddef.h>
47#include <stdlib.h>
48#include <string.h>
49#include <limits.h>
50
51#ifdef _MSC_VER
52typedef unsigned __int16 uint16_t;
53typedef unsigned __int32 uint32_t;
54typedef unsigned __int64 uint64_t;
55#else
56#include <stdint.h>
57#endif
58
/* Macro saying to use 128-bit integers implemented by GCC for some
   targets.  */
#ifndef _MUM_USE_INT128
/* In GCC uint128_t is defined if HOST_BITS_PER_WIDE_INT >= 64.
   HOST_WIDE_INT is long if HOST_BITS_PER_LONG > HOST_BITS_PER_INT,
   otherwise int. */
#if defined(__GNUC__) && UINT_MAX != ULONG_MAX
#define _MUM_USE_INT128 1
#else
#define _MUM_USE_INT128 0
#endif
#endif

/* NOTE(review): the _MUM_FRESH_GCC detection below is disabled by the
   surrounding "#if 0", so _MUM_FRESH_GCC is never defined in this
   copy.  As a consequence the attribute macros below always expand to
   nothing and the AVX2 path further down is compiled out.  Confirm
   whether this was intentional before re-enabling.  */
#if 0
#if defined(__GNUC__) && ((__GNUC__ == 4) &&  (__GNUC_MINOR__ >= 9) || (__GNUC__ > 4))
#define _MUM_FRESH_GCC
#endif
#endif

/* Optimization/target attribute helpers; empty unless a fresh enough
   GCC (and not clang/LLVM) was detected above.  */
#if defined(__GNUC__) && !defined(__llvm__) && defined(_MUM_FRESH_GCC)
#define _MUM_ATTRIBUTE_UNUSED  __attribute__((unused))
#define _MUM_OPTIMIZE(opts) __attribute__((__optimize__ (opts)))
#define _MUM_TARGET(opts) __attribute__((__target__ (opts)))
#else
#define _MUM_ATTRIBUTE_UNUSED
#define _MUM_OPTIMIZE(opts)
#define _MUM_TARGET(opts)
#endif
87
88
/* Here are different primes randomly generated with the equal
   probability of their bit values.  They are used to randomize input
   values.  They are deliberately non-const: mum_hash_randomize ()
   below overwrites all of them with new pseudo-random factors.  */
static uint64_t _mum_hash_step_prime = 0x2e0bb864e9ea7df5ULL;
static uint64_t _mum_key_step_prime = 0xcdb32970830fcaa1ULL;
static uint64_t _mum_block_start_prime = 0xc42b5e2e6480b23bULL;
static uint64_t _mum_unroll_prime = 0x7b51ec3d22f7096fULL;
static uint64_t _mum_tail_prime = 0xaf47d47c99b1461bULL;
static uint64_t _mum_finish_prime1 = 0xa9a7ae7ceff79f3fULL;
/* NOTE(review): this constant is identical to _mum_tail_prime above
   -- possibly an unintended duplicate; confirm against upstream
   before changing, as any change alters produced hash values.  */
static uint64_t _mum_finish_prime2 = 0xaf47d47c99b1461bULL;

/* One prime per 64-bit word of an unrolled block; 16 entries bounds
   _MUM_UNROLL_FACTOR_POWER at 4 (checked below).  */
static uint64_t _mum_primes [] = {
  0X9ebdcae10d981691, 0X32b9b9b97a27ac7d, 0X29b5584d83d35bbd, 0X4b04e0e61401255f,
  0X25e8f7b1f1c9d027, 0X80d4c8c000f3e881, 0Xbd1255431904b9dd, 0X8a3bd4485eee6d81,
  0X3bc721b2aad05197, 0X71b1a19b907d6e33, 0X525e6c1084a8534b, 0X9e4c2cd340c1299f,
  0Xde3add92e94caa37, 0X7e14eadb1f65311d, 0X3f5aa40f89812853, 0X33b15a3b587d15c9,
};
106
/* Multiply 64-bit V and P and return sum of high and low parts of the
   128-bit result.  This is the core mixing primitive of MUM.  */
static inline uint64_t
_mum (uint64_t v, uint64_t p) {
  uint64_t hi, lo;
#if _MUM_USE_INT128
#if defined(__aarch64__)
  /* AARCH64 needs 2 insns to calculate 128-bit result of the
     multiplication.  If we use a generic code we actually call a
     function doing 128x128->128 bit multiplication.  The function is
     very slow.  */
  /* Fix: the original statement was "lo = v * p, hi;" -- the stray
     ", hi" comma operand evaluated the still-uninitialized HI for no
     effect.  */
  lo = v * p;
  asm ("umulh %0, %1, %2" : "=r" (hi) : "r" (v), "r" (p));
#else
  __uint128_t r = (__uint128_t) v * (__uint128_t) p;
  hi = (uint64_t) (r >> 64);
  lo = (uint64_t) r;
#endif
#else
  /* Implementation of 64x64->128-bit multiplication by four 32x32->64
     bit multiplication.  */
  uint64_t hv = v >> 32, hp = p >> 32;
  uint64_t lv = (uint32_t) v, lp = (uint32_t) p;
  uint64_t rh =  hv * hp;
  uint64_t rm_0 = hv * lp;
  uint64_t rm_1 = hp * lv;
  uint64_t rl =  lv * lp;
  uint64_t t, carry = 0;

  /* We could ignore a carry bit here if we did not care about the
     same hash for 32-bit and 64-bit targets.  */
  t = rl + (rm_0 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry = t < rl;
#endif
  lo = t + (rm_1 << 32);
#ifdef MUM_TARGET_INDEPENDENT_HASH
  carry += lo < t;
#endif
  hi = rh + (rm_0 >> 32) + (rm_1 >> 32) + carry;
#endif
  /* We could use XOR here too but, for some reasons, on Haswell and
     Power7 using an addition improves hashing performance by 10% for
     small strings.  */
  return hi + lo;
}
153
/* Portable byte-swap macros.  The names must match the
   _mum_bswap32 / _mum_bswap64 uses in _mum_le / _mum_le32.
   Fixes: the MSVC branch referenced nonexistent intrinsics
   (_byteswap_uint32_t/_byteswap_uint64_t; the real ones are
   _byteswap_ulong/_byteswap_uint64), the MSVC and Apple branches
   defined misspelled names (_mum_bswap_32/_mum_bswap_64), and the
   <byteswap.h> fallback used bswap32/bswap64 instead of the actual
   bswap_32/bswap_64.  */
#if defined(_MSC_VER)
#define _mum_bswap32(x) _byteswap_ulong (x)
#define _mum_bswap64(x) _byteswap_uint64 (x)
#elif defined(__APPLE__)
#include <libkern/OSByteOrder.h>
#define _mum_bswap32(x) OSSwapInt32 (x)
#define _mum_bswap64(x) OSSwapInt64 (x)
#elif defined(__GNUC__)
#define _mum_bswap32(x) __builtin_bswap32 (x)
#define _mum_bswap64(x) __builtin_bswap64 (x)
#else
#include <byteswap.h>
#define _mum_bswap32(x) bswap_32 (x)
#define _mum_bswap64(x) bswap_64 (x)
#endif
169
/* Convert 64-bit V from little-endian byte order to the host order.
   Without MUM_TARGET_INDEPENDENT_HASH this is a no-op even on
   big-endian targets, so the hash then depends on target
   endianness.  */
static inline uint64_t
_mum_le (uint64_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap64 (v);
#else
#error "Unknown endianess"
#endif
}
180
/* 32-bit counterpart of _mum_le: convert V from little-endian to
   host byte order (a no-op unless MUM_TARGET_INDEPENDENT_HASH is
   defined on a big-endian target).  */
static inline uint32_t
_mum_le32 (uint32_t v) {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || !defined(MUM_TARGET_INDEPENDENT_HASH)
  return v;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  return _mum_bswap32 (v);
#else
#error "Unknown endianess"
#endif
}
191
/* Macro defining how many times the most nested loop in
   _mum_hash_aligned will be unrolled by the compiler (although it can
   make an own decision:).  Use only a constant here to help a
   compiler to unroll a major loop.

   The macro value affects the result hash for strings > 128 bit.  The
   unroll factor greatly affects the hashing speed.  We prefer the
   speed.  */
#ifndef _MUM_UNROLL_FACTOR_POWER
#if defined(__PPC64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 3
#elif defined(__aarch64__) && !defined(MUM_TARGET_INDEPENDENT_HASH)
#define _MUM_UNROLL_FACTOR_POWER 4
#else
#define _MUM_UNROLL_FACTOR_POWER 2
#endif
#endif

/* _mum_primes has 16 entries, so the unroll factor (a power of two)
   must stay in the range 2..16.  */
#if _MUM_UNROLL_FACTOR_POWER < 1
#error "too small unroll factor"
#elif _MUM_UNROLL_FACTOR_POWER > 4
#error "We have not enough primes for such unroll factor"
#endif

#define _MUM_UNROLL_FACTOR (1 << _MUM_UNROLL_FACTOR_POWER)
217
/* Hash LEN bytes at KEY starting from state START and return the new
   state.  KEY must be suitably aligned for 64-bit loads (callers
   guarantee this, see _mum_hash_default).  The hash value depends on
   the exact sequence of multiply-mix operations, so the statement
   order below must not be changed.  */
static inline uint64_t _MUM_OPTIMIZE("unroll-loops")
_mum_hash_aligned (uint64_t start, const void *key, size_t len) {
  uint64_t result = start;
  const unsigned char *str = (const unsigned char *) key;
  uint64_t u64;
  int i;
  size_t n;

  result = _mum (result, _mum_block_start_prime);
  /* Main loop: consume _MUM_UNROLL_FACTOR 64-bit words at a time.  */
  while  (len > _MUM_UNROLL_FACTOR * sizeof (uint64_t)) {
    /* This loop could be vectorized when we have vector insns for
       64x64->128-bit multiplication.  AVX2 currently only have a
       vector insn for 4 32x32->64-bit multiplication.  */
    for (i = 0; i < _MUM_UNROLL_FACTOR; i++)
      result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
    len -= _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    str += _MUM_UNROLL_FACTOR * sizeof (uint64_t);
    /* We will use the same prime numbers on the next iterations --
       randomize the state.  */
    result = _mum (result, _mum_unroll_prime);
  }
  /* Remaining whole 64-bit words (fewer than _MUM_UNROLL_FACTOR, so
     the primes do not repeat within this tail).  */
  n = len / sizeof (uint64_t);
  for (i = 0; i < (int)n; i++)
    result ^= _mum (_mum_le (((uint64_t *) str)[i]), _mum_primes[i]);
  len -= n * sizeof (uint64_t); str += n * sizeof (uint64_t);
  /* Mix in the 0..7 trailing bytes; every case returns, so there is
     no fallthrough.  NOTE(review): the (uint32_t *) / (uint64_t *)
     casts here and above technically violate strict aliasing; this
     matches upstream mum.h, but confirm the build tolerates it (e.g.
     -fno-strict-aliasing) before relying on it.  */
  switch (len) {
  case 7:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    u64 |= (uint64_t) str[6] << 48;
    return result ^ _mum (u64, _mum_tail_prime);
  case 6:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    u64 |= (uint64_t) str[5] << 40;
    return result ^ _mum (u64, _mum_tail_prime);
  case 5:
    u64 = _mum_le32 (*(uint32_t *) str);
    u64 |= (uint64_t) str[4] << 32;
    return result ^ _mum (u64, _mum_tail_prime);
  case 4:
    u64 = _mum_le32 (*(uint32_t *) str);
    return result ^ _mum (u64, _mum_tail_prime);
  case 3:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    u64 |= (uint64_t) str[2] << 16;
    return result ^ _mum (u64, _mum_tail_prime);
  case 2:
    u64 = str[0];
    u64 |= (uint64_t) str[1] << 8;
    return result ^ _mum (u64, _mum_tail_prime);
  case 1:
    u64 = str[0];
    return result ^ _mum (u64, _mum_tail_prime);
  }
  return result;
}
277
278/* Final randomization of H.  */
279static inline uint64_t
280_mum_final (uint64_t h) {
281  h ^= _mum (h, _mum_finish_prime1);
282  h ^= _mum (h, _mum_finish_prime2);
283  return h;
284}
285
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)

/* Hash LEN bytes at KEY with SEED, compiled with Haswell tuning.
   We want to use AVX2 insn MULX instead of generic x86-64 MULQ where
   it is possible.  Although on modern Intel processors MULQ takes
   3-cycles vs. 4 for MULX, MULX permits more freedom in insn
   scheduling as it uses less fixed registers.  */
static inline uint64_t _MUM_TARGET("arch=haswell")
_mum_hash_avx2 (const void * key, size_t len, uint64_t seed) {
  return _mum_final (_mum_hash_aligned (seed + len, key, len));
}
#endif
297
/* Targets that can read unaligned 64-bit words efficiently; on these
   the data being hashed need not be copied to an aligned buffer
   first.  Predefine _MUM_UNALIGNED_ACCESS to override the list.  */
#ifndef _MUM_UNALIGNED_ACCESS
#if defined(__x86_64__) || defined(__i386__) || defined(__PPC64__) \
    || defined(__s390__) || defined(__m32c__) || defined(cris)     \
    || defined(__CR16__) || defined(__vax__) || defined(__m68k__) \
    || defined(__aarch64__)
#define _MUM_UNALIGNED_ACCESS 1
#else
#define _MUM_UNALIGNED_ACCESS 0
#endif
#endif

/* When we need an aligned access to data being hashed we move part of
   the unaligned data to an aligned block of given size and then
   process it, repeating processing the data by the block.
   NOTE(review): a _MUM_BLOCK_LEN that is not a multiple of 8 would
   overflow the uint64_t buffer in _mum_hash_default (it holds
   _MUM_BLOCK_LEN / 8 words but up to _MUM_BLOCK_LEN bytes are
   copied); the guard below only rejects values < 8.  */
#ifndef _MUM_BLOCK_LEN
#define _MUM_BLOCK_LEN 1024
#endif

#if _MUM_BLOCK_LEN < 8
#error "too small block length"
#endif
319
320static inline uint64_t
321#if defined(__x86_64__)
322_MUM_TARGET("inline-all-stringops")
323#endif
324_mum_hash_default (const void *key, size_t len, uint64_t seed) {
325  uint64_t result;
326  const unsigned char *str = (const unsigned char *) key;
327  size_t block_len;
328  uint64_t buf[_MUM_BLOCK_LEN / sizeof (uint64_t)];
329
330  result = seed + len;
331  if (_MUM_UNALIGNED_ACCESS || ((size_t) str & 0x7) == 0)
332    result = _mum_hash_aligned (result, key, len);
333  else {
334    while (len != 0) {
335      block_len = len < _MUM_BLOCK_LEN ? len : _MUM_BLOCK_LEN;
336      memmove (buf, str, block_len);
337      result = _mum_hash_aligned (result, buf, block_len);
338      len -= block_len;
339      str += block_len;
340    }
341  }
342  return _mum_final (result);
343}
344
/* Build the next pseudo-random 64-bit multiplication factor, one byte
   at a time, from the C library rand () stream (seeded by the caller
   via srand).  */
static inline uint64_t
_mum_next_factor (void) {
  uint64_t factor = 0;
  int byte_num;

  for (byte_num = 0; byte_num < 8; byte_num++)
    factor = (factor << 8) | (uint64_t) (rand () % 256);
  return factor;
}
354
355/* ++++++++++++++++++++++++++ Interface functions: +++++++++++++++++++  */
356
/* Set random multiplicators depending on SEED.  Overwrites every
   factor above with pseudo-random values drawn from rand (), so
   hashes produced after this call are not comparable with hashes
   produced before it.  The order of the _mum_next_factor calls fixes
   which rand () outputs feed which factor and must not change.
   NOTE(review): SEED is narrowed to unsigned int by srand, so only
   its low bits affect the generated factors.  */
static inline void
mum_hash_randomize (uint64_t seed) {
  int i;

  srand (seed);
  _mum_hash_step_prime = _mum_next_factor ();
  _mum_key_step_prime = _mum_next_factor ();
  _mum_finish_prime1 = _mum_next_factor ();
  _mum_finish_prime2 = _mum_next_factor ();
  _mum_block_start_prime = _mum_next_factor ();
  _mum_unroll_prime = _mum_next_factor ();
  _mum_tail_prime = _mum_next_factor ();
  for (i = 0; i < (int)(sizeof (_mum_primes) / sizeof (uint64_t)); i++)
    _mum_primes[i] = _mum_next_factor ();
}
373
/* Start hashing data with SEED.  The initial state is simply the
   seed itself; feed it to mum_hash_step and mum_hash_finish.  */
static inline uint64_t
mum_hash_init (uint64_t seed) {
  uint64_t state = seed;

  return state;
}
379
380/* Process data KEY with the state H and return the updated state.  */
381static inline uint64_t
382mum_hash_step (uint64_t h, uint64_t key)
383{
384  return _mum (h, _mum_hash_step_prime) ^ _mum (key, _mum_key_step_prime);
385}
386
/* Complete hashing: apply the final randomization to the current
   state H and return the resulting hash value.  */
static inline uint64_t
mum_hash_finish (uint64_t h) {
  uint64_t hash = _mum_final (h);

  return hash;
}
392
/* Fast hashing of a single 64-bit KEY with SEED: init, one step,
   finish.  The hash is always the same for the same key on any
   target.  NOTE(review): the return type is size_t, so on a 32-bit
   target the 64-bit result is truncated -- confirm this matches the
   "same on any target" claim.  */
static inline size_t
mum_hash64 (uint64_t key, uint64_t seed) {
  uint64_t state = mum_hash_init (seed);

  state = mum_hash_step (state, key);
  return mum_hash_finish (state);
}
399
/* Hash data KEY of length LEN and SEED.  The hash depends on the
   target endianess and the unroll factor.  */
static inline uint64_t
mum_hash (const void *key, size_t len, uint64_t seed) {
#if defined(__x86_64__) && defined(_MUM_FRESH_GCC)
  /* Cached CPU-feature probe: 0 = not checked yet, 1 = AVX2
     available, -1 = no AVX2.  NOTE(review): the probe-and-store is
     not atomic, so concurrent first calls may both run the cpuid
     check; the stored value is the same either way, but confirm this
     is acceptable for multi-threaded use.  */
  static int avx2_support = 0;

  if (avx2_support > 0)
    return _mum_hash_avx2 (key, len, seed);
  else if (! avx2_support) {
    /* First call: detect AVX2 once and remember the answer.  */
    __builtin_cpu_init ();
    avx2_support =  __builtin_cpu_supports ("avx2") ? 1 : -1;
    if (avx2_support > 0)
      return _mum_hash_avx2 (key, len, seed);
  }
#endif
  return _mum_hash_default (key, len, seed);
}
418
419#endif
420